[llvm] 7de6107 - Revert "[AMDGPU] Improve v_cmpx usage on GFX10.3."
Thomas Symalla via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 21 01:55:06 PDT 2022
Author: Thomas Symalla
Date: 2022-03-21T09:50:44+01:00
New Revision: 7de6107dce5388968e3914601d02cf70b9853e9c
URL: https://github.com/llvm/llvm-project/commit/7de6107dce5388968e3914601d02cf70b9853e9c
DIFF: https://github.com/llvm/llvm-project/commit/7de6107dce5388968e3914601d02cf70b9853e9c.diff
LOG: Revert "[AMDGPU] Improve v_cmpx usage on GFX10.3."
This reverts commits 011c64191ef9ccc6538d52f4b57f98f37d4ea36e and
e725e2afe02e18398525652c9bceda1eb055ea64.
Differential Revision: https://reviews.llvm.org/D122117
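For context, the reverted change taught SIOptimizeExecMasking to rewrite
post-RA sequences of the form

    v_cmp_lt_i32_e32 vcc_lo, 15, v0      ; compare, result into an SGPR
    s_and_saveexec_b32 s0, vcc_lo        ; s0 = exec, exec = exec & vcc_lo

into (on GFX10.3 and later)

    s_mov_b32 s0, exec_lo                ; save exec directly
    v_cmpx_lt_i32_e32 15, v0             ; compare and update EXEC in one step

to reduce pipeline stalls. This is a sketch reconstructed from the comments
and tests removed below; the registers and the immediate are illustrative.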
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
llvm/lib/Target/AMDGPU/VOPCInstructions.td
llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
llvm/test/CodeGen/AMDGPU/wqm.ll
Removed:
llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5fd13e4d54f7b..5afd0bba095b2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3856,19 +3856,18 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI,
MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
unsigned Op32) const {
- MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock *MBB = MI.getParent();
MachineInstrBuilder Inst32 =
BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
.setMIFlags(MI.getFlags());
// Add the dst operand if the 32-bit encoding also has an explicit $vdst.
// For VOPC instructions, this is replaced by an implicit def of vcc.
- if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst) != -1) {
+ int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
+ if (Op32DstIdx != -1) {
// dst
Inst32.add(MI.getOperand(0));
- } else if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::sdst) != -1) {
- // VOPCX instructions won't be writing to an explicit dst, so this should
- // not fail for these instructions.
+ } else {
assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
(MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
"Unexpected case");
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 4b7589002a718..25d3f4a765e6b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1263,10 +1263,6 @@ namespace AMDGPU {
LLVM_READONLY
int getMFMAEarlyClobberOp(uint16_t Opcode);
- /// \returns v_cmpx version of a v_cmp instruction.
- LLVM_READONLY
- int getVCMPXOpFromVCMP(uint16_t Opcode);
-
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index e0e99680a962b..30c67d92b4cde 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2669,15 +2669,6 @@ def getMFMAEarlyClobberOp : InstrMapping {
let ValueCols = [["0"]];
}
-// Maps a v_cmp instruction to its v_cmpx equivalent.
-def getVCMPXOpFromVCMP : InstrMapping {
- let FilterClass = "VCMPVCMPXTable";
- let RowFields = ["VCMPOp"];
- let ColFields = ["IsVCMPX"];
- let KeyCol = ["0"];
- let ValueCols = [["1"]];
-}
-
include "SIInstructions.td"
include "DSInstructions.td"
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 471a871891aec..9a4cc25f00085 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -11,7 +11,6 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
using namespace llvm;
@@ -293,183 +292,6 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
return false;
}
-// Backwards-iterate from Origin (for n=MaxInstructions iterations) until either
-// the beginning of the BB is reached or Pred evaluates to true - which can be
-// an arbitrary condition based on the current MachineInstr, for instance a
-// target instruction. Breaks prematurely by returning nullptr if one of the
-// registers given in NonModifiableRegs is modified by the current instruction.
-static MachineInstr *
-findInstrBackwards(MachineInstr &Origin,
- std::function<bool(MachineInstr *)> Pred,
- ArrayRef<MCRegister> NonModifiableRegs,
- const SIRegisterInfo *TRI, unsigned MaxInstructions = 5) {
- MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
- E = Origin.getParent()->rend();
- unsigned CurrentIteration = 0;
-
- for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
- if (Pred(&*A))
- return &*A;
-
- for (MCRegister Reg : NonModifiableRegs) {
- if (A->modifiesRegister(Reg, TRI))
- return nullptr;
- }
-
- ++CurrentIteration;
- }
-
- return nullptr;
-}
-
-// Determine if a register Reg is not re-defined and still in use
-// in the range (Stop..BB.end].
-// It does so by calculating liveness backwards from the end of the BB until
-// either Stop or the beginning of the BB is reached.
-// After liveness is calculated, we can determine if Reg is still in use and not
-// defined in between the instructions.
-static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg,
- const SIRegisterInfo *TRI,
- MachineRegisterInfo &MRI) {
- LivePhysRegs LR(*TRI);
- LR.addLiveOuts(*Stop.getParent());
-
- for (auto A = Stop.getParent()->rbegin();
- A != Stop.getParent()->rend() && A != Stop; ++A) {
- LR.stepBackward(*A);
- }
-
- return !LR.available(MRI, Reg);
-}
-
-// Tries to find an opportunity to optimize a v_cmp ..., s_and_saveexec sequence
-// by looking at an instance of an s_and_saveexec instruction. Returns a pointer
-// to the v_cmp instruction if it is safe to replace the sequence (see the
-// conditions in the function body). This is after register allocation, so some
-// checks on operand dependencies need to be considered.
-static MachineInstr *findPossibleVCMPVCMPXOptimization(
- MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
- const SIInstrInfo *TII, MachineRegisterInfo &MRI) {
-
- MachineInstr *VCmp = nullptr;
-
- Register SaveExecDest = SaveExec.getOperand(0).getReg();
- if (!TRI->isSGPRReg(MRI, SaveExecDest))
- return nullptr;
-
- MachineOperand *SaveExecSrc0 =
- TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
- if (!SaveExecSrc0->isReg())
- return nullptr;
-
- // Try to find the last v_cmp instruction that defs the saveexec input
- // operand without any write to Exec in between.
- VCmp = findInstrBackwards(
- SaveExec,
- [&](MachineInstr *Check) {
- return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
- Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
- },
- {Exec, SaveExecSrc0->getReg()}, TRI);
-
- if (!VCmp)
- return nullptr;
-
- MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
- assert(VCmpDest && "Should have an sdst operand!");
-
- // Check if any of the v_cmp source operands is written by the saveexec.
- MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
- if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) &&
- SaveExec.modifiesRegister(Src0->getReg(), TRI))
- return nullptr;
-
- MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
- if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) &&
- SaveExec.modifiesRegister(Src1->getReg(), TRI))
- return nullptr;
-
- // Don't do the transformation if the destination operand is included in
- // its MBB live-outs, meaning it's used in any of its successors, leading
- // to incorrect code if the v_cmp and therefore the def of
- // the dest operand is removed.
- if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
- return nullptr;
-
- // If the v_cmp target is in use after the s_and_saveexec, skip the
- // optimization.
- if (isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI,
- MRI))
- return nullptr;
-
- // Try to determine if there is a write to any of the VCmp
- // operands between the saveexec and the vcmp.
- // If yes, additional VGPR spilling might need to be inserted. In this case,
- // it's not worth replacing the instruction sequence.
- SmallVector<MCRegister, 2> NonDefRegs;
- if (Src0->isReg())
- NonDefRegs.push_back(Src0->getReg());
-
- if (Src1->isReg())
- NonDefRegs.push_back(Src1->getReg());
-
- if (!findInstrBackwards(
- SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
- NonDefRegs, TRI))
- return nullptr;
-
- return VCmp;
-}
-
-// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
-// operands extracted from a v_cmp ..., s_and_saveexec pattern.
-static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
- MachineInstr &VCmp, MCRegister Exec,
- const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- MachineRegisterInfo &MRI) {
- const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
-
- if (NewOpcode == -1)
- return false;
-
- MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
- MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
-
- Register MoveDest = SaveExecInstr.getOperand(0).getReg();
-
- MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
- if (!SaveExecInstr.uses().empty()) {
- bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32;
- unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
- SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
- .addReg(Exec);
- }
-
- // Omit dst as V_CMPX is implicitly writing to EXEC.
- // Add dummy src and clamp modifiers, if needed.
- auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
- VCmp.getDebugLoc(), TII->get(NewOpcode));
-
- if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) !=
- -1)
- Builder.addImm(0);
-
- Builder.add(*Src0);
-
- if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1_modifiers) !=
- -1)
- Builder.addImm(0);
-
- Builder.add(*Src1);
-
- if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) != -1)
- Builder.addImm(0);
-
- return true;
-}
-
bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -477,7 +299,6 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
- MachineRegisterInfo *MRI = &MF.getRegInfo();
MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
// Optimize sequences emitted for control flow lowering. They are originally
@@ -641,45 +462,5 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
Changed = true;
}
- // After all s_op_saveexec instructions are inserted,
- // replace (on GFX10.3 and later)
- // v_cmp_* SGPR, IMM, VGPR
- // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
- // with
- // s_mov_b32 EXEC_SGPR_DEST, exec_lo
- // v_cmpx_* IMM, VGPR
- // to reduce pipeline stalls.
- if (ST.hasGFX10_3Insts()) {
- DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
- const unsigned AndSaveExecOpcode =
- ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
-
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- // Record relevant v_cmp / s_and_saveexec instruction pairs for
- // replacement.
- if (MI.getOpcode() != AndSaveExecOpcode)
- continue;
-
- if (MachineInstr *VCmp =
- findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
- SaveExecVCmpMapping[&MI] = VCmp;
- }
- }
-
- for (const auto &Entry : SaveExecVCmpMapping) {
- MachineInstr *SaveExecInstr = Entry.getFirst();
- MachineInstr *VCmpInstr = Entry.getSecond();
-
- if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII,
- TRI, *MRI)) {
- SaveExecInstr->eraseFromParent();
- VCmpInstr->eraseFromParent();
-
- Changed = true;
- }
- }
- }
-
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 880df8bdb67ec..e437552c2afdc 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -728,27 +728,21 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
if (TII->isVOPC(Op32)) {
- MachineOperand &Op0 = MI.getOperand(0);
- if (Op0.isReg()) {
- // Exclude VOPCX instructions as these don't explicitly write a
- // dst.
- Register DstReg = Op0.getReg();
- if (DstReg.isVirtual()) {
- // VOPC instructions can only write to the VCC register. We can't
- // force them to use VCC here, because this is only one register and
- // cannot deal with sequences which would require multiple copies of
- // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
- //
- // So, instead of forcing the instruction to write to VCC, we
- // provide a hint to the register allocator to use VCC and then we
- // will run this pass again after RA and shrink it if it outputs to
- // VCC.
- MRI.setRegAllocationHint(DstReg, 0, VCCReg);
- continue;
- }
- if (DstReg != VCCReg)
- continue;
+ Register DstReg = MI.getOperand(0).getReg();
+ if (DstReg.isVirtual()) {
+ // VOPC instructions can only write to the VCC register. We can't
+ // force them to use VCC here, because this is only one register and
+ // cannot deal with sequences which would require multiple copies of
+ // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
+ //
+ // So, instead of forcing the instruction to write to VCC, we provide
+ // a hint to the register allocator to use VCC and then we will run
+ // this pass again after RA and shrink it if it outputs to VCC.
+ MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
+ continue;
}
+ if (DstReg != VCCReg)
+ continue;
}
if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 1220b5c8ac35d..c0cc91029d111 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -205,11 +205,6 @@ class VCMPXNoSDstTable <bit has_sdst, string Name> {
string NoSDstOp = Name;
}
-class VCMPVCMPXTable <string Name> {
- bit IsVCMPX = 0;
- string VCMPOp = Name;
-}
-
multiclass VOPC_Pseudos <string opName,
VOPC_Profile P,
SDPatternOperator cond = COND_NULL,
@@ -218,8 +213,7 @@ multiclass VOPC_Pseudos <string opName,
def _e32 : VOPC_Pseudo <opName, P>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>,
- VCMPXNoSDstTable<1, opName#"_e32">,
- VCMPVCMPXTable<opName#"_e32"> {
+ VCMPXNoSDstTable<1, opName#"_e32"> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
let SchedRW = P.Schedule;
let isConvergent = DefExec;
@@ -229,8 +223,7 @@ multiclass VOPC_Pseudos <string opName,
def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>,
- VCMPXNoSDstTable<1, opName#"_e64">,
- VCMPVCMPXTable<opName#"_e64"> {
+ VCMPXNoSDstTable<1, opName#"_e64"> {
let Defs = !if(DefExec, [EXEC], []);
let SchedRW = P.Schedule;
let isCompare = 1;
@@ -255,27 +248,23 @@ multiclass VOPCX_Pseudos <string opName,
def _nosdst_e32 : VOPC_Pseudo <opName#"_nosdst", P_NoSDst, [], 0>,
Commutable_REV<revOp#"_nosdst_e32", !eq(revOp, opName)>,
- VCMPXNoSDstTable<0, opName#"_e32">,
- VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName#"_e32")> {
+ VCMPXNoSDstTable<0, opName#"_e32"> {
let Defs = [EXEC];
let SchedRW = P_NoSDst.Schedule;
let isConvergent = 1;
let isCompare = 1;
let isCommutable = 1;
let SubtargetPredicate = HasNoSdstCMPX;
- let IsVCMPX = 1;
}
def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst", P_NoSDst>,
Commutable_REV<revOp#"_nosdst_e64", !eq(revOp, opName)>,
- VCMPXNoSDstTable<0, opName#"_e64">,
- VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName#"_e64")> {
+ VCMPXNoSDstTable<0, opName#"_e64"> {
let Defs = [EXEC];
let SchedRW = P_NoSDst.Schedule;
let isCompare = 1;
let isCommutable = 1;
let SubtargetPredicate = HasNoSdstCMPX;
- let IsVCMPX = 1;
}
foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
index a2c35b97aef65..2fed25fff3d50 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
@@ -6,7 +6,7 @@
; GCN-LABEL: long_forward_scc_branch_3f_offset_bug:
; GFX1030: s_cmp_lg_u32
-; GFX1030: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
+; GFX1030-NEXT: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
; GFX1010: s_cmp_lg_u32
; GFX1010-NEXT: s_cbranch_scc0 [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
@@ -51,9 +51,9 @@ bb3:
}
; GCN-LABEL: {{^}}long_forward_exec_branch_3f_offset_bug:
-; GFX1030: s_mov_b32
-; GFX1030: v_cmpx_eq_u32
-; GFX1030: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
+; GFX1030: v_cmp_eq_u32
+; GFX1030: s_and_saveexec_b32
+; GFX1030-NEXT: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
; GFX1010: v_cmp_eq_u32
; GFX1010: s_and_saveexec_b32
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
deleted file mode 100644
index f8aae95f62b12..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
+++ /dev/null
@@ -1,167 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_lt:
-; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 15, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_lt_i32_e32 15, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_lt(i32 %x) {
-entry:
- %bc = icmp slt i32 %x, 16
- br i1 %bc, label %endif, label %if
-
-if:
- %ret = shl i32 %x, 2
- ret i32 %ret
-
-endif:
- ret i32 %x
-}
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_gt:
-; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 17, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_gt_i32_e32 17, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_gt(i32 %x) {
-entry:
- %bc = icmp sgt i32 %x, 16
- br i1 %bc, label %endif, label %if
-
-if:
- %ret = shl i32 %x, 2
- ret i32 %ret
-
-endif:
- ret i32 %x
-}
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_eq:
-; GFX1010: v_cmp_ne_u32_e32 vcc_lo, 16, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_ne_u32_e32 16, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_eq(i32 %x) {
-entry:
- %bc = icmp eq i32 %x, 16
- br i1 %bc, label %endif, label %if
-
-if:
- %ret = shl i32 %x, 2
- ret i32 %ret
-
-endif:
- ret i32 %x
-}
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ne:
-; GFX1010: v_cmp_eq_u32_e32 vcc_lo, 16, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_eq_u32_e32 16, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_ne(i32 %x) {
-entry:
- %bc = icmp ne i32 %x, 16
- br i1 %bc, label %endif, label %if
-
-if:
- %ret = shl i32 %x, 2
- ret i32 %ret
-
-endif:
- ret i32 %x
-}
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_le:
-; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 16, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_lt_i32_e32 16, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_le(i32 %x) {
-entry:
- %bc = icmp sle i32 %x, 16
- br i1 %bc, label %endif, label %if
-
-if:
- %ret = shl i32 %x, 2
- ret i32 %ret
-
-endif:
- ret i32 %x
-}
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ge:
-; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 16, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_gt_i32_e32 16, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_ge(i32 %x) {
-entry:
- %bc = icmp sge i32 %x, 16
- br i1 %bc, label %endif, label %if
-
-if:
- %ret = shl i32 %x, 2
- ret i32 %ret
-
-endif:
- ret i32 %x
-}
-
-declare amdgpu_gfx void @check_live_outs_helper(i64) #0
-
-; In cases where the output operand cannot be safely removed,
-; don't apply the v_cmpx transformation.
-
-; GCN-LABEL: {{^}}check_live_outs:
-; GFX1010: v_cmp_eq_u32_e64 s{{.*}}, v{{.*}}, v{{.*}}
-; GFX1010: s_and_saveexec_b32 s{{.*}}, s{{.*}}
-; GFX1030: v_cmp_eq_u32_e64 s{{.*}}, v{{.*}}, v{{.*}}
-; GFX1030: s_and_saveexec_b32 s{{.*}}, s{{.*}}
-define amdgpu_cs void @check_live_outs(i32 %a, i32 %b) {
- %cond = icmp eq i32 %a, %b
- %result = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
- br i1 %cond, label %l1, label %l2
-l1:
- call amdgpu_gfx void @check_live_outs_helper(i64 %result)
- br label %l2
-l2:
- ret void
-}
-
-; Omit the transformation if the s_and_saveexec instruction overwrites
-; any of the v_cmp source operands.
-
-; GCN-LABEL: check_saveexec_overwrites_vcmp_source:
-; GCN: ; %bb.1: ; %then
-; GFX1010: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
-; GFX1010-NEXT: v_mov_b32_e32 {{.*}}, s[[A]]
-; GFX1010-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo
-; GFX1030: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
-; GFX1030-NEXT: v_mov_b32_e32 {{.*}}, s[[A]]
-; GFX1030-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo
-define i32 @check_saveexec_overwrites_vcmp_source(i32 inreg %a, i32 inreg %b) {
-entry:
- %0 = icmp sge i32 %a, 0
- br i1 %0, label %if, label %then
-
-if:
- %1 = shl i32 %a, 2
- %2 = or i32 %1, %b
- ret i32 %2
-
-then:
- %3 = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
- %4 = trunc i64 %3 to i32
- %5 = icmp slt i32 %4, %b
- br i1 %5, label %after, label %end
-
-after:
- ret i32 %4
-
-end:
- ret i32 %a
-}
-
-declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
deleted file mode 100644
index 1a39ad862827c..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
+++ /dev/null
@@ -1,24 +0,0 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s -o - | FileCheck -check-prefix=GCN %s
-# RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 %s -o - | FileCheck -check-prefix=GCN %s
-
----
-
-# After the Optimize exec masking (post-RA) pass, there's a chance of v_cmpx instructions
-# being introduced whenever there's a sequence of v_cmp and s_and_saveexec instructions
-# which can be safely replaced in various cases.
-# However, it is not safe to do so when the generated code sequence would omit part of the EXEC mask,
-# which could occur when a subset of EXEC is used as an input operand in the v_cmp instruction.
-# The idea behind this test is to check if the subregisters are correctly handled here.
-
-# GCN-LABEL: vcmp_saveexec_to_mov_vcmpx_exec_subreg:
-# GCN: v_cmp_gt_u32_e64 s[[[SDST_LO:[0-9]+]]:[[SDST_HI:[0-9]+]]], exec_lo, v{{.*}}
-# GCN: s_and_saveexec_b64 s[[[EXEC_LO:[0-9]+]]:[[EXEC_HI:[0-9]+]]], s[[[SDST_LO]]:[[SDST_HI]]]
-name: vcmp_saveexec_to_mov_vcmpx_exec_subreg
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $vgpr0
- renamable $sgpr0_sgpr1 = V_CMP_GT_U32_e64 $exec_lo, killed $vgpr0, implicit $exec
- $sgpr2_sgpr3 = COPY $exec, implicit-def $exec
- $sgpr2_sgpr3 = S_AND_B64 killed renamable $sgpr2_sgpr3, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
- $exec = S_MOV_B64_term killed renamable $sgpr2_sgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 7cfb43ba802ac..e462a460c93fa 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1250,8 +1250,8 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
-; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
+; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
; GFX10-W32-NEXT: s_cbranch_execz .LBB23_2
; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
@@ -1329,8 +1329,8 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
-; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
+; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
@@ -1508,10 +1508,10 @@ define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
-; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: ; implicit-def: $vgpr0
-; GFX10-W32-NEXT: v_cmpx_nlt_f32_e32 0, v1
+; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1
@@ -1577,8 +1577,8 @@ define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i3
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
-; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
+; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
@@ -2960,9 +2960,9 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
+; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
-; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
+; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo