[llvm] 718aec2 - [AMDGPU] Improve v_cmpx usage on GFX10.3.

Thomas Symalla via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 25 03:40:23 PDT 2022


Author: Thomas Symalla
Date: 2022-03-25T11:40:18+01:00
New Revision: 718aec209c891487294d8a6199cf12c796c6e901

URL: https://github.com/llvm/llvm-project/commit/718aec209c891487294d8a6199cf12c796c6e901
DIFF: https://github.com/llvm/llvm-project/commit/718aec209c891487294d8a6199cf12c796c6e901.diff

LOG: [AMDGPU] Improve v_cmpx usage on GFX10.3.

On GFX10.3 targets, the following instruction sequence

v_cmp_* SGPR, ...
s_and_saveexec ..., SGPR

leads to a fairly long stall, caused by the VALU write to an SGPR and the
following SALU having to wait for that SGPR.

An equivalent sequence is to save the exec mask manually instead of letting
s_and_saveexec do the work, and to use a v_cmpx instruction (which writes its
result to exec) for the comparison.
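
Concretely, on wave32 targets the pair above becomes

s_mov_b32 SGPR_DEST, exec_lo
v_cmpx_* ...

(on wave64, s_mov_b64 and exec are used instead).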

This patch modifies the SIOptimizeExecMasking pass, as this is the last place
where s_and_saveexec instructions are inserted. It performs the transformation
by finding the pattern, extracting the operands, and generating the new
instruction sequence.

It also changes some existing lit tests and introduces a few new tests to show
the changed behavior on GFX10.3 targets.
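
For example, the new vcmp-saveexec-to-vcmpx.ll test expects output along these
lines (concrete register numbers are illustrative; the actual checks use
FileCheck patterns):

; GFX1010:
v_cmp_lt_i32_e32 vcc_lo, 15, v0
s_and_saveexec_b32 s0, vcc_lo

; GFX1030:
s_mov_b32 s0, exec_lo
v_cmpx_lt_i32_e32 15, v0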

Same as D119696, including a buildbot fix and a MIR test fix.

Reviewed By: critson

Differential Revision: https://reviews.llvm.org/D122332

Added: 
    llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
    llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir

Modified: 
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/lib/Target/AMDGPU/SIInstrInfo.td
    llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
    llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
    llvm/lib/Target/AMDGPU/VOPCInstructions.td
    llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
    llvm/test/CodeGen/AMDGPU/wqm.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 3c53323dc93dd..453be72ad8815 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3860,18 +3860,19 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI,
 
 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
                                            unsigned Op32) const {
-  MachineBasicBlock *MBB = MI.getParent();;
+  MachineBasicBlock *MBB = MI.getParent();
   MachineInstrBuilder Inst32 =
     BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
     .setMIFlags(MI.getFlags());
 
   // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
   // For VOPC instructions, this is replaced by an implicit def of vcc.
-  int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
-  if (Op32DstIdx != -1) {
+  if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst) != -1) {
     // dst
     Inst32.add(MI.getOperand(0));
-  } else {
+  } else if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::sdst) != -1) {
+    // VOPCX instructions won't be writing to an explicit dst, so this assert
+    // should not fail for these instructions.
     assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
             (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
            "Unexpected case");

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 25d3f4a765e6b..4b7589002a718 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1263,6 +1263,10 @@ namespace AMDGPU {
   LLVM_READONLY
   int getMFMAEarlyClobberOp(uint16_t Opcode);
 
+  /// \returns v_cmpx version of a v_cmp instruction.
+  LLVM_READONLY
+  int getVCMPXOpFromVCMP(uint16_t Opcode);
+
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
   const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
   const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index a81c49e82ebab..787814172135f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2681,6 +2681,15 @@ def getMFMAEarlyClobberOp : InstrMapping {
   let ValueCols = [["0"]];
 }
 
+// Maps a v_cmp instruction to its v_cmpx equivalent.
+def getVCMPXOpFromVCMP : InstrMapping {
+  let FilterClass = "VCMPVCMPXTable";
+  let RowFields = ["VCMPOp"];
+  let ColFields = ["IsVCMPX"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
 include "SIInstructions.td"
 
 include "DSInstructions.td"

diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 9a4cc25f00085..e45642b934d28 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -9,6 +9,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/InitializePasses.h"
 
@@ -292,6 +293,182 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
   return false;
 }
 
+// Backwards-iterate from Origin (for n=MaxInstructions iterations) until either
+// the beginning of the BB is reached or Pred evaluates to true - which can be
+// an arbitrary condition based on the current MachineInstr, for instance a
+// target instruction. Breaks prematurely by returning nullptr if one of the
+// registers given in NonModifiableRegs is modified by the current instruction.
+static MachineInstr *
+findInstrBackwards(MachineInstr &Origin,
+                   std::function<bool(MachineInstr *)> Pred,
+                   ArrayRef<MCRegister> NonModifiableRegs,
+                   const SIRegisterInfo *TRI, unsigned MaxInstructions = 5) {
+  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
+                                      E = Origin.getParent()->rend();
+  unsigned CurrentIteration = 0;
+
+  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
+    if (Pred(&*A))
+      return &*A;
+
+    for (MCRegister Reg : NonModifiableRegs) {
+      if (A->modifiesRegister(Reg, TRI))
+        return nullptr;
+    }
+
+    ++CurrentIteration;
+  }
+
+  return nullptr;
+}
+
+// Determine if a register Reg is not re-defined and still in use
+// in the range (Stop..BB.end].
+// It does so by backwards calculating liveness from the end of the BB until
+// either Stop or the beginning of the BB is reached.
+// After liveness is calculated, we can determine if Reg is still in use and not
+// defined in between the instructions.
+static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg,
+                                 const SIRegisterInfo *TRI,
+                                 MachineRegisterInfo &MRI) {
+  LivePhysRegs LR(*TRI);
+  LR.addLiveOuts(*Stop.getParent());
+
+  for (auto A = Stop.getParent()->rbegin();
+       A != Stop.getParent()->rend() && A != Stop; ++A) {
+    LR.stepBackward(*A);
+  }
+
+  return !LR.available(MRI, Reg);
+}
+
+// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
+// by looking at an instance of an s_and_saveexec instruction. Returns a pointer
+// to the v_cmp instruction if it is safe to replace the sequence (see the
+// conditions in the function body). This is after register allocation, so some
+// checks on operand dependencies need to be considered.
+static MachineInstr *findPossibleVCMPVCMPXOptimization(
+    MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
+    const SIInstrInfo *TII, MachineRegisterInfo &MRI) {
+
+  MachineInstr *VCmp = nullptr;
+
+  Register SaveExecDest = SaveExec.getOperand(0).getReg();
+  if (!TRI->isSGPRReg(MRI, SaveExecDest))
+    return nullptr;
+
+  MachineOperand *SaveExecSrc0 =
+      TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
+  if (!SaveExecSrc0->isReg())
+    return nullptr;
+
+  // Try to find the last v_cmp instruction that defs the saveexec input
+  // operand without any write to Exec or the saveexec input operand in between.
+  VCmp = findInstrBackwards(
+      SaveExec,
+      [&](MachineInstr *Check) {
+        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
+               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
+      },
+      {Exec, SaveExecSrc0->getReg()}, TRI);
+
+  if (!VCmp)
+    return nullptr;
+
+  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
+  assert(VCmpDest && "Should have an sdst operand!");
+
+  // Check if any of the v_cmp source operands is written by the saveexec.
+  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+  if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) &&
+      SaveExec.modifiesRegister(Src0->getReg(), TRI))
+    return nullptr;
+
+  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+  if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) &&
+      SaveExec.modifiesRegister(Src1->getReg(), TRI))
+    return nullptr;
+
+  // Don't do the transformation if the destination operand is included in
+  // its MBB live-outs, meaning it's used in any of its successors, leading
+  // to incorrect code if the v_cmp and therefore the def of
+  // the dest operand is removed.
+  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
+    return nullptr;
+
+  // If the v_cmp target is in use after the s_and_saveexec, skip the
+  // optimization.
+  if (isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI, MRI))
+    return nullptr;
+
+  // Try to determine if there is a write to any of the VCmp
+  // operands between the saveexec and the vcmp.
+  // If yes, additional VGPR spilling might need to be inserted. In this case,
+  // it's not worth replacing the instruction sequence.
+  SmallVector<MCRegister, 2> NonDefRegs;
+  if (Src0->isReg())
+    NonDefRegs.push_back(Src0->getReg());
+
+  if (Src1->isReg())
+    NonDefRegs.push_back(Src1->getReg());
+
+  if (!findInstrBackwards(
+          SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
+          NonDefRegs, TRI))
+    return nullptr;
+
+  return VCmp;
+}
+
+// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
+// operands extracted from a v_cmp ..., s_and_saveexec pattern.
+static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+                                         MachineInstr &VCmp, MCRegister Exec,
+                                         const SIInstrInfo *TII,
+                                         const SIRegisterInfo *TRI,
+                                         MachineRegisterInfo &MRI) {
+  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
+
+  if (NewOpcode == -1)
+    return false;
+
+  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
+  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
+
+  Register MoveDest = SaveExecInstr.getOperand(0).getReg();
+
+  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
+  if (!SaveExecInstr.uses().empty()) {
+    bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32;
+    unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
+            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
+        .addReg(Exec);
+  }
+
+  // Omit dst as V_CMPX is implicitly writing to EXEC.
+  // Add dummy src and clamp modifiers, if needed.
+  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
+                         VCmp.getDebugLoc(), TII->get(NewOpcode));
+
+  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) !=
+      -1)
+    Builder.addImm(0);
+
+  Builder.add(*Src0);
+
+  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1_modifiers) !=
+      -1)
+    Builder.addImm(0);
+
+  Builder.add(*Src1);
+
+  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) != -1)
+    Builder.addImm(0);
+
+  return true;
+}
+
 bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -299,6 +476,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
   MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
   // Optimize sequences emitted for control flow lowering. They are originally
@@ -462,5 +640,45 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
     Changed = true;
   }
 
+  // After all s_op_saveexec instructions are inserted,
+  // replace (on GFX10.3 and later)
+  // v_cmp_* SGPR, IMM, VGPR
+  // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
+  // with
+  // s_mov_b32 EXEC_SGPR_DEST, exec_lo
+  // v_cmpx_* IMM, VGPR
+  // to reduce pipeline stalls.
+  if (ST.hasGFX10_3Insts()) {
+    DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+    const unsigned AndSaveExecOpcode =
+        ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+
+    for (MachineBasicBlock &MBB : MF) {
+      for (MachineInstr &MI : MBB) {
+        // Record relevant v_cmp / s_and_saveexec instruction pairs for
+        // replacement.
+        if (MI.getOpcode() != AndSaveExecOpcode)
+          continue;
+
+        if (MachineInstr *VCmp =
+                findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
+          SaveExecVCmpMapping[&MI] = VCmp;
+      }
+    }
+
+    for (const auto &Entry : SaveExecVCmpMapping) {
+      MachineInstr *SaveExecInstr = Entry.getFirst();
+      MachineInstr *VCmpInstr = Entry.getSecond();
+
+      if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII,
+                                       TRI, *MRI)) {
+        SaveExecInstr->eraseFromParent();
+        VCmpInstr->eraseFromParent();
+
+        Changed = true;
+      }
+    }
+  }
+
   return Changed;
 }

diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index e437552c2afdc..880df8bdb67ec 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -728,21 +728,27 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
 
       if (TII->isVOPC(Op32)) {
-        Register DstReg = MI.getOperand(0).getReg();
-        if (DstReg.isVirtual()) {
-          // VOPC instructions can only write to the VCC register. We can't
-          // force them to use VCC here, because this is only one register and
-          // cannot deal with sequences which would require multiple copies of
-          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
-          //
-          // So, instead of forcing the instruction to write to VCC, we provide
-          // a hint to the register allocator to use VCC and then we will run
-          // this pass again after RA and shrink it if it outputs to VCC.
-          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
-          continue;
+        MachineOperand &Op0 = MI.getOperand(0);
+        if (Op0.isReg()) {
+          // Exclude VOPCX instructions as these don't explicitly write a
+          // dst.
+          Register DstReg = Op0.getReg();
+          if (DstReg.isVirtual()) {
+            // VOPC instructions can only write to the VCC register. We can't
+            // force them to use VCC here, because this is only one register and
+            // cannot deal with sequences which would require multiple copies of
+            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
+            //
+            // So, instead of forcing the instruction to write to VCC, we
+            // provide a hint to the register allocator to use VCC and then we
+            // will run this pass again after RA and shrink it if it outputs to
+            // VCC.
+            MRI.setRegAllocationHint(DstReg, 0, VCCReg);
+            continue;
+          }
+          if (DstReg != VCCReg)
+            continue;
         }
-        if (DstReg != VCCReg)
-          continue;
       }
 
       if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {

diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index c0cc91029d111..1220b5c8ac35d 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -205,6 +205,11 @@ class VCMPXNoSDstTable <bit has_sdst, string Name> {
   string NoSDstOp = Name;
 }
 
+class VCMPVCMPXTable <string Name> {
+  bit IsVCMPX = 0;
+  string VCMPOp = Name;
+}
+
 multiclass VOPC_Pseudos <string opName,
                          VOPC_Profile P,
                          SDPatternOperator cond = COND_NULL,
@@ -213,7 +218,8 @@ multiclass VOPC_Pseudos <string opName,
 
   def _e32 : VOPC_Pseudo <opName, P>,
              Commutable_REV<revOp#"_e32", !eq(revOp, opName)>,
-             VCMPXNoSDstTable<1, opName#"_e32"> {
+             VCMPXNoSDstTable<1, opName#"_e32">,
+             VCMPVCMPXTable<opName#"_e32"> {
     let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
     let SchedRW = P.Schedule;
     let isConvergent = DefExec;
@@ -223,7 +229,8 @@ multiclass VOPC_Pseudos <string opName,
 
   def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret>,
     Commutable_REV<revOp#"_e64", !eq(revOp, opName)>,
-    VCMPXNoSDstTable<1, opName#"_e64"> {
+    VCMPXNoSDstTable<1, opName#"_e64">,
+    VCMPVCMPXTable<opName#"_e64"> {
     let Defs = !if(DefExec, [EXEC], []);
     let SchedRW = P.Schedule;
     let isCompare = 1;
@@ -248,23 +255,27 @@ multiclass VOPCX_Pseudos <string opName,
 
   def _nosdst_e32 : VOPC_Pseudo <opName#"_nosdst", P_NoSDst, [], 0>,
              Commutable_REV<revOp#"_nosdst_e32", !eq(revOp, opName)>,
-             VCMPXNoSDstTable<0, opName#"_e32"> {
+             VCMPXNoSDstTable<0, opName#"_e32">,
+             VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName#"_e32")> {
     let Defs = [EXEC];
     let SchedRW = P_NoSDst.Schedule;
     let isConvergent = 1;
     let isCompare = 1;
     let isCommutable = 1;
     let SubtargetPredicate = HasNoSdstCMPX;
+    let IsVCMPX = 1;
   }
 
   def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst", P_NoSDst>,
     Commutable_REV<revOp#"_nosdst_e64", !eq(revOp, opName)>,
-    VCMPXNoSDstTable<0, opName#"_e64"> {
+    VCMPXNoSDstTable<0, opName#"_e64">,
+    VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName#"_e64")> {
     let Defs = [EXEC];
     let SchedRW = P_NoSDst.Schedule;
     let isCompare = 1;
     let isCommutable = 1;
     let SubtargetPredicate = HasNoSdstCMPX;
+    let IsVCMPX = 1;
   }
 
   foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in

diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
index 2fed25fff3d50..a2c35b97aef65 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
@@ -6,7 +6,7 @@
 
 ; GCN-LABEL: long_forward_scc_branch_3f_offset_bug:
 ; GFX1030: s_cmp_lg_u32
-; GFX1030-NEXT: s_cbranch_scc1  [[ENDBB:.LBB[0-9]+_[0-9]+]]
+; GFX1030: s_cbranch_scc1  [[ENDBB:.LBB[0-9]+_[0-9]+]]
 
 ; GFX1010: s_cmp_lg_u32
 ; GFX1010-NEXT: s_cbranch_scc0  [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
@@ -51,9 +51,9 @@ bb3:
 }
 
 ; GCN-LABEL: {{^}}long_forward_exec_branch_3f_offset_bug:
-; GFX1030: v_cmp_eq_u32
-; GFX1030: s_and_saveexec_b32
-; GFX1030-NEXT: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
+; GFX1030: s_mov_b32
+; GFX1030: v_cmpx_eq_u32
+; GFX1030: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
 
 ; GFX1010: v_cmp_eq_u32
 ; GFX1010: s_and_saveexec_b32

diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
new file mode 100644
index 0000000000000..f8aae95f62b12
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
@@ -0,0 +1,167 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_lt:
+; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 15, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_lt_i32_e32 15, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_lt(i32 %x) {
+entry:
+  %bc = icmp slt i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_gt:
+; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 17, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e32 17, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_gt(i32 %x) {
+entry:
+  %bc = icmp sgt i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_eq:
+; GFX1010: v_cmp_ne_u32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_ne_u32_e32 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_eq(i32 %x) {
+entry:
+  %bc = icmp eq i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ne:
+; GFX1010: v_cmp_eq_u32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_eq_u32_e32 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_ne(i32 %x) {
+entry:
+  %bc = icmp ne i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_le:
+; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_lt_i32_e32 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_le(i32 %x) {
+entry:
+  %bc = icmp sle i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ge:
+; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e32 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_ge(i32 %x) {
+entry:
+  %bc = icmp sge i32 %x, 16
+  br i1 %bc, label %endif, label %if
+
+if:
+  %ret = shl i32 %x, 2
+  ret i32 %ret
+
+endif:
+  ret i32 %x
+}
+
+declare amdgpu_gfx void @check_live_outs_helper(i64) #0
+
+; In cases where the output operand cannot be safely removed,
+; don't apply the v_cmpx transformation.
+
+; GCN-LABEL: {{^}}check_live_outs:
+; GFX1010: v_cmp_eq_u32_e64 s{{.*}}, v{{.*}}, v{{.*}}
+; GFX1010: s_and_saveexec_b32 s{{.*}}, s{{.*}}
+; GFX1030: v_cmp_eq_u32_e64 s{{.*}}, v{{.*}}, v{{.*}}
+; GFX1030: s_and_saveexec_b32 s{{.*}}, s{{.*}}
+define amdgpu_cs void @check_live_outs(i32 %a, i32 %b) {
+  %cond = icmp eq i32 %a, %b
+  %result = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+  br i1 %cond, label %l1, label %l2
+l1:
+  call amdgpu_gfx void @check_live_outs_helper(i64 %result)
+  br label %l2
+l2:
+  ret void
+}
+
+; Omit the transformation if the s_and_saveexec instruction overwrites
+; any of the v_cmp source operands.
+
+; GCN-LABEL: check_saveexec_overwrites_vcmp_source:
+; GCN:  ; %bb.1: ; %then
+; GFX1010:          v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
+; GFX1010-NEXT:     v_mov_b32_e32 {{.*}}, s[[A]]
+; GFX1010-NEXT:     s_and_saveexec_b32 s[[A]], vcc_lo
+; GFX1030:          v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
+; GFX1030-NEXT:     v_mov_b32_e32 {{.*}}, s[[A]]
+; GFX1030-NEXT:     s_and_saveexec_b32 s[[A]], vcc_lo
+define i32 @check_saveexec_overwrites_vcmp_source(i32 inreg %a, i32 inreg %b) {
+entry:
+  %0 = icmp sge i32 %a, 0
+  br i1 %0, label %if, label %then
+
+if:
+  %1 = shl i32 %a, 2
+  %2 = or i32 %1, %b
+  ret i32 %2
+
+then:
+  %3 = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+  %4 = trunc i64 %3 to i32
+  %5 = icmp slt i32 %4, %b
+  br i1 %5, label %after, label %end
+
+after:
+  ret i32 %4
+
+end:
+  ret i32 %a
+}
+
+declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0

diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
new file mode 100644
index 0000000000000..52cd6f5d47604
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
@@ -0,0 +1,24 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+
+# After the Optimize exec masking (post-RA) pass, there's a chance of having v_cmpx instructions
+# being introduced whenever there's a sequence of v_cmp and s_and_saveexec instructions
+# which can be safely replaced in various cases.
+# However, it is not safe to do so when the generated code sequence would omit part of the EXEC mask,
+# which could occur when a subset of EXEC is used as an input operand in the v_cmp instruction.
+# The idea behind this test is to check if the subregisters are correctly handled here.
+
+# GCN-LABEL: name: vcmp_saveexec_to_mov_vcmpx_exec_subreg
+# GCN: V_CMP_GT_U32_e64
+# GCN: S_AND_SAVEEXEC_B64
+name: vcmp_saveexec_to_mov_vcmpx_exec_subreg
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr2
+    renamable $sgpr0_sgpr1 = V_CMP_GT_U32_e64 $sgpr2, killed $vgpr0, implicit $exec
+    $sgpr2_sgpr3 = COPY $exec, implicit-def $exec
+    $sgpr2_sgpr3 = S_AND_B64 killed renamable $sgpr2_sgpr3, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+    $exec = S_MOV_B64_term killed renamable $sgpr2_sgpr3

diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index e462a460c93fa..7cfb43ba802ac 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1250,8 +1250,8 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT:    v_cmpx_ne_u32_e32 0, v1
 ; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB23_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %ELSE
@@ -1329,8 +1329,8 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT:    v_cmpx_ne_u32_e32 0, v1
 ; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB24_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %IF
@@ -1508,10 +1508,10 @@ define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0, v1
 ; GFX10-W32-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
+; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
-; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX10-W32-NEXT:    v_cmpx_nlt_f32_e32 0, v1
 ; GFX10-W32-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX10-W32-NEXT:  ; %bb.1: ; %ELSE
 ; GFX10-W32-NEXT:    v_mul_f32_e32 v0, 4.0, v1
@@ -1577,8 +1577,8 @@ define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i3
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB27_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %IF
 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s14, s12
@@ -2960,9 +2960,9 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s13
-; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB46_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %IF
 ; GFX10-W32-NEXT:    s_mov_b32 s14, exec_lo


        

