[llvm] b7836d8 - [CodeGen] Allow targets to use target-specific COPY instructions for live range splitting

Yashwant Singh via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 7 10:00:09 PDT 2023


Author: Yashwant Singh
Date: 2023-07-07T22:29:50+05:30
New Revision: b7836d856206ec39509d42529f958c920368166b

URL: https://github.com/llvm/llvm-project/commit/b7836d856206ec39509d42529f958c920368166b
DIFF: https://github.com/llvm/llvm-project/commit/b7836d856206ec39509d42529f958c920368166b.diff

LOG: [CodeGen] Allow targets to use target-specific COPY instructions for live range splitting

Replaces D143754. Currently, live range splitting during register allocation uses
the TargetOpcode::COPY instruction. For the AMDGPU target this creates a problem,
because we have both vector and scalar copies. A vector copy writes a vector
register, but only on the lanes (threads) that are active. This is usually
sufficient; however, we do run into cases where we have to copy the entire vector
register, not just the data in the active lanes. One major place where that is
needed is live range splitting.

Allowing targets to use their own copy instructions (if defined) provides the
flexibility needed to lower these pseudo instructions to correct MIR.

- Introduce the getLiveRangeSplitOpcode() virtual function and use it to pick the
  copy opcode during live range splitting (a rough target-side sketch is included
  below).
- Replace the necessary MI.isCopy() checks with TII.isCopyInstr() in the register
  allocator pipeline.
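
As a rough illustration (not part of this patch), a target that needs whole-register
copies could override the new hook along these lines; the MyTarget namespace, the
WHOLE_REG_COPY pseudo and the register class check are all made-up placeholders:

    unsigned MyTargetInstrInfo::getLiveRangeSplitOpcode(
        Register Reg, const MachineFunction &MF) const {
      const MachineRegisterInfo &MRI = MF.getRegInfo();
      // Hypothetical check: for registers where a plain COPY would only move
      // the active lanes, return a target pseudo that copies every lane.
      if (MRI.getRegClass(Reg) == &MyTarget::VGPRWholeCopyRegClass)
        return MyTarget::WHOLE_REG_COPY;
      // Everything else keeps using the generic COPY.
      return TargetOpcode::COPY;
    }

SplitEditor::buildCopy() (see the SplitKit.cpp hunk below) asks TII for this opcode
when it materializes the split copies, and the passes that later inspect those
instructions query TII.isCopyInstr() instead of assuming TargetOpcode::COPY.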

Reviewed By: arsenm, cdevadas, kparzysz

Differential Revision: https://reviews.llvm.org/D150388

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/TargetInstrInfo.h
    llvm/lib/CodeGen/CalcSpillWeights.cpp
    llvm/lib/CodeGen/InlineSpiller.cpp
    llvm/lib/CodeGen/LiveRangeEdit.cpp
    llvm/lib/CodeGen/LiveRangeShrink.cpp
    llvm/lib/CodeGen/RegAllocGreedy.cpp
    llvm/lib/CodeGen/SplitKit.cpp
    llvm/lib/CodeGen/SplitKit.h
    llvm/lib/CodeGen/TargetInstrInfo.cpp
    llvm/test/CodeGen/Mips/madd-msub.ll
    llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
    llvm/test/CodeGen/X86/GlobalISel/add-ext.ll
    llvm/test/CodeGen/X86/dagcombine-cse.ll
    llvm/test/CodeGen/X86/fold-and-shift-x86_64.ll
    llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 93dfcfc399247e..817d32ea0ef6f7 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1044,6 +1044,16 @@ class TargetInstrInfo : public MCInstrInfo {
     return isCopyInstrImpl(MI);
   }
 
+  bool isFullCopyInstr(const MachineInstr &MI) const {
+    auto DestSrc = isCopyInstr(MI);
+    if (!DestSrc)
+      return false;
+
+    const MachineOperand *DestRegOp = DestSrc->Destination;
+    const MachineOperand *SrcRegOp = DestSrc->Source;
+    return !DestRegOp->getSubReg() && !SrcRegOp->getSubReg();
+  }
+
   /// If the specific machine instruction is an instruction that adds an
   /// immediate value and a physical register, and stores the result in
   /// the given physical register \c Reg, return a pair of the source
@@ -1958,6 +1968,13 @@ class TargetInstrInfo : public MCInstrInfo {
     return false;
   }
 
+  /// Allows targets to use an appropriate copy instruction while splitting the
+  /// live range of a register during register allocation.
+  virtual unsigned getLiveRangeSplitOpcode(Register Reg,
+                                           const MachineFunction &MF) const {
+    return TargetOpcode::COPY;
+  }
+
   /// During PHI elimination lets the target make necessary checks and
   /// insert the copy to the PHI destination register in a target specific
   /// manner.

diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp
index 5a005ba7b414d6..0377bc00206737 100644
--- a/llvm/lib/CodeGen/CalcSpillWeights.cpp
+++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -97,7 +97,7 @@ bool VirtRegAuxInfo::isRematerializable(const LiveInterval &LI,
     // Trace copies introduced by live range splitting.  The inline
     // spiller can rematerialize through these copies, so the spill
     // weight must reflect this.
-    while (MI->isFullCopy()) {
+    while (TII.isFullCopyInstr(*MI)) {
       // The copy destination must match the interval register.
       if (MI->getOperand(0).getReg() != Reg)
         return false;
@@ -224,7 +224,16 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
       continue;
 
     NumInstr++;
-    if (MI->isIdentityCopy() || MI->isImplicitDef())
+    bool identityCopy = false;
+    auto DestSrc = TII.isCopyInstr(*MI);
+    if (DestSrc) {
+      const MachineOperand *DestRegOp = DestSrc->Destination;
+      const MachineOperand *SrcRegOp = DestSrc->Source;
+      identityCopy = DestRegOp->getReg() == SrcRegOp->getReg() &&
+                     DestRegOp->getSubReg() == SrcRegOp->getSubReg();
+    }
+
+    if (identityCopy || MI->isImplicitDef())
       continue;
     if (!Visited.insert(MI).second)
       continue;
@@ -258,7 +267,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
     }
 
     // Get allocation hints from copies.
-    if (!MI->isCopy())
+    if (!TII.isCopyInstr(*MI))
       continue;
     Register HintReg = copyHint(MI, LI.reg(), TRI, MRI);
     if (!HintReg)

diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index 277c6be418c54a..e797e6953bccc2 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -256,11 +256,11 @@ Spiller *llvm::createInlineSpiller(MachineFunctionPass &Pass,
 // This minimizes register pressure and maximizes the store-to-load distance for
 // spill slots which can be important in tight loops.
 
-/// If MI is a COPY to or from Reg, return the other register, otherwise return
-/// 0.
-static Register isCopyOf(const MachineInstr &MI, Register Reg) {
-  assert(!MI.isBundled());
-  if (!MI.isCopy())
+/// If MI is a COPY to or from Reg, return the other register, otherwise
+/// return 0.
+static Register isCopyOf(const MachineInstr &MI, Register Reg,
+                         const TargetInstrInfo &TII) {
+  if (!TII.isCopyInstr(MI))
     return Register();
 
   const MachineOperand &DstOp = MI.getOperand(0);
@@ -277,9 +277,10 @@ static Register isCopyOf(const MachineInstr &MI, Register Reg) {
 }
 
 /// Check for a copy bundle as formed by SplitKit.
-static Register isCopyOfBundle(const MachineInstr &FirstMI, Register Reg) {
+static Register isCopyOfBundle(const MachineInstr &FirstMI, Register Reg,
+                               const TargetInstrInfo &TII) {
   if (!FirstMI.isBundled())
-    return isCopyOf(FirstMI, Reg);
+    return isCopyOf(FirstMI, Reg, TII);
 
   assert(!FirstMI.isBundledWithPred() && FirstMI.isBundledWithSucc() &&
          "expected to see first instruction in bundle");
@@ -288,7 +289,7 @@ static Register isCopyOfBundle(const MachineInstr &FirstMI, Register Reg) {
   MachineBasicBlock::const_instr_iterator I = FirstMI.getIterator();
   while (I->isBundledWithSucc()) {
     const MachineInstr &MI = *I;
-    if (!MI.isCopy())
+    if (!TII.isCopyInstr(MI))
       return Register();
 
     const MachineOperand &DstOp = MI.getOperand(0);
@@ -358,7 +359,7 @@ bool InlineSpiller::isSnippet(const LiveInterval &SnipLI) {
     MachineInstr &MI = *RI++;
 
     // Allow copies to/from Reg.
-    if (isCopyOfBundle(MI, Reg))
+    if (isCopyOfBundle(MI, Reg, TII))
       continue;
 
     // Allow stack slot loads.
@@ -396,7 +397,7 @@ void InlineSpiller::collectRegsToSpill() {
     return;
 
   for (MachineInstr &MI : llvm::make_early_inc_range(MRI.reg_bundles(Reg))) {
-    Register SnipReg = isCopyOfBundle(MI, Reg);
+    Register SnipReg = isCopyOfBundle(MI, Reg, TII);
     if (!isSibling(SnipReg))
       continue;
     LiveInterval &SnipLI = LIS.getInterval(SnipReg);
@@ -519,14 +520,14 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
     // Find all spills and copies of VNI.
     for (MachineInstr &MI :
          llvm::make_early_inc_range(MRI.use_nodbg_bundles(Reg))) {
-      if (!MI.isCopy() && !MI.mayStore())
+      if (!MI.mayStore() && !TII.isCopyInstr(MI))
         continue;
       SlotIndex Idx = LIS.getInstructionIndex(MI);
       if (LI->getVNInfoAt(Idx) != VNI)
         continue;
 
       // Follow sibling copies down the dominator tree.
-      if (Register DstReg = isCopyOfBundle(MI, Reg)) {
+      if (Register DstReg = isCopyOfBundle(MI, Reg, TII)) {
         if (isSibling(DstReg)) {
           LiveInterval &DstLI = LIS.getInterval(DstReg);
           VNInfo *DstVNI = DstLI.getVNInfoAt(Idx.getRegSlot());
@@ -870,7 +871,7 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
   if (Ops.back().first != MI || MI->isBundled())
     return false;
 
-  bool WasCopy = MI->isCopy();
+  bool WasCopy = TII.isCopyInstr(*MI).has_value();
   Register ImpReg;
 
   // TII::foldMemoryOperand will do what we need here for statepoint
@@ -1155,7 +1156,7 @@ void InlineSpiller::spillAroundUses(Register Reg) {
         Idx = VNI->def;
 
     // Check for a sibling copy.
-    Register SibReg = isCopyOfBundle(MI, Reg);
+    Register SibReg = isCopyOfBundle(MI, Reg, TII);
     if (SibReg && isSibling(SibReg)) {
       // This may actually be a copy between snippets.
       if (isRegToSpill(SibReg)) {

diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index c3477cd8ce34ce..ff49e080090c2b 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -352,7 +352,8 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
     // unlikely to change anything. We typically don't want to shrink the
     // PIC base register that has lots of uses everywhere.
     // Always shrink COPY uses that probably come from live range splitting.
-    if ((MI->readsVirtualRegister(Reg) && (MI->isCopy() || MO.isDef())) ||
+    if ((MI->readsVirtualRegister(Reg) &&
+         (MO.isDef() || TII.isCopyInstr(*MI))) ||
         (MO.readsReg() && (MRI.hasOneNonDBGUse(Reg) || useIsKill(LI, MO))))
       ToShrink.insert(&LI);
     else if (MO.readsReg())

diff --git a/llvm/lib/CodeGen/LiveRangeShrink.cpp b/llvm/lib/CodeGen/LiveRangeShrink.cpp
index 93f5314539cdb1..af7d6c4403b8fc 100644
--- a/llvm/lib/CodeGen/LiveRangeShrink.cpp
+++ b/llvm/lib/CodeGen/LiveRangeShrink.cpp
@@ -23,6 +23,7 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
@@ -109,6 +110,7 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
     return false;
 
   MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 
   LLVM_DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
 
@@ -197,7 +199,7 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
           // is because it needs more accurate model to handle register
           // pressure correctly.
           MachineInstr &DefInstr = *MRI.def_instr_begin(Reg);
-          if (!DefInstr.isCopy())
+          if (!TII.isCopyInstr(DefInstr))
             NumEligibleUse++;
           Insert = FindDominatedInstruction(DefInstr, Insert, IOM);
         } else {

diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 48187e575494fc..68f6ea3268a9ae 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -1282,10 +1282,12 @@ static LaneBitmask getInstReadLaneMask(const MachineRegisterInfo &MRI,
 /// VirtReg.
 static bool readsLaneSubset(const MachineRegisterInfo &MRI,
                             const MachineInstr *MI, const LiveInterval &VirtReg,
-                            const TargetRegisterInfo *TRI, SlotIndex Use) {
+                            const TargetRegisterInfo *TRI, SlotIndex Use,
+                            const TargetInstrInfo *TII) {
   // Early check the common case.
-  if (MI->isCopy() &&
-      MI->getOperand(0).getSubReg() == MI->getOperand(1).getSubReg())
+  auto DestSrc = TII->isCopyInstr(*MI);
+  if (DestSrc &&
+      DestSrc->Destination->getSubReg() == DestSrc->Source->getSubReg())
     return false;
 
   // FIXME: We're only considering uses, but should we consider defs too?
@@ -1344,14 +1346,14 @@ unsigned RAGreedy::tryInstructionSplit(const LiveInterval &VirtReg,
   // the allocation.
   for (const SlotIndex Use : Uses) {
     if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Use)) {
-      if (MI->isFullCopy() ||
+      if (TII->isFullCopyInstr(*MI) ||
           (SplitSubClass &&
            SuperRCNumAllocatableRegs ==
                getNumAllocatableRegsForConstraints(MI, VirtReg.reg(), SuperRC,
                                                    TII, TRI, RegClassInfo)) ||
           // TODO: Handle split for subranges with subclass constraints?
           (!SplitSubClass && VirtReg.hasSubRanges() &&
-           !readsLaneSubset(*MRI, MI, VirtReg, TRI, Use))) {
+           !readsLaneSubset(*MRI, MI, VirtReg, TRI, Use, TII))) {
         LLVM_DEBUG(dbgs() << "    skip:\t" << Use << '\t' << *MI);
         continue;
       }
@@ -2138,7 +2140,7 @@ void RAGreedy::initializeCSRCost() {
 /// \p Out is not cleared before being populated.
 void RAGreedy::collectHintInfo(Register Reg, HintsInfo &Out) {
   for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) {
-    if (!Instr.isFullCopy())
+    if (!TII->isFullCopyInstr(Instr))
       continue;
     // Look for the other end of the copy.
     Register OtherReg = Instr.getOperand(0).getReg();
@@ -2453,9 +2455,10 @@ RAGreedy::RAGreedyStats RAGreedy::computeStats(MachineBasicBlock &MBB) {
            MI.getOpcode() == TargetOpcode::STATEPOINT;
   };
   for (MachineInstr &MI : MBB) {
-    if (MI.isCopy()) {
-      const MachineOperand &Dest = MI.getOperand(0);
-      const MachineOperand &Src = MI.getOperand(1);
+    auto DestSrc = TII->isCopyInstr(MI);
+    if (DestSrc) {
+      const MachineOperand &Dest = *DestSrc->Destination;
+      const MachineOperand &Src = *DestSrc->Source;
       Register SrcReg = Src.getReg();
       Register DestReg = Dest.getReg();
       // Only count `COPY`s with a virtual register as source or destination.

diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
index eee54f09fbad01..83964eced5977e 100644
--- a/llvm/lib/CodeGen/SplitKit.cpp
+++ b/llvm/lib/CodeGen/SplitKit.cpp
@@ -514,10 +514,10 @@ void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI) {
   VFP = ValueForcePair(nullptr, true);
 }
 
-SlotIndex SplitEditor::buildSingleSubRegCopy(Register FromReg, Register ToReg,
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
-    unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def) {
-  const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+SlotIndex SplitEditor::buildSingleSubRegCopy(
+    Register FromReg, Register ToReg, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator InsertBefore, unsigned SubIdx,
+    LiveInterval &DestLI, bool Late, SlotIndex Def, const MCInstrDesc &Desc) {
   bool FirstCopy = !Def.isValid();
   MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc)
       .addReg(ToReg, RegState::Define | getUndefRegState(FirstCopy)
@@ -536,7 +536,8 @@ SlotIndex SplitEditor::buildSingleSubRegCopy(Register FromReg, Register ToReg,
 SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg,
     LaneBitmask LaneMask, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) {
-  const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+  const MCInstrDesc &Desc =
+      TII.get(TII.getLiveRangeSplitOpcode(FromReg, *MBB.getParent()));
   SlotIndexes &Indexes = *LIS.getSlotIndexes();
   if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) {
     // The full vreg is copied.
@@ -564,7 +565,7 @@ SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg,
   SlotIndex Def;
   for (unsigned BestIdx : SubIndexes) {
     Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx,
-                                DestLI, Late, Def);
+                                DestLI, Late, Def, Desc);
   }
 
   BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator();
@@ -1584,7 +1585,9 @@ bool SplitAnalysis::shouldSplitSingleBlock(const BlockInfo &BI,
   if (BI.LiveIn && BI.LiveOut)
     return true;
   // No point in isolating a copy. It has no register class constraints.
-  if (LIS.getInstructionFromIndex(BI.FirstInstr)->isCopyLike())
+  MachineInstr *MI = LIS.getInstructionFromIndex(BI.FirstInstr);
+  bool copyLike = TII.isCopyInstr(*MI) || MI->isSubregToReg();
+  if (copyLike)
     return false;
   // Finally, don't isolate an end point that was created by earlier splits.
   return isOriginalEndpoint(BI.FirstInstr);

diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h
index f764ffd4750ccb..1174e392e4e442 100644
--- a/llvm/lib/CodeGen/SplitKit.h
+++ b/llvm/lib/CodeGen/SplitKit.h
@@ -428,8 +428,11 @@ class LLVM_LIBRARY_VISIBILITY SplitEditor {
       bool Late, unsigned RegIdx);
 
   SlotIndex buildSingleSubRegCopy(Register FromReg, Register ToReg,
-      MachineBasicBlock &MB, MachineBasicBlock::iterator InsertBefore,
-      unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def);
+                                  MachineBasicBlock &MB,
+                                  MachineBasicBlock::iterator InsertBefore,
+                                  unsigned SubIdx, LiveInterval &DestLI,
+                                  bool Late, SlotIndex Def,
+                                  const MCInstrDesc &Desc);
 
 public:
   /// Create a new SplitEditor for editing the LiveInterval analyzed by SA.

diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index b29404b425190a..09dcddc17b0643 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -440,8 +440,9 @@ MachineInstr &TargetInstrInfo::duplicate(MachineBasicBlock &MBB,
 // If the COPY instruction in MI can be folded to a stack operation, return
 // the register class to use.
 static const TargetRegisterClass *canFoldCopy(const MachineInstr &MI,
+                                              const TargetInstrInfo &TII,
                                               unsigned FoldIdx) {
-  assert(MI.isCopy() && "MI must be a COPY instruction");
+  assert(TII.isCopyInstr(MI) && "MI must be a COPY instruction");
   if (MI.getNumOperands() != 2)
     return nullptr;
   assert(FoldIdx<2 && "FoldIdx refers no nonexistent operand");
@@ -630,10 +631,10 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
   }
 
   // Straight COPY may fold as load/store.
-  if (!MI.isCopy() || Ops.size() != 1)
+  if (!isCopyInstr(MI) || Ops.size() != 1)
     return nullptr;
 
-  const TargetRegisterClass *RC = canFoldCopy(MI, Ops[0]);
+  const TargetRegisterClass *RC = canFoldCopy(MI, *this, Ops[0]);
   if (!RC)
     return nullptr;
 

diff --git a/llvm/test/CodeGen/Mips/madd-msub.ll b/llvm/test/CodeGen/Mips/madd-msub.ll
index 3a6fd68e9df2e2..9f7145685ed311 100644
--- a/llvm/test/CodeGen/Mips/madd-msub.ll
+++ b/llvm/test/CodeGen/Mips/madd-msub.ll
@@ -42,22 +42,22 @@ define i64 @madd1(i32 %a, i32 %b, i32 %c) nounwind readnone {
 ;
 ; 64-LABEL: madd1:
 ; 64:       # %bb.0: # %entry
-; 64-NEXT:    sll $1, $4, 0
-; 64-NEXT:    sll $2, $5, 0
-; 64-NEXT:    dmult $2, $1
+; 64-NEXT:    sll $4, $4, 0
+; 64-NEXT:    sll $5, $5, 0
+; 64-NEXT:    dmult $5, $4
 ; 64-NEXT:    mflo $1
-; 64-NEXT:    sll $2, $6, 0
+; 64-NEXT:    sll $6, $6, 0
 ; 64-NEXT:    jr $ra
-; 64-NEXT:    daddu $2, $1, $2
+; 64-NEXT:    daddu $2, $1, $6
 ;
 ; 64R6-LABEL: madd1:
 ; 64R6:       # %bb.0: # %entry
-; 64R6-NEXT:    sll $1, $4, 0
-; 64R6-NEXT:    sll $2, $5, 0
-; 64R6-NEXT:    dmul $1, $2, $1
-; 64R6-NEXT:    sll $2, $6, 0
+; 64R6-NEXT:    sll $4, $4, 0
+; 64R6-NEXT:    sll $5, $5, 0
+; 64R6-NEXT:    dmul $1, $5, $4
+; 64R6-NEXT:    sll $6, $6, 0
 ; 64R6-NEXT:    jr $ra
-; 64R6-NEXT:    daddu $2, $1, $2
+; 64R6-NEXT:    daddu $2, $1, $6
 ;
 ; 16-LABEL: madd1:
 ; 16:       # %bb.0: # %entry
@@ -173,18 +173,18 @@ define i64 @madd3(i32 %a, i32 %b, i64 %c) nounwind readnone {
 ;
 ; 64-LABEL: madd3:
 ; 64:       # %bb.0: # %entry
-; 64-NEXT:    sll $1, $4, 0
-; 64-NEXT:    sll $2, $5, 0
-; 64-NEXT:    dmult $2, $1
+; 64-NEXT:    sll $4, $4, 0
+; 64-NEXT:    sll $5, $5, 0
+; 64-NEXT:    dmult $5, $4
 ; 64-NEXT:    mflo $1
 ; 64-NEXT:    jr $ra
 ; 64-NEXT:    daddu $2, $1, $6
 ;
 ; 64R6-LABEL: madd3:
 ; 64R6:       # %bb.0: # %entry
-; 64R6-NEXT:    sll $1, $4, 0
-; 64R6-NEXT:    sll $2, $5, 0
-; 64R6-NEXT:    dmul $1, $2, $1
+; 64R6-NEXT:    sll $4, $4, 0
+; 64R6-NEXT:    sll $5, $5, 0
+; 64R6-NEXT:    dmul $1, $5, $4
 ; 64R6-NEXT:    jr $ra
 ; 64R6-NEXT:    daddu $2, $1, $6
 ;
@@ -291,22 +291,22 @@ define i64 @msub1(i32 %a, i32 %b, i32 %c) nounwind readnone {
 ;
 ; 64-LABEL: msub1:
 ; 64:       # %bb.0: # %entry
-; 64-NEXT:    sll $1, $4, 0
-; 64-NEXT:    sll $2, $5, 0
-; 64-NEXT:    dmult $2, $1
+; 64-NEXT:    sll $4, $4, 0
+; 64-NEXT:    sll $5, $5, 0
+; 64-NEXT:    dmult $5, $4
 ; 64-NEXT:    mflo $1
-; 64-NEXT:    sll $2, $6, 0
+; 64-NEXT:    sll $6, $6, 0
 ; 64-NEXT:    jr $ra
-; 64-NEXT:    dsubu $2, $2, $1
+; 64-NEXT:    dsubu $2, $6, $1
 ;
 ; 64R6-LABEL: msub1:
 ; 64R6:       # %bb.0: # %entry
-; 64R6-NEXT:    sll $1, $4, 0
-; 64R6-NEXT:    sll $2, $5, 0
-; 64R6-NEXT:    dmul $1, $2, $1
-; 64R6-NEXT:    sll $2, $6, 0
+; 64R6-NEXT:    sll $4, $4, 0
+; 64R6-NEXT:    sll $5, $5, 0
+; 64R6-NEXT:    dmul $1, $5, $4
+; 64R6-NEXT:    sll $6, $6, 0
 ; 64R6-NEXT:    jr $ra
-; 64R6-NEXT:    dsubu $2, $2, $1
+; 64R6-NEXT:    dsubu $2, $6, $1
 ;
 ; 16-LABEL: msub1:
 ; 16:       # %bb.0: # %entry
@@ -424,18 +424,18 @@ define i64 @msub3(i32 %a, i32 %b, i64 %c) nounwind readnone {
 ;
 ; 64-LABEL: msub3:
 ; 64:       # %bb.0: # %entry
-; 64-NEXT:    sll $1, $4, 0
-; 64-NEXT:    sll $2, $5, 0
-; 64-NEXT:    dmult $2, $1
+; 64-NEXT:    sll $4, $4, 0
+; 64-NEXT:    sll $5, $5, 0
+; 64-NEXT:    dmult $5, $4
 ; 64-NEXT:    mflo $1
 ; 64-NEXT:    jr $ra
 ; 64-NEXT:    dsubu $2, $6, $1
 ;
 ; 64R6-LABEL: msub3:
 ; 64R6:       # %bb.0: # %entry
-; 64R6-NEXT:    sll $1, $4, 0
-; 64R6-NEXT:    sll $2, $5, 0
-; 64R6-NEXT:    dmul $1, $2, $1
+; 64R6-NEXT:    sll $4, $4, 0
+; 64R6-NEXT:    sll $5, $5, 0
+; 64R6-NEXT:    dmul $1, $5, $4
 ; 64R6-NEXT:    jr $ra
 ; 64R6-NEXT:    dsubu $2, $6, $1
 ;
@@ -546,22 +546,22 @@ define i64 @msub5(i32 %a, i32 %b, i32 %c) {
 ;
 ; 64-LABEL: msub5:
 ; 64:       # %bb.0: # %entry
-; 64-NEXT:    sll $1, $4, 0
-; 64-NEXT:    sll $2, $5, 0
-; 64-NEXT:    dmult $2, $1
+; 64-NEXT:    sll $4, $4, 0
+; 64-NEXT:    sll $5, $5, 0
+; 64-NEXT:    dmult $5, $4
 ; 64-NEXT:    mflo $1
-; 64-NEXT:    sll $2, $6, 0
+; 64-NEXT:    sll $6, $6, 0
 ; 64-NEXT:    jr $ra
-; 64-NEXT:    dsubu $2, $1, $2
+; 64-NEXT:    dsubu $2, $1, $6
 ;
 ; 64R6-LABEL: msub5:
 ; 64R6:       # %bb.0: # %entry
-; 64R6-NEXT:    sll $1, $4, 0
-; 64R6-NEXT:    sll $2, $5, 0
-; 64R6-NEXT:    dmul $1, $2, $1
-; 64R6-NEXT:    sll $2, $6, 0
+; 64R6-NEXT:    sll $4, $4, 0
+; 64R6-NEXT:    sll $5, $5, 0
+; 64R6-NEXT:    dmul $1, $5, $4
+; 64R6-NEXT:    sll $6, $6, 0
 ; 64R6-NEXT:    jr $ra
-; 64R6-NEXT:    dsubu $2, $1, $2
+; 64R6-NEXT:    dsubu $2, $1, $6
 ;
 ; 16-LABEL: msub5:
 ; 16:       # %bb.0: # %entry

diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index b7b19a477ab0fd..b5a360e0bdc7fe 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1614,32 +1614,33 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
 ; CHECK-NEXT:    @ Parent Loop BB19_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vmov r7, s7
-; CHECK-NEXT:    vldrw.u32 q2, [r9, #16]
+; CHECK-NEXT:    vldr s0, [r1, #12]
 ; CHECK-NEXT:    vmov r11, s6
 ; CHECK-NEXT:    vldrw.u32 q1, [r9, #112]
-; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    vldr s1, [r1, #12]
 ; CHECK-NEXT:    vmov r3, s3
 ; CHECK-NEXT:    vldr s3, [r1, #8]
 ; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q1, [r9]
-; CHECK-NEXT:    vmov r8, s1
+; CHECK-NEXT:    vmov r8, s0
+; CHECK-NEXT:    vldrw.u32 q2, [r9, #16]
 ; CHECK-NEXT:    ldr r6, [r1, #4]
 ; CHECK-NEXT:    vldrw.u32 q7, [r9, #32]
 ; CHECK-NEXT:    vmul.f32 q1, q1, r8
 ; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vldrw.u32 q3, [r9, #48]
 ; CHECK-NEXT:    vfma.f32 q1, q2, r0
+; CHECK-NEXT:    vldrw.u32 q3, [r9, #48]
 ; CHECK-NEXT:    ldr r0, [r1], #16
 ; CHECK-NEXT:    vfma.f32 q1, q7, r6
+; CHECK-NEXT:    vmov r4, s1
 ; CHECK-NEXT:    vldrw.u32 q6, [r9, #64]
-; CHECK-NEXT:    vmov.f32 s2, s1
+; CHECK-NEXT:    vmov.f32 s1, s0
 ; CHECK-NEXT:    vfma.f32 q1, q3, r0
+; CHECK-NEXT:    vmov.f32 s2, s0
 ; CHECK-NEXT:    vldrw.u32 q5, [r9, #80]
 ; CHECK-NEXT:    vfma.f32 q1, q6, r4
 ; CHECK-NEXT:    vldrw.u32 q4, [r9, #96]
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vfma.f32 q1, q5, r3
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vfma.f32 q1, q4, r7
 ; CHECK-NEXT:    vfma.f32 q1, q2, r11
 ; CHECK-NEXT:    vstrb.8 q1, [r5], #16

diff --git a/llvm/test/CodeGen/X86/GlobalISel/add-ext.ll b/llvm/test/CodeGen/X86/GlobalISel/add-ext.ll
index 9beff7b7c2dfa6..0b8811f96bc804 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/add-ext.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/add-ext.ll
@@ -205,8 +205,8 @@ define void @PR20134_zext(ptr %a, i32 %i) {
 ; CHECK-NEXT:    addq %rdi, %rcx
 ; CHECK-NEXT:    movl (%rcx), %ecx
 ; CHECK-NEXT:    addl (%rax), %ecx
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    imulq $4, %rax, %rax
+; CHECK-NEXT:    movl %esi, %esi
+; CHECK-NEXT:    imulq $4, %rsi, %rax
 ; CHECK-NEXT:    addq %rdi, %rax
 ; CHECK-NEXT:    movl %ecx, (%rax)
 ; CHECK-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll
index 222a4d78668b6e..6d817d89874d1f 100644
--- a/llvm/test/CodeGen/X86/dagcombine-cse.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll
@@ -106,24 +106,24 @@ define i96 @square_high(i96 %x) nounwind {
 ;
 ; X64-LABEL: square_high:
 ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movl %esi, %esi
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    addq %r8, %rdx
-; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    adcq $0, %rax
 ; X64-NEXT:    addq %rdx, %r8
-; X64-NEXT:    adcq %rsi, %rax
-; X64-NEXT:    imulq %rcx, %rcx
-; X64-NEXT:    addq %rax, %rcx
-; X64-NEXT:    shrdq $32, %rcx, %r8
-; X64-NEXT:    shrq $32, %rcx
+; X64-NEXT:    adcq %rcx, %rax
+; X64-NEXT:    imulq %rsi, %rsi
+; X64-NEXT:    addq %rax, %rsi
+; X64-NEXT:    shrdq $32, %rsi, %r8
+; X64-NEXT:    shrq $32, %rsi
 ; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    movq %rsi, %rdx
 ; X64-NEXT:    retq
 entry:
   %conv = zext i96 %x to i192

diff --git a/llvm/test/CodeGen/X86/fold-and-shift-x86_64.ll b/llvm/test/CodeGen/X86/fold-and-shift-x86_64.ll
index 62abc4e035f4ac..9a842a1e58d272 100644
--- a/llvm/test/CodeGen/X86/fold-and-shift-x86_64.ll
+++ b/llvm/test/CodeGen/X86/fold-and-shift-x86_64.ll
@@ -34,8 +34,8 @@ entry:
 define i8 @t3(ptr %X, i64 %i) {
 ; CHECK-LABEL: t3:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    movzbl (%rdi,%rax,4), %eax
+; CHECK-NEXT:    movl %esi, %esi
+; CHECK-NEXT:    movzbl (%rdi,%rsi,4), %eax
 ; CHECK-NEXT:    retq
 
 entry:

diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
index eb6accd3e623b0..d41be734aa2d2b 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
@@ -78,18 +78,18 @@ define i32 @out32_constmask(i32 %x, i32 %y) {
 define i64 @out64_constmask(i64 %x, i64 %y) {
 ; CHECK-NOBMI-LABEL: out64_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %edi, %ecx
+; CHECK-NOBMI-NEXT:    movl %edi, %edi
 ; CHECK-NOBMI-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
 ; CHECK-NOBMI-NEXT:    andq %rsi, %rax
-; CHECK-NOBMI-NEXT:    orq %rcx, %rax
+; CHECK-NOBMI-NEXT:    orq %rdi, %rax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out64_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    movl %edi, %ecx
+; CHECK-BMI-NEXT:    movl %edi, %edi
 ; CHECK-BMI-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
 ; CHECK-BMI-NEXT:    andq %rsi, %rax
-; CHECK-BMI-NEXT:    orq %rcx, %rax
+; CHECK-BMI-NEXT:    orq %rdi, %rax
 ; CHECK-BMI-NEXT:    retq
   %mx = and i64 %x, 4294967295
   %my = and i64 %y, -4294967296

