[llvm] caf1294 - [AMDGPU] Experiments show that the GCNRegBankReassign pass significantly impacts

Baptiste Saleil via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 26 14:22:12 PDT 2021


Author: Baptiste Saleil
Date: 2021-04-26T17:21:49-04:00
New Revision: caf1294d95785503a1d114c8c167e181fff7068b

URL: https://github.com/llvm/llvm-project/commit/caf1294d95785503a1d114c8c167e181fff7068b
DIFF: https://github.com/llvm/llvm-project/commit/caf1294d95785503a1d114c8c167e181fff7068b.diff

LOG: [AMDGPU] Experiments show that the GCNRegBankReassign pass significantly impacts
the compilation time and there is no case for which we see any improvement in
performance. This patch removes this pass and its associated test cases from
the tree.

Differential Revision: https://reviews.llvm.org/D101313

Change-Id: I0599169a7609c19a887f8d847a71e664030cc141

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPU.h
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/CMakeLists.txt
    llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
    llvm/test/CodeGen/AMDGPU/ctlz.ll
    llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
    llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll
    llvm/test/CodeGen/AMDGPU/frem.ll
    llvm/test/CodeGen/AMDGPU/fshr.ll
    llvm/test/CodeGen/AMDGPU/idiv-licm.ll
    llvm/test/CodeGen/AMDGPU/idot2.ll
    llvm/test/CodeGen/AMDGPU/idot4s.ll
    llvm/test/CodeGen/AMDGPU/idot4u.ll
    llvm/test/CodeGen/AMDGPU/idot8s.ll
    llvm/test/CodeGen/AMDGPU/idot8u.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
    llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
    llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
    llvm/test/CodeGen/AMDGPU/memory_clause.ll
    llvm/test/CodeGen/AMDGPU/saddo.ll
    llvm/test/CodeGen/AMDGPU/saddsat.ll
    llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
    llvm/test/CodeGen/AMDGPU/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/store-local.128.ll
    llvm/test/CodeGen/AMDGPU/store-local.96.ll
    llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll
    llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
    llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll
    llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll
    llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll
    llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll

Removed: 
    llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
    llvm/test/CodeGen/AMDGPU/regbank-reassign-split.mir
    llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir
    llvm/test/CodeGen/AMDGPU/regbank-reassign.mir


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 7220616c2b79..a38d0a779bd6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -74,16 +74,6 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass();
 ModulePass *createAMDGPULowerModuleLDSPass();
 FunctionPass *createSIModeRegisterPass();
 
-namespace AMDGPU {
-enum RegBankReassignMode {
-  RM_VGPR = 1,
-  RM_SGPR = 2,
-  RM_BOTH = RM_VGPR | RM_SGPR
-};
-}
-MachineFunctionPass *
-createGCNRegBankReassignPass(AMDGPU::RegBankReassignMode Mode);
-
 struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
   AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {}
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
@@ -342,9 +332,6 @@ ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();
 void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
 extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
 
-void initializeGCNRegBankReassignPass(PassRegistry &);
-extern char &GCNRegBankReassignID;
-
 void initializeGCNNSAReassignPass(PassRegistry &);
 extern char &GCNNSAReassignID;
 

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b7fcffb24473..b50e0eb8b87f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -262,7 +262,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUUseNativeCallsPass(*PR);
   initializeAMDGPUSimplifyLibCallsPass(*PR);
   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
-  initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
 }
 
@@ -1177,10 +1176,8 @@ void GCNPassConfig::addOptimizedRegAlloc() {
 }
 
 bool GCNPassConfig::addPreRewrite() {
-  if (EnableRegReassign) {
+  if (EnableRegReassign)
     addPass(&GCNNSAReassignID);
-    addPass(createGCNRegBankReassignPass(AMDGPU::RM_BOTH));
-  }
   return true;
 }
 

diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 4a4fee56539d..41d58d5b76b5 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -139,7 +139,6 @@ add_llvm_target(AMDGPUCodeGen
   SIShrinkInstructions.cpp
   SIWholeQuadMode.cpp
   GCNILPSched.cpp
-  GCNRegBankReassign.cpp
   GCNNSAReassign.cpp
   GCNDPPCombine.cpp
   SIModeRegister.cpp

diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
deleted file mode 100644
index b877ef9be660..000000000000
--- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
+++ /dev/null
@@ -1,900 +0,0 @@
-//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Try to reassign registers on GFX10+ to reduce register bank
-/// conflicts.
-///
-/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
-/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
-/// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1,
-/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc.
-///
-/// The shader can read one dword from each of these banks once per cycle.
-/// If an instruction has to read more register operands from the same bank
-/// an additional cycle is needed. HW attempts to pre-load registers through
-/// input operand gathering, but a stall cycle may occur if that fails. For
-/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
-/// potentially incuring 2 stall cycles.
-///
-/// The pass tries to reassign registers to reduce bank conflicts.
-///
-/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
-/// that 4 has to be subtracted from an SGPR bank number to get the real value.
-/// This also corresponds to bit numbers in bank masks used in the pass.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/LiveRegMatrix.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/InitializePasses.h"
-
-using namespace llvm;
-using namespace AMDGPU;
-
-static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
-  cl::desc("Verify stall cycles in the regbanks reassign pass"),
-  cl::value_desc("0|1|2"),
-  cl::init(0), cl::Hidden);
-
-// Threshold to keep compile time reasonable.
-static cl::opt<unsigned> VRegThresh("amdgpu-regbanks-reassign-threshold",
-  cl::desc("Max number of vregs to run the regbanks reassign pass"),
-  cl::init(15000), cl::Hidden);
-
-#define DEBUG_TYPE "amdgpu-regbanks-reassign"
-
-#define NUM_VGPR_BANKS 4
-#define NUM_SGPR_BANKS 8
-#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
-#define SGPR_BANK_OFFSET NUM_VGPR_BANKS
-#define VGPR_BANK_MASK 0xf
-#define SGPR_BANK_MASK 0xff0
-#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)
-
-STATISTIC(NumStallsDetected,
-          "Number of operand read stalls detected");
-STATISTIC(NumStallsRecovered,
-          "Number of operand read stalls recovered");
-
-namespace {
-
-class GCNRegBankReassign : public MachineFunctionPass {
-
-  class OperandMask {
-  public:
-    OperandMask(unsigned r, unsigned s, unsigned m)
-      : Reg(r), SubReg(s), Mask(m) {}
-    Register Reg;
-    unsigned SubReg;
-    unsigned Mask;
-  };
-
-  class Candidate {
-  public:
-    Candidate(MachineInstr *mi, Register reg, unsigned subreg,
-              unsigned freebanks)
-        : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks) {}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-    void dump(const GCNRegBankReassign *P) const {
-      MI->dump();
-      dbgs() << P->printReg(Reg) << " to banks ";
-      dumpFreeBanks(FreeBanks);
-      dbgs() << '\n';
-    }
-#endif
-
-    MachineInstr *MI;
-    Register Reg;
-    unsigned SubReg;
-    unsigned FreeBanks;
-  };
-
-  class CandidateList : public std::map<unsigned, std::list<Candidate>> {
-  public:
-    void push(unsigned Weight, const Candidate&& C) {
-      operator[](Weight).push_front(C);
-    }
-
-    Candidate &back() {
-      return rbegin()->second.back();
-    }
-
-    void pop_back() {
-      rbegin()->second.pop_back();
-      if (rbegin()->second.empty())
-        erase(rbegin()->first);
-    }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-    void dump(const GCNRegBankReassign *P) const {
-      dbgs() << "\nCandidates:\n\n";
-      for (auto &B : *this) {
-        dbgs() << " Weight " << B.first << ":\n";
-        for (auto &C : B.second)
-          C.dump(P);
-      }
-      dbgs() << "\n\n";
-    }
-#endif
-  };
-
-public:
-  static char ID;
-
-public:
-  GCNRegBankReassign(RegBankReassignMode Mode = RM_BOTH)
-    : MachineFunctionPass(ID), Mode(Mode) {
-    initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  StringRef getPassName() const override { return "GCN RegBank Reassign"; }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<MachineLoopInfo>();
-    AU.addRequired<LiveIntervals>();
-    AU.addRequired<VirtRegMap>();
-    AU.addRequired<LiveRegMatrix>();
-    AU.setPreservesAll();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-
-private:
-  const GCNSubtarget *ST;
-
-  const MachineRegisterInfo *MRI;
-
-  const SIRegisterInfo *TRI;
-
-  MachineLoopInfo *MLI;
-
-  VirtRegMap *VRM;
-
-  LiveRegMatrix *LRM;
-
-  LiveIntervals *LIS;
-
-  RegBankReassignMode Mode;
-
-  unsigned MaxNumVGPRs;
-
-  unsigned MaxNumSGPRs;
-
-  BitVector RegsUsed;
-
-  SmallVector<OperandMask, 8> OperandMasks;
-
-  CandidateList Candidates;
-
-  const MCPhysReg *CSRegs;
-
-  // Returns bank for a phys reg.
-  unsigned getPhysRegBank(Register Reg, unsigned SubReg) const;
-
-  // Return a bit set for each register bank used. 4 banks for VGPRs and
-  // 8 banks for SGPRs.
-  // Registers already processed and recorded in RegsUsed are excluded.
-  // If Bank is not -1 assume Reg:SubReg to belong to that Bank.
-  uint32_t getRegBankMask(Register Reg, unsigned SubReg, int Bank);
-
-  // Analyze one instruction returning the number of stalls and a mask of the
-  // banks used by all operands.
-  // If Reg and Bank are provided, assume all uses of Reg will be replaced with
-  // a register chosen from Bank.
-  std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI,
-                                            Register Reg = Register(),
-                                            unsigned SubReg = 0, int Bank = -1);
-
-  // Return true if register is regular VGPR or SGPR or their tuples.
-  // Returns false for special registers like m0, vcc etc.
-  bool isReassignable(Register Reg) const;
-
-  // Check if registers' defs are old and may be pre-loaded.
-  // Returns 0 if both registers are old enough, 1 or 2 if one or both
-  // registers will not likely be pre-loaded.
-  unsigned getOperandGatherWeight(const MachineInstr& MI,
-                                  Register Reg1,
-                                  Register Reg2,
-                                  unsigned StallCycles) const;
-
-
-  // Find all bank bits in UsedBanks where Mask can be relocated to.
-  unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;
-
-  // Find all bank bits in UsedBanks where Mask can be relocated to.
-  // Bank is relative to the register and not its subregister component.
-  // Returns 0 is a register is not reassignable.
-  unsigned getFreeBanks(Register Reg, unsigned SubReg, unsigned Mask,
-                        unsigned UsedBanks) const;
-
-  // Add cadidate instruction to the work list.
-  void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
-                         unsigned StallCycles);
-
-  // Collect cadidate instructions across function. Returns a number stall
-  // cycles detected. Only counts stalls if Collect is false.
-  unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
-
-  // Remove all candidates that read specified register.
-  void removeCandidates(Register Reg);
-
-  // Compute stalls within the uses of SrcReg replaced by a register from
-  // Bank. If Bank is -1 does not perform substitution. If Collect is set
-  // candidates are collected and added to work list.
-  unsigned computeStallCycles(Register SrcReg,
-                              Register Reg = Register(),
-                              unsigned SubReg = 0, int Bank = -1,
-                              bool Collect = false);
-
-  // Search for a register in Bank unused within LI.
-  // Returns phys reg or NoRegister.
-  MCRegister scavengeReg(LiveInterval &LI, unsigned Bank,
-                         unsigned SubReg) const;
-
-  // Try to reassign candidate. Returns number or stall cycles saved.
-  unsigned tryReassign(Candidate &C);
-
-  bool verifyCycles(MachineFunction &MF,
-                    unsigned OriginalCycles, unsigned CyclesSaved);
-
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-public:
-  Printable printReg(Register Reg, unsigned SubReg = 0) const {
-    return Printable([Reg, SubReg, this](raw_ostream &OS) {
-      if (Reg.isPhysical()) {
-        OS << llvm::printReg(Reg, TRI);
-        return;
-      }
-      if (!VRM->isAssignedReg(Reg))
-        OS << "<unassigned> " << llvm::printReg(Reg, TRI);
-      else
-        OS << llvm::printReg(Reg, TRI) << '('
-           << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
-      if (SubReg)
-        OS << ':' << TRI->getSubRegIndexName(SubReg);
-    });
-  }
-
-  static Printable printBank(unsigned Bank) {
-    return Printable([Bank](raw_ostream &OS) {
-      OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
-    });
-  }
-
-  static void dumpFreeBanks(unsigned FreeBanks) {
-    for (unsigned L = 0; L < NUM_BANKS; ++L)
-      if (FreeBanks & (1 << L))
-        dbgs() << printBank(L) << ' ';
-  }
-#endif
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
-                      false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
-INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
-INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
-                    false, false)
-
-
-char GCNRegBankReassign::ID = 0;
-
-char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
-
-unsigned GCNRegBankReassign::getPhysRegBank(Register Reg,
-                                            unsigned SubReg) const {
-  assert(Reg.isPhysical());
-
-  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-  unsigned Size = TRI->getRegSizeInBits(*RC);
-  if (Size == 16)
-    Reg = TRI->get32BitRegister(Reg);
-  else if (Size > 32) {
-    if (SubReg) {
-      const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg);
-      Reg = TRI->getSubReg(Reg, SubReg);
-      if (TRI->getRegSizeInBits(*SubRC) > 32)
-        Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
-    } else {
-      Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
-    }
-  }
-
-  if (TRI->hasVGPRs(RC)) {
-    unsigned RegNo = Reg - AMDGPU::VGPR0;
-    return RegNo % NUM_VGPR_BANKS;
-  }
-
-  unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2;
-  return RegNo % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
-}
-
-uint32_t GCNRegBankReassign::getRegBankMask(Register Reg, unsigned SubReg,
-                                            int Bank) {
-  if (Reg.isVirtual()) {
-    if (!VRM->isAssignedReg(Reg))
-      return 0;
-
-    Reg = VRM->getPhys(Reg);
-    if (!Reg)
-      return 0;
-    if (SubReg)
-      Reg = TRI->getSubReg(Reg, SubReg);
-  }
-
-  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-  unsigned Size = TRI->getRegSizeInBits(*RC);
-
-  if (Size == 16) {
-    Reg = TRI->get32BitRegister(Reg);
-    Size = 1;
-  } else {
-    Size /= 32;
-    if (Size > 1)
-      Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
-  }
-
-  if (TRI->hasVGPRs(RC)) {
-    // VGPRs have 4 banks assigned in a round-robin fashion.
-    unsigned RegNo = Reg - AMDGPU::VGPR0;
-    uint32_t Mask = maskTrailingOnes<uint32_t>(Size);
-    unsigned Used = 0;
-    // Bitmask lacks an extract method
-    for (unsigned I = 0; I < Size; ++I)
-      if (RegsUsed.test(RegNo + I))
-        Used |= 1 << I;
-    RegsUsed.set(RegNo, RegNo + Size);
-    Mask &= ~Used;
-    Mask <<= (Bank == -1) ? RegNo % NUM_VGPR_BANKS : uint32_t(Bank);
-    return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
-  }
-
-  // SGPRs have 8 banks holding 2 consequitive registers each.
-  unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2;
-  unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
-  if (RegNo + StartBit >= RegsUsed.size())
-    return 0;
-
-  if (Size > 1)
-    Size /= 2;
-  unsigned Mask = (1 << Size) - 1;
-  unsigned Used = 0;
-  for (unsigned I = 0; I < Size; ++I)
-    if (RegsUsed.test(StartBit + RegNo + I))
-      Used |= 1 << I;
-  RegsUsed.set(StartBit + RegNo, StartBit + RegNo + Size);
-  Mask &= ~Used;
-  Mask <<= (Bank == -1) ? RegNo % NUM_SGPR_BANKS
-                        : unsigned(Bank - SGPR_BANK_OFFSET);
-  Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
-  // Reserve 4 bank ids for VGPRs.
-  return Mask << SGPR_BANK_OFFSET;
-}
-
-std::pair<unsigned, unsigned>
-GCNRegBankReassign::analyzeInst(const MachineInstr &MI, Register Reg,
-                                unsigned SubReg, int Bank) {
-  unsigned StallCycles = 0;
-  unsigned UsedBanks = 0;
-
-  if (MI.isMetaInstruction())
-    return std::make_pair(StallCycles, UsedBanks);
-
-  if (!(Mode & RM_SGPR) &&
-      MI.getDesc().TSFlags & (SIInstrFlags::SMRD | SIInstrFlags::SALU))
-    return std::make_pair(StallCycles, UsedBanks);
-
-  RegsUsed.reset();
-  OperandMasks.clear();
-  for (const auto& Op : MI.explicit_uses()) {
-    // Undef can be assigned to any register, so two vregs can be assigned
-    // the same phys reg within the same instruction.
-    if (!Op.isReg() || Op.isUndef())
-      continue;
-
-    const Register R = Op.getReg();
-    const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, R);
-
-    // Do not compute stalls for AGPRs
-    if (TRI->hasAGPRs(RC))
-      continue;
-    if ((Mode != RM_BOTH) && !(Mode & (TRI->hasVGPRs(RC) ? RM_VGPR : RM_SGPR)))
-      continue;
-
-    // Do not compute stalls if sub-register covers all banks
-    if (Op.getSubReg()) {
-      LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg());
-      if (TRI->hasVGPRs(RC)) {
-        if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS)
-          continue;
-      } else {
-        if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS)
-          continue;
-      }
-    }
-
-    unsigned ShiftedBank = Bank;
-
-    if (Bank != -1 && R == Reg && (Op.getSubReg() || SubReg)) {
-      unsigned RegOffset =
-          TRI->getChannelFromSubReg(SubReg ? SubReg : (unsigned)AMDGPU::sub0);
-      unsigned Offset = TRI->getChannelFromSubReg(
-          Op.getSubReg() ? Op.getSubReg() : (unsigned)AMDGPU::sub0);
-      if (Bank < NUM_VGPR_BANKS) {
-        unsigned Shift = ((NUM_VGPR_BANKS + Offset) - RegOffset);
-        ShiftedBank = (Bank + Shift) % NUM_VGPR_BANKS;
-      } else if (Bank >= SGPR_BANK_OFFSET) {
-        unsigned Shift = (NUM_SGPR_BANKS + (Offset >> 1)) - (RegOffset >> 1);
-        ShiftedBank = SGPR_BANK_OFFSET +
-                      (Bank - SGPR_BANK_OFFSET + Shift) % NUM_SGPR_BANKS;
-      }
-    }
-
-    uint32_t Mask = getRegBankMask(R, Op.getSubReg(),
-                                   (Reg == R) ? ShiftedBank : -1);
-    StallCycles += countPopulation(UsedBanks & Mask);
-    UsedBanks |= Mask;
-    OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
-  }
-
-  return std::make_pair(StallCycles, UsedBanks);
-}
-
-unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
-                                                    Register Reg1,
-                                                    Register Reg2,
-                                                    unsigned StallCycles) const
-{
-  unsigned Defs = 0;
-  MachineBasicBlock::const_instr_iterator Def(MI.getIterator());
-  MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin());
-  for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) {
-    if (MI.isDebugInstr())
-      continue;
-    --Def;
-    if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
-      continue;
-    if (Def->modifiesRegister(Reg1, TRI))
-      Defs |= 1;
-    if (Def->modifiesRegister(Reg2, TRI))
-      Defs |= 2;
-  }
-  return countPopulation(Defs);
-}
-
-bool GCNRegBankReassign::isReassignable(Register Reg) const {
-  if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
-    return false;
-
-  // InlineSpiller does not call LRM::assign() after an LI split leaving it
-  // in an inconsistent state, so we cannot call LRM::unassign().
-  // See llvm bug #48911.
-  // Skip reassign if a register has originated from such split.
-  // FIXME: Remove the workaround when bug #48911 is fixed.
-  if (VRM->getPreSplitReg(Reg))
-    return false;
-
-  const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
-
-  Register PhysReg = VRM->getPhys(Reg);
-
-  if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
-    return false;
-
-  for (auto U : MRI->use_nodbg_operands(Reg)) {
-    if (U.isImplicit())
-      return false;
-    const MachineInstr *UseInst = U.getParent();
-    if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
-      return false;
-  }
-
-  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
-  unsigned Size = TRI->getRegSizeInBits(*RC);
-
-  // TODO: Support 16 bit registers. Those needs to be moved with their
-  //       parent VGPR_32 and potentially a sibling 16 bit sub-register.
-  if (Size < 32)
-    return false;
-
-  if (TRI->hasVGPRs(RC))
-    return true;
-
-  if (Size == 16)
-    return AMDGPU::SGPR_LO16RegClass.contains(PhysReg);
-
-  if (Size > 32)
-    PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
-
-  return AMDGPU::SGPR_32RegClass.contains(PhysReg);
-}
-
-unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
-                                          unsigned UsedBanks) const {
-  unsigned Size = countPopulation(Mask);
-  unsigned FreeBanks = 0;
-  unsigned Bank = findFirstSet(Mask);
-
-  UsedBanks &= ~Mask;
-
-  // Find free VGPR banks
-  if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) {
-    for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) {
-      if (Bank == I)
-        continue;
-      unsigned NewMask = ((1 << Size) - 1) << I;
-      NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
-      if (!(UsedBanks & NewMask))
-        FreeBanks |= 1 << I;
-    }
-    return FreeBanks;
-  }
-
-  // Find free SGPR banks
-  // SGPR tuples must be aligned, so step is size in banks it
-  // crosses.
-  Bank -= SGPR_BANK_OFFSET;
-  for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) {
-    if (Bank == I)
-      continue;
-    unsigned NewMask = ((1 << Size) - 1) << I;
-    NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
-    if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET)))
-      FreeBanks |= (1 << SGPR_BANK_OFFSET) << I;
-  }
-
-  return FreeBanks;
-}
-
-unsigned GCNRegBankReassign::getFreeBanks(Register Reg,
-                                          unsigned SubReg,
-                                          unsigned Mask,
-                                          unsigned UsedBanks) const {
-  if (!isReassignable(Reg))
-    return 0;
-
-  unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
-
-  unsigned Offset = TRI->getChannelFromSubReg(SubReg);
-  if (Offset && (Mask & VGPR_BANK_MASK)) {
-    unsigned Shift = Offset;
-    if (Shift >= NUM_VGPR_BANKS)
-      return 0;
-    unsigned VB = FreeBanks & VGPR_BANK_MASK;
-    FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
-                VGPR_BANK_MASK;
-  } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) {
-    unsigned Shift = Offset >> 1;
-    if (Shift >= NUM_SGPR_BANKS)
-      return 0;
-    unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
-    FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) &
-                SGPR_BANK_SHIFTED_MASK;
-    FreeBanks <<= SGPR_BANK_OFFSET;
-  }
-
-  LLVM_DEBUG(if (FreeBanks) {
-          dbgs() << "Potential reassignments of " << printReg(Reg, SubReg)
-                 << " to banks: "; dumpFreeBanks(FreeBanks);
-          dbgs() << '\n'; });
-
-  return FreeBanks;
-}
-
-void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
-                                           unsigned UsedBanks,
-                                           unsigned StallCycles) {
-  LLVM_DEBUG(MI.dump());
-
-  if (!StallCycles)
-    return;
-
-  LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n');
-
-  for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) {
-    for (unsigned J = I + 1; J != E; ++J) {
-      if (!(OperandMasks[I].Mask & OperandMasks[J].Mask))
-        continue;
-
-      Register Reg1 = OperandMasks[I].Reg;
-      Register Reg2 = OperandMasks[J].Reg;
-      unsigned SubReg1 = OperandMasks[I].SubReg;
-      unsigned SubReg2 = OperandMasks[J].SubReg;
-      unsigned Mask1 = OperandMasks[I].Mask;
-      unsigned Mask2 = OperandMasks[J].Mask;
-      unsigned Size1 = countPopulation(Mask1);
-      unsigned Size2 = countPopulation(Mask2);
-
-      LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) <<
-                      " and " << printReg(Reg2, SubReg2) << '\n');
-
-      unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles);
-      Weight += MLI->getLoopDepth(MI.getParent()) * 10;
-
-      LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n');
-
-      unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
-      unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
-      if (FreeBanks1)
-        Candidates.push(Weight + ((Size2 > Size1) ? 1 : 0),
-                        Candidate(&MI, Reg1, SubReg1, FreeBanks1));
-      if (FreeBanks2)
-        Candidates.push(Weight + ((Size1 > Size2) ? 1 : 0),
-                        Candidate(&MI, Reg2, SubReg2, FreeBanks2));
-    }
-  }
-}
-
-unsigned GCNRegBankReassign::computeStallCycles(Register SrcReg, Register Reg,
-                                                unsigned SubReg, int Bank,
-                                                bool Collect) {
-  unsigned TotalStallCycles = 0;
-  SmallSet<const MachineInstr *, 16> Visited;
-
-  for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) {
-    if (MI.isBundle())
-      continue;
-    if (!Visited.insert(&MI).second)
-      continue;
-    unsigned StallCycles;
-    unsigned UsedBanks;
-    std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, SubReg, Bank);
-    TotalStallCycles += StallCycles;
-    if (Collect)
-      collectCandidates(MI, UsedBanks, StallCycles);
-  }
-
-  return TotalStallCycles;
-}
-
-MCRegister GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank,
-                                           unsigned SubReg) const {
-  const TargetRegisterClass *RC = MRI->getRegClass(LI.reg());
-  unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
-                                                : MaxNumSGPRs;
-  unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0
-                                                        : AMDGPU::SGPR0);
-
-  for (MCRegister Reg : RC->getRegisters()) {
-    // Check occupancy limit.
-    if (TRI->isSubRegisterEq(Reg, MaxReg))
-      break;
-
-    if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg, SubReg) != Bank)
-      continue;
-
-    for (unsigned I = 0; CSRegs[I]; ++I)
-      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
-          !LRM->isPhysRegUsed(CSRegs[I]))
-        return MCRegister::from(AMDGPU::NoRegister);
-
-    LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n');
-
-    if (!LRM->checkInterference(LI, Reg))
-      return Reg;
-  }
-
-  return MCRegister::from(AMDGPU::NoRegister);
-}
-
-unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
-  if (!LIS->hasInterval(C.Reg))
-    return 0;
-
-  LiveInterval &LI = LIS->getInterval(C.Reg);
-  LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump();
-             LI.dump());
-
-  // For each candidate bank walk all instructions in the range of live
-  // interval and check if replacing the register with one belonging to
-  // the candidate bank reduces conflicts.
-
-  unsigned OrigStalls = computeStallCycles(C.Reg);
-  LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n');
-  if (!OrigStalls)
-    return 0;
-
-  struct BankStall {
-    BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
-    bool operator<(const BankStall &RHS) const {
-      if (Stalls == RHS.Stalls)
-        return Bank < RHS.Bank;
-      return Stalls > RHS.Stalls;
-    }
-    unsigned Bank;
-    unsigned Stalls;
-  };
-  SmallVector<BankStall, 8> BankStalls;
-
-  for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
-    if (C.FreeBanks & (1 << Bank)) {
-      LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
-      unsigned Stalls = computeStallCycles(C.Reg, C.Reg, C.SubReg, Bank);
-      if (Stalls < OrigStalls) {
-        LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
-                     << Stalls << '\n');
-        BankStalls.push_back(BankStall((unsigned)Bank, Stalls));
-      }
-    }
-  }
-  llvm::sort(BankStalls);
-
-  MCRegister OrigReg = VRM->getPhys(C.Reg);
-  LRM->unassign(LI);
-  while (!BankStalls.empty()) {
-    BankStall BS = BankStalls.pop_back_val();
-    MCRegister Reg = scavengeReg(LI, BS.Bank, C.SubReg);
-    if (Reg == AMDGPU::NoRegister) {
-      LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
-                   << '\n');
-      continue;
-    }
-    LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg)
-                 << (LRM->isPhysRegUsed(Reg) ? "" : " (new)")
-                 << " in bank " << printBank(BS.Bank) << '\n');
-
-    LRM->assign(LI, Reg);
-
-    LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n');
-
-    return OrigStalls - BS.Stalls;
-  }
-  LRM->assign(LI, OrigReg);
-
-  return 0;
-}
-
-unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
-                                               bool Collect) {
-  unsigned TotalStallCycles = 0;
-
-  for (MachineBasicBlock &MBB : MF) {
-
-    LLVM_DEBUG(if (Collect) {
-            if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber();
-            else dbgs() << MBB.getName(); dbgs() << ":\n";
-          });
-
-    for (MachineInstr &MI : MBB.instrs()) {
-      if (MI.isBundle())
-          continue; // we analyze the instructions inside the bundle individually
-
-      unsigned StallCycles;
-      unsigned UsedBanks;
-      std::tie(StallCycles, UsedBanks) = analyzeInst(MI);
-
-      if (Collect)
-        collectCandidates(MI, UsedBanks, StallCycles);
-
-      TotalStallCycles += StallCycles;
-    }
-
-    LLVM_DEBUG(if (Collect) { dbgs() << '\n'; });
-  }
-
-  return TotalStallCycles;
-}
-
-void GCNRegBankReassign::removeCandidates(Register Reg) {
-  typename CandidateList::iterator Next;
-  for (auto I = Candidates.begin(), E = Candidates.end(); I != E; I = Next) {
-    Next = std::next(I);
-    I->second.remove_if([Reg, this](const Candidate& C) {
-      return C.MI->readsRegister(Reg, TRI);
-    });
-    if (I->second.empty())
-      Candidates.erase(I);
-  }
-}
-
-bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
-                                      unsigned OriginalCycles,
-                                      unsigned CyclesSaved) {
-  unsigned StallCycles = collectCandidates(MF, false);
-  LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles
-               << " stall cycles left\n");
-  return StallCycles + CyclesSaved == OriginalCycles;
-}
-
-bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
-  ST = &MF.getSubtarget<GCNSubtarget>();
-  if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction()))
-    return false;
-
-  MRI = &MF.getRegInfo();
-
-  LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function "
-                    << MF.getName() << '\n'
-                    << ((Mode & RM_VGPR) ? "VGPR " : "")
-                    << ((Mode & RM_SGPR) ? "SGPR " : "") << "mode\n"
-                    << "NumVirtRegs = " << MRI->getNumVirtRegs() << "\n\n");
-
-  if (MRI->getNumVirtRegs() > VRegThresh) {
-    LLVM_DEBUG(dbgs() << "NumVirtRegs > " << VRegThresh
-                      << " threshold, skipping function.\n\n");
-    return false;
-  }
-
-  TRI = ST->getRegisterInfo();
-  MLI = &getAnalysis<MachineLoopInfo>();
-  VRM = &getAnalysis<VirtRegMap>();
-  LRM = &getAnalysis<LiveRegMatrix>();
-  LIS = &getAnalysis<LiveIntervals>();
-
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  unsigned Occupancy = MFI->getOccupancy();
-  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
-  MaxNumSGPRs = ST->getMaxNumSGPRs(MF);
-  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs);
-  MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
-
-  CSRegs = MRI->getCalleeSavedRegs();
-  unsigned NumRegBanks = AMDGPU::VGPR_32RegClass.getNumRegs() +
-                         // Not a tight bound
-                         AMDGPU::SReg_32RegClass.getNumRegs() / 2 + 1;
-  RegsUsed.resize(NumRegBanks);
-
-  unsigned StallCycles = collectCandidates(MF);
-  NumStallsDetected += StallCycles;
-
-  LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
-                  "function " << MF.getName() << '\n');
-
-  LLVM_DEBUG(Candidates.dump(this));
-
-  unsigned CyclesSaved = 0;
-  while (!Candidates.empty()) {
-    Candidate C = Candidates.back();
-    unsigned LocalCyclesSaved = tryReassign(C);
-    CyclesSaved += LocalCyclesSaved;
-
-    if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
-      report_fatal_error("RegBank reassign stall cycles verification failed.");
-
-    Candidates.pop_back();
-    if (LocalCyclesSaved) {
-      removeCandidates(C.Reg);
-      computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true);
-
-      LLVM_DEBUG(Candidates.dump(this));
-    }
-  }
-  NumStallsRecovered += CyclesSaved;
-
-  LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
-               << " cycles saved in function " << MF.getName() << '\n');
-
-  Candidates.clear();
-
-  if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
-    report_fatal_error("RegBank reassign stall cycles verification failed.");
-
-  RegsUsed.clear();
-
-  return CyclesSaved > 0;
-}
-
-MachineFunctionPass *
-llvm::createGCNRegBankReassignPass(RegBankReassignMode Mode) {
-  return new GCNRegBankReassign(Mode);
-}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index c3e31672c852..a178f055ac06 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1643,12 +1643,8 @@ define <2 x i64> @v_ashr_v2i64(<2 x i64> %value, <2 x i64> %amount) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v7, v2
-; GFX10-NEXT:    v_mov_b32_e32 v8, v3
-; GFX10-NEXT:    v_ashrrev_i64 v[0:1], v4, v[10:11]
-; GFX10-NEXT:    v_ashrrev_i64 v[2:3], v6, v[7:8]
+; GFX10-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
+; GFX10-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr <2 x i64> %value, %amount
   ret <2 x i64> %result

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
index 80b599f441a2..57410918e0c2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
@@ -314,45 +314,45 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr,
 ; GFX10-NEXT:    v_add_nc_u32_e32 v19, 1, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v19
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v3, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v3, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v23, v4, v6, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v22, v3, v5, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 2, v19
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, v18, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v15, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v15, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v16, v8, vcc_lo
+; GFX10-NEXT:    global_load_dwordx4 v[15:18], v[0:1], off offset:48
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v22, v7, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v23, v8, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 3, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v10, vcc_lo
-; GFX10-NEXT:    global_load_dwordx4 v[15:18], v[0:1], off offset:48
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v27, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v22, v3, v9, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v4, v10, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 4, v19
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v11, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v22, v11, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v7, v12, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 5, v19
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v3, v13, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v4, v14, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v13, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v14, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v19
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v15, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v16, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v15, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v7, v16, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v16, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 7, v19
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v17, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v18, vcc_lo
@@ -577,54 +577,54 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(<4 x i128> addrspace(
 ;
 ; GFX10-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x0
+; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 1, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s10
-; GFX10-NEXT:    v_mov_b32_e32 v3, s11
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, s8, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, s9, v3, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-NEXT:    v_mov_b32_e32 v3, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, s4, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, s5, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, s8, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, s9, v3, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, s4, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, s5, v3, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s9, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s8, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s9, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 3, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s10, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s11, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 4, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s12, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s13, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s12, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s13, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 3, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 5, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s14, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s15, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v0
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s14, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s15, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s14, 4, v1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 5, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 6, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s16, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s17, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s16, s14
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s17, s14
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s18, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s19, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s18, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s19, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 6, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s20, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s21, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s20, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s21, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s16, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s17, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 7, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, s23, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, s22, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s22, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s23, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, s19, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, s18, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s18, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s19, s0
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
index 525b2c2ec45a..3a88af6fb5dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
@@ -581,9 +581,9 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(<8 x i16> addrspace(1)* %ptr, i32
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v6, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
index 50def72b7425..c820562bf9f8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
@@ -223,9 +223,9 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 %i
 ; GFX10-NEXT:    s_movk_i32 s4, 0xff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX10-NEXT:    v_and_b32_sdwa v4, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_or_b32 v6, v0, s4, v1
+; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 3, v2
-; GFX10-NEXT:    v_or3_b32 v0, v6, v4, v3
+; GFX10-NEXT:    v_or3_b32 v0, v0, v4, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1036,12 +1036,12 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p
 ; GFX10-NEXT:    v_and_b32_sdwa v7, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX10-NEXT:    v_and_or_b32 v11, v0, s1, v2
-; GFX10-NEXT:    v_and_or_b32 v2, v1, s1, v3
+; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v2
+; GFX10-NEXT:    v_and_or_b32 v1, v1, s1, v3
 ; GFX10-NEXT:    s_lshr_b32 s0, s2, 2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX10-NEXT:    v_or3_b32 v0, v11, v6, v4
-; GFX10-NEXT:    v_or3_b32 v1, v2, v7, v5
+; GFX10-NEXT:    v_or3_b32 v0, v0, v6, v4
+; GFX10-NEXT:    v_or3_b32 v1, v1, v7, v5
 ; GFX10-NEXT:    s_and_b32 s0, s2, 3
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -2613,25 +2613,25 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)*
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
 ; GFX10-NEXT:    v_and_b32_sdwa v13, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 8, v3
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v19, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_and_b32_sdwa v14, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-NEXT:    v_and_b32_sdwa v14, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 8, v3
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX10-NEXT:    v_and_or_b32 v23, v1, s1, v8
+; GFX10-NEXT:    v_and_or_b32 v1, v1, s1, v8
 ; GFX10-NEXT:    s_lshr_b32 s0, s2, 2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_b32_sdwa v17, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v15, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX10-NEXT:    v_and_or_b32 v2, v2, s1, v19
+; GFX10-NEXT:    v_and_or_b32 v2, v2, s1, v10
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v13, v7
-; GFX10-NEXT:    v_or3_b32 v1, v23, v14, v9
+; GFX10-NEXT:    v_or3_b32 v1, v1, v14, v9
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
 ; GFX10-NEXT:    v_and_or_b32 v5, v3, v4, v5
 ; GFX10-NEXT:    v_and_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_or3_b32 v2, v2, v17, v11
+; GFX10-NEXT:    v_or3_b32 v2, v2, v15, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 647d22b68fc9..a944adb4375e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -186,9 +186,9 @@ define float @dyn_extract_v8f32_v_v(<8 x float> %vec, i32 %sel) {
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v8
@@ -227,9 +227,9 @@ define amdgpu_ps float @dyn_extract_v8f32_v_s(<8 x float> %vec, i32 inreg %sel)
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 3
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 6
@@ -346,20 +346,20 @@ define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) {
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX10-NEXT:    s_mov_b64 s[4:5], 1
 ; GFX10-NEXT:    s_mov_b64 s[8:9], 3
-; GFX10-NEXT:    s_mov_b64 s[14:15], 4
+; GFX10-NEXT:    s_mov_b64 s[10:11], 4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, s4, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, s5, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v0
 ; GFX10-NEXT:    s_mov_b64 s[12:13], 5
+; GFX10-NEXT:    s_mov_b64 s[14:15], 6
 ; GFX10-NEXT:    s_mov_b64 s[16:17], 7
 ; GFX10-NEXT:    s_mov_b64 s[18:19], 8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s8, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s14, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v0
-; GFX10-NEXT:    s_mov_b64 s[14:15], 6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s12, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s13, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v0
@@ -561,11 +561,11 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) {
 ; GFX10-NEXT:    s_mov_b32 s7, s9
 ; GFX10-NEXT:    s_mov_b32 s8, s10
 ; GFX10-NEXT:    s_mov_b32 s9, s11
-; GFX10-NEXT:    s_mov_b32 s46, s12
+; GFX10-NEXT:    s_mov_b32 s10, s12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT:    s_mov_b32 s47, s13
+; GFX10-NEXT:    s_mov_b32 s11, s13
 ; GFX10-NEXT:    s_mov_b32 s12, s14
 ; GFX10-NEXT:    s_mov_b32 s13, s15
 ; GFX10-NEXT:    s_mov_b32 s14, s16
@@ -576,8 +576,8 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s8, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s46, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s47, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s12, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s13, vcc_lo
@@ -624,23 +624,23 @@ define i64 @dyn_extract_v8i64_v_v(<8 x i64> %vec, i32 %sel) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v19, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc_lo
@@ -860,9 +860,9 @@ define float @dyn_extract_v8f32_v_v_offset3(<8 x float> %vec, i32 %sel) {
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v8
@@ -1360,23 +1360,23 @@ define double @dyn_extract_v8f64_v_v_offset3(<8 x double> %vec, i32 %sel) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v16, 3, v16
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v19, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc_lo
@@ -1416,9 +1416,9 @@ define i8 addrspace(3)* @dyn_extract_v8p3_v_v(<8 x i8 addrspace(3)*> %vec, i32 %
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v8
@@ -1530,23 +1530,23 @@ define i8 addrspace(1)* @dyn_extract_v8p1_v_v(<8 x i8 addrspace(1)*> %vec, i32 %
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v19, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc_lo
@@ -2001,9 +2001,9 @@ define float @dyn_extract_v6f32_v_v(<6 x float> %vec, i32 %sel) {
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -2034,9 +2034,9 @@ define amdgpu_ps float @dyn_extract_v6f32_v_s(<6 x float> %vec, i32 inreg %sel)
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 3
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -2162,9 +2162,9 @@ define float @dyn_extract_v7f32_v_v(<7 x float> %vec, i32 %sel) {
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v7
@@ -2199,9 +2199,9 @@ define amdgpu_ps float @dyn_extract_v7f32_v_s(<7 x float> %vec, i32 inreg %sel)
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 3
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 6
@@ -2311,19 +2311,19 @@ define amdgpu_ps double @dyn_extract_v6f64_s_v(<6 x double> inreg %vec, i32 %sel
 ; GFX10-NEXT:    s_mov_b32 s7, s9
 ; GFX10-NEXT:    s_mov_b32 s8, s10
 ; GFX10-NEXT:    s_mov_b32 s9, s11
-; GFX10-NEXT:    s_mov_b32 s14, s12
+; GFX10-NEXT:    s_mov_b32 s10, s12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT:    s_mov_b32 s47, s13
+; GFX10-NEXT:    s_mov_b32 s11, s13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s7, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s8, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s14, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s47, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s11, vcc_lo
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -2358,17 +2358,17 @@ define double @dyn_extract_v6f64_v_v(<6 x double> %vec, i32 %sel) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v15, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
@@ -2520,11 +2520,11 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel
 ; GFX10-NEXT:    s_mov_b32 s7, s9
 ; GFX10-NEXT:    s_mov_b32 s8, s10
 ; GFX10-NEXT:    s_mov_b32 s9, s11
-; GFX10-NEXT:    s_mov_b32 s46, s12
+; GFX10-NEXT:    s_mov_b32 s10, s12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT:    s_mov_b32 s47, s13
+; GFX10-NEXT:    s_mov_b32 s11, s13
 ; GFX10-NEXT:    s_mov_b32 s12, s14
 ; GFX10-NEXT:    s_mov_b32 s13, s15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s6, vcc_lo
@@ -2533,8 +2533,8 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s8, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s46, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s47, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s12, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s13, vcc_lo
@@ -2575,23 +2575,23 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v15, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <7 x double> %vec, i32 %sel
@@ -3168,8 +3168,8 @@ define amdgpu_ps float @dyn_extract_v15f32_s_v(<15 x float> inreg %vec, i32 %sel
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
 ; GFX10-NEXT:    s_mov_b32 s9, s11
-; GFX10-NEXT:    s_mov_b32 s46, s12
-; GFX10-NEXT:    s_mov_b32 s47, s13
+; GFX10-NEXT:    s_mov_b32 s10, s12
+; GFX10-NEXT:    s_mov_b32 s11, s13
 ; GFX10-NEXT:    s_mov_b32 s12, s14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v0
@@ -3187,9 +3187,9 @@ define amdgpu_ps float @dyn_extract_v15f32_s_v(<15 x float> inreg %vec, i32 %sel
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 9, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 10, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s46, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s10, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 11, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s47, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 12, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s12, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 13, v0
@@ -3245,25 +3245,25 @@ define float @dyn_extract_v15f32_v_v(<15 x float> %vec, i32 %sel) {
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 8, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 9, v15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 10, v15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 11, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 12, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 13, v15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 14, v15
@@ -3476,25 +3476,25 @@ define float @dyn_extract_v15f32_v_v_offset3(<15 x float> %vec, i32 %sel) {
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 8, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 9, v15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 10, v15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 11, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 12, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 13, v15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 14, v15

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index b399aad2faf0..de48249ae006 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -637,9 +637,9 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v4, v7, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v6, v3
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v7, v4
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT:    v_div_fixup_f16 v0, v7, v1, v0
+; GFX10-NEXT:    v_div_fixup_f16 v0, v4, v1, v0
 ; GFX10-NEXT:    v_div_fixup_f16 v2, v3, v2, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xffff, v1
@@ -849,9 +849,9 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v4, v7, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v6, v3
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v7, v4
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT:    v_div_fixup_f16 v0, v7, v1, v0
+; GFX10-NEXT:    v_div_fixup_f16 v0, v4, v1, v0
 ; GFX10-NEXT:    v_div_fixup_f16 v2, v3, v2, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xffff, v1
@@ -1515,9 +1515,9 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v4, v7, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v6, v3
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v7, v4
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT:    v_div_fixup_f16 v0, v7, v1, v0
+; GFX10-NEXT:    v_div_fixup_f16 v0, v4, v1, v0
 ; GFX10-NEXT:    v_div_fixup_f16 v2, v3, v2, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xffff, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
index 6431eabf459a..aa6a244a1254 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
@@ -712,27 +712,27 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX10-IEEE-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v4, s4, v2, v2, v0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v5, s4, v3, v3, v1
-; GFX10-IEEE-NEXT:    v_div_scale_f32 v15, vcc_lo, v0, v2, v0
+; GFX10-IEEE-NEXT:    v_div_scale_f32 v10, vcc_lo, v0, v2, v0
 ; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v6, v4
-; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v10, v5
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v7, v5
 ; GFX10-IEEE-NEXT:    v_fma_f32 v8, -v4, v6, 1.0
-; GFX10-IEEE-NEXT:    v_fma_f32 v9, -v5, v10, 1.0
+; GFX10-IEEE-NEXT:    v_fma_f32 v9, -v5, v7, 1.0
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v6, v8, v6
-; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v10, v9, v10
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v7, v9, v7
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v8, s4, v1, v3, v1
-; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v15, v6
-; GFX10-IEEE-NEXT:    v_mul_f32_e32 v11, v8, v10
-; GFX10-IEEE-NEXT:    v_fma_f32 v12, v9, -v4, v15
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v10, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v11, v8, v7
+; GFX10-IEEE-NEXT:    v_fma_f32 v12, v9, -v4, v10
 ; GFX10-IEEE-NEXT:    v_fma_f32 v13, v11, -v5, v8
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v9, v12, v6
-; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v11, v13, v10
-; GFX10-IEEE-NEXT:    v_fmac_f32_e64 v15, -v4, v9
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v11, v13, v7
+; GFX10-IEEE-NEXT:    v_fmac_f32_e64 v10, -v4, v9
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e64 v8, -v5, v11
-; GFX10-IEEE-NEXT:    v_div_fmas_f32 v7, v15, v6, v9
+; GFX10-IEEE-NEXT:    v_div_fmas_f32 v4, v10, v6, v9
 ; GFX10-IEEE-NEXT:    s_mov_b32 vcc_lo, s4
-; GFX10-IEEE-NEXT:    v_div_fmas_f32 v6, v8, v10, v11
-; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v7, v2, v0
-; GFX10-IEEE-NEXT:    v_div_fixup_f32 v1, v6, v3, v1
+; GFX10-IEEE-NEXT:    v_div_fmas_f32 v5, v8, v7, v11
+; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
+; GFX10-IEEE-NEXT:    v_div_fixup_f32 v1, v5, v3, v1
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_v2f32:
@@ -752,18 +752,18 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX10-FLUSH-NEXT:    s_denorm_mode 0
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v4, s4, v3, v3, v1
 ; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v5, v6, v5, v7
-; GFX10-FLUSH-NEXT:    v_div_scale_f32 v11, vcc_lo, v1, v3, v1
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v6, v4
 ; GFX10-FLUSH-NEXT:    v_div_fixup_f32 v0, v5, v2, v0
+; GFX10-FLUSH-NEXT:    v_div_scale_f32 v2, vcc_lo, v1, v3, v1
 ; GFX10-FLUSH-NEXT:    s_denorm_mode 3
 ; GFX10-FLUSH-NEXT:    v_fma_f32 v5, -v4, v6, 1.0
 ; GFX10-FLUSH-NEXT:    v_fmac_f32_e32 v6, v5, v6
-; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v5, v11, v6
-; GFX10-FLUSH-NEXT:    v_fma_f32 v7, v5, -v4, v11
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v5, v2, v6
+; GFX10-FLUSH-NEXT:    v_fma_f32 v7, v5, -v4, v2
 ; GFX10-FLUSH-NEXT:    v_fmac_f32_e32 v5, v7, v6
-; GFX10-FLUSH-NEXT:    v_fmac_f32_e64 v11, -v4, v5
+; GFX10-FLUSH-NEXT:    v_fmac_f32_e64 v2, -v4, v5
 ; GFX10-FLUSH-NEXT:    s_denorm_mode 0
-; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v2, v11, v6, v5
+; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v6, v5
 ; GFX10-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x float> %a, %b
@@ -874,27 +874,27 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX10-IEEE-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v4, s4, v2, v2, v0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v5, s4, v3, v3, v1
-; GFX10-IEEE-NEXT:    v_div_scale_f32 v15, vcc_lo, v0, v2, v0
+; GFX10-IEEE-NEXT:    v_div_scale_f32 v10, vcc_lo, v0, v2, v0
 ; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v6, v4
-; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v10, v5
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v7, v5
 ; GFX10-IEEE-NEXT:    v_fma_f32 v8, -v4, v6, 1.0
-; GFX10-IEEE-NEXT:    v_fma_f32 v9, -v5, v10, 1.0
+; GFX10-IEEE-NEXT:    v_fma_f32 v9, -v5, v7, 1.0
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v6, v8, v6
-; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v10, v9, v10
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v7, v9, v7
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v8, s4, v1, v3, v1
-; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v15, v6
-; GFX10-IEEE-NEXT:    v_mul_f32_e32 v11, v8, v10
-; GFX10-IEEE-NEXT:    v_fma_f32 v12, v9, -v4, v15
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v10, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v11, v8, v7
+; GFX10-IEEE-NEXT:    v_fma_f32 v12, v9, -v4, v10
 ; GFX10-IEEE-NEXT:    v_fma_f32 v13, v11, -v5, v8
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v9, v12, v6
-; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v11, v13, v10
-; GFX10-IEEE-NEXT:    v_fmac_f32_e64 v15, -v4, v9
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v11, v13, v7
+; GFX10-IEEE-NEXT:    v_fmac_f32_e64 v10, -v4, v9
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e64 v8, -v5, v11
-; GFX10-IEEE-NEXT:    v_div_fmas_f32 v7, v15, v6, v9
+; GFX10-IEEE-NEXT:    v_div_fmas_f32 v4, v10, v6, v9
 ; GFX10-IEEE-NEXT:    s_mov_b32 vcc_lo, s4
-; GFX10-IEEE-NEXT:    v_div_fmas_f32 v6, v8, v10, v11
-; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v7, v2, v0
-; GFX10-IEEE-NEXT:    v_div_fixup_f32 v1, v6, v3, v1
+; GFX10-IEEE-NEXT:    v_div_fmas_f32 v5, v8, v7, v11
+; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
+; GFX10-IEEE-NEXT:    v_div_fixup_f32 v1, v5, v3, v1
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_v2f32_ulp25:
@@ -905,16 +905,16 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX10-FLUSH-NEXT:    s_mov_b32 s5, 0x2f800000
 ; GFX10-FLUSH-NEXT:    v_cmp_gt_f32_e64 s6, |v2|, s4
 ; GFX10-FLUSH-NEXT:    v_cmp_gt_f32_e64 s4, |v3|, s4
-; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v7, 1.0, s5, s6
-; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v6, 1.0, s5, s4
-; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v4, 1.0, s5, s6
+; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v5, 1.0, s5, s4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v5
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v7, v0
-; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v6, v1
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v4, v0
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v5, v1
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x float> %a, %b, !fpmath !0
   ret <2 x float> %fdiv
@@ -1044,25 +1044,25 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX10-IEEE-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v2, s4, v0, v0, 1.0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v3, s4, v1, v1, 1.0
-; GFX10-IEEE-NEXT:    v_div_scale_f32 v13, vcc_lo, 1.0, v0, 1.0
+; GFX10-IEEE-NEXT:    v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0
 ; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
-; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v8, v3
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v5, v3
 ; GFX10-IEEE-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
-; GFX10-IEEE-NEXT:    v_fma_f32 v7, -v3, v8, 1.0
+; GFX10-IEEE-NEXT:    v_fma_f32 v7, -v3, v5, 1.0
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v4, v6, v4
-; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v8, v7, v8
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v5, v7, v5
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v6, s4, 1.0, v1, 1.0
-; GFX10-IEEE-NEXT:    v_mul_f32_e32 v7, v13, v4
-; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v6, v8
-; GFX10-IEEE-NEXT:    v_fma_f32 v10, v7, -v2, v13
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v7, v8, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v6, v5
+; GFX10-IEEE-NEXT:    v_fma_f32 v10, v7, -v2, v8
 ; GFX10-IEEE-NEXT:    v_fma_f32 v11, v9, -v3, v6
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v7, v10, v4
-; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v9, v11, v8
-; GFX10-IEEE-NEXT:    v_fmac_f32_e64 v13, -v2, v7
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v9, v11, v5
+; GFX10-IEEE-NEXT:    v_fmac_f32_e64 v8, -v2, v7
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e64 v6, -v3, v9
-; GFX10-IEEE-NEXT:    v_div_fmas_f32 v2, v13, v4, v7
+; GFX10-IEEE-NEXT:    v_div_fmas_f32 v2, v8, v4, v7
 ; GFX10-IEEE-NEXT:    s_mov_b32 vcc_lo, s4
-; GFX10-IEEE-NEXT:    v_div_fmas_f32 v3, v6, v8, v9
+; GFX10-IEEE-NEXT:    v_div_fmas_f32 v3, v6, v5, v9
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, 1.0
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
@@ -1226,25 +1226,25 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX10-IEEE-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v2, s4, v0, v0, 1.0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v3, s4, v1, v1, 1.0
-; GFX10-IEEE-NEXT:    v_div_scale_f32 v13, vcc_lo, 1.0, v0, 1.0
+; GFX10-IEEE-NEXT:    v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0
 ; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
-; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v8, v3
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v5, v3
 ; GFX10-IEEE-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
-; GFX10-IEEE-NEXT:    v_fma_f32 v7, -v3, v8, 1.0
+; GFX10-IEEE-NEXT:    v_fma_f32 v7, -v3, v5, 1.0
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v4, v6, v4
-; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v8, v7, v8
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v5, v7, v5
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v6, s4, 1.0, v1, 1.0
-; GFX10-IEEE-NEXT:    v_mul_f32_e32 v7, v13, v4
-; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v6, v8
-; GFX10-IEEE-NEXT:    v_fma_f32 v10, v7, -v2, v13
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v7, v8, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v6, v5
+; GFX10-IEEE-NEXT:    v_fma_f32 v10, v7, -v2, v8
 ; GFX10-IEEE-NEXT:    v_fma_f32 v11, v9, -v3, v6
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v7, v10, v4
-; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v9, v11, v8
-; GFX10-IEEE-NEXT:    v_fmac_f32_e64 v13, -v2, v7
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v9, v11, v5
+; GFX10-IEEE-NEXT:    v_fmac_f32_e64 v8, -v2, v7
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e64 v6, -v3, v9
-; GFX10-IEEE-NEXT:    v_div_fmas_f32 v2, v13, v4, v7
+; GFX10-IEEE-NEXT:    v_div_fmas_f32 v2, v8, v4, v7
 ; GFX10-IEEE-NEXT:    s_mov_b32 vcc_lo, s4
-; GFX10-IEEE-NEXT:    v_div_fmas_f32 v3, v6, v8, v9
+; GFX10-IEEE-NEXT:    v_div_fmas_f32 v3, v6, v5, v9
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, 1.0
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
@@ -1465,27 +1465,27 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX10-IEEE-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v4, s4, v2, v2, v0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v5, s4, v3, v3, v1
-; GFX10-IEEE-NEXT:    v_div_scale_f32 v15, vcc_lo, v0, v2, v0
+; GFX10-IEEE-NEXT:    v_div_scale_f32 v10, vcc_lo, v0, v2, v0
 ; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v6, v4
-; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v10, v5
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v7, v5
 ; GFX10-IEEE-NEXT:    v_fma_f32 v8, -v4, v6, 1.0
-; GFX10-IEEE-NEXT:    v_fma_f32 v9, -v5, v10, 1.0
+; GFX10-IEEE-NEXT:    v_fma_f32 v9, -v5, v7, 1.0
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v6, v8, v6
-; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v10, v9, v10
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v7, v9, v7
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v8, s4, v1, v3, v1
-; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v15, v6
-; GFX10-IEEE-NEXT:    v_mul_f32_e32 v11, v8, v10
-; GFX10-IEEE-NEXT:    v_fma_f32 v12, v9, -v4, v15
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v10, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v11, v8, v7
+; GFX10-IEEE-NEXT:    v_fma_f32 v12, v9, -v4, v10
 ; GFX10-IEEE-NEXT:    v_fma_f32 v13, v11, -v5, v8
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v9, v12, v6
-; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v11, v13, v10
-; GFX10-IEEE-NEXT:    v_fmac_f32_e64 v15, -v4, v9
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v11, v13, v7
+; GFX10-IEEE-NEXT:    v_fmac_f32_e64 v10, -v4, v9
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e64 v8, -v5, v11
-; GFX10-IEEE-NEXT:    v_div_fmas_f32 v7, v15, v6, v9
+; GFX10-IEEE-NEXT:    v_div_fmas_f32 v4, v10, v6, v9
 ; GFX10-IEEE-NEXT:    s_mov_b32 vcc_lo, s4
-; GFX10-IEEE-NEXT:    v_div_fmas_f32 v6, v8, v10, v11
-; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v7, v2, v0
-; GFX10-IEEE-NEXT:    v_div_fixup_f32 v1, v6, v3, v1
+; GFX10-IEEE-NEXT:    v_div_fmas_f32 v5, v8, v7, v11
+; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
+; GFX10-IEEE-NEXT:    v_div_fixup_f32 v1, v5, v3, v1
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25:
@@ -1496,16 +1496,16 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX10-FLUSH-NEXT:    s_mov_b32 s5, 0x2f800000
 ; GFX10-FLUSH-NEXT:    v_cmp_gt_f32_e64 s6, |v2|, s4
 ; GFX10-FLUSH-NEXT:    v_cmp_gt_f32_e64 s4, |v3|, s4
-; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v7, 1.0, s5, s6
-; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v6, 1.0, s5, s4
-; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v4, 1.0, s5, s6
+; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v5, 1.0, s5, s4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v5
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v7, v0
-; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v6, v1
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v4, v0
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v5, v1
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
   ret <2 x float> %fdiv

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
index 946f54d1178b..ca836897baa4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
@@ -105,10 +105,10 @@ define double @v_fdiv_f64_afn(double %a, double %b) {
 ; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
 ; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[6:7], v[4:5], v[4:5]
-; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
 ; GFX10-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
-; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[10:11], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn double %a, %b
   ret double %fdiv
@@ -355,9 +355,9 @@ define double @v_rcp_f64_arcp_afn(double %x) {
 ; GFX10-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; GFX10-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; GFX10-NEXT:    v_mul_f64 v[6:7], 1.0, v[2:3]
-; GFX10-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[6:7], 1.0
-; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
+; GFX10-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; GFX10-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp afn double 1.0, %x
   ret double %fdiv
@@ -458,10 +458,10 @@ define double @v_fdiv_f64_afn_ulp25(double %a, double %b) {
 ; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
 ; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[6:7], v[4:5], v[4:5]
-; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
 ; GFX10-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
-; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[10:11], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn double %a, %b, !fpmath !0
   ret double %fdiv
@@ -634,33 +634,29 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v30, v4
-; GFX10-NEXT:    v_mov_b32_e32 v31, v5
-; GFX10-NEXT:    v_mov_b32_e32 v4, v6
-; GFX10-NEXT:    v_mov_b32_e32 v5, v7
-; GFX10-NEXT:    v_div_scale_f64 v[26:27], s4, v[30:31], v[30:31], v[0:1]
-; GFX10-NEXT:    v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[30:31], v[0:1]
-; GFX10-NEXT:    v_div_scale_f64 v[24:25], s4, v[4:5], v[4:5], v[2:3]
-; GFX10-NEXT:    v_div_scale_f64 v[16:17], s4, v[2:3], v[4:5], v[2:3]
-; GFX10-NEXT:    v_rcp_f64_e32 v[12:13], v[26:27]
-; GFX10-NEXT:    v_rcp_f64_e32 v[14:15], v[24:25]
-; GFX10-NEXT:    v_fma_f64 v[10:11], -v[26:27], v[12:13], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
-; GFX10-NEXT:    v_fma_f64 v[12:13], v[12:13], v[10:11], v[12:13]
-; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[26:27], v[12:13], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
-; GFX10-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
-; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
-; GFX10-NEXT:    v_mul_f64 v[18:19], v[20:21], v[6:7]
+; GFX10-NEXT:    v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1]
+; GFX10-NEXT:    v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3]
+; GFX10-NEXT:    v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1]
+; GFX10-NEXT:    v_rcp_f64_e32 v[12:13], v[8:9]
+; GFX10-NEXT:    v_rcp_f64_e32 v[14:15], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX10-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
+; GFX10-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GFX10-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX10-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
+; GFX10-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GFX10-NEXT:    v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3]
+; GFX10-NEXT:    v_mul_f64 v[18:19], v[20:21], v[12:13]
 ; GFX10-NEXT:    v_mul_f64 v[22:23], v[16:17], v[14:15]
-; GFX10-NEXT:    v_fma_f64 v[8:9], -v[26:27], v[18:19], v[20:21]
-; GFX10-NEXT:    v_fma_f64 v[16:17], -v[24:25], v[22:23], v[16:17]
-; GFX10-NEXT:    v_div_fmas_f64 v[18:19], v[8:9], v[6:7], v[18:19]
+; GFX10-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21]
+; GFX10-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17]
+; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
 ; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
-; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[16:17], v[14:15], v[22:23]
-; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[18:19], v[30:31], v[0:1]
-; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[8:9], v[4:5], v[2:3]
+; GFX10-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
+; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
+; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x double> %a, %b
   ret <2 x double> %fdiv
@@ -692,30 +688,22 @@ define <2 x double> @v_fdiv_v2f64_afn(<2 x double> %a, <2 x double> %b) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v18, v4
-; GFX10-NEXT:    v_mov_b32_e32 v19, v5
-; GFX10-NEXT:    v_mov_b32_e32 v4, v6
-; GFX10-NEXT:    v_mov_b32_e32 v5, v7
-; GFX10-NEXT:    v_mov_b32_e32 v22, v0
-; GFX10-NEXT:    v_mov_b32_e32 v23, v1
-; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[18:19]
-; GFX10-NEXT:    v_mov_b32_e32 v0, v2
-; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v1, v3
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
-; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
-; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
-; GFX10-NEXT:    v_mul_f64 v[12:13], v[22:23], v[8:9]
-; GFX10-NEXT:    v_mul_f64 v[14:15], v[0:1], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[18:19], -v[18:19], v[12:13], v[22:23]
-; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[0:1]
-; GFX10-NEXT:    v_fma_f64 v[0:1], v[18:19], v[8:9], v[12:13]
-; GFX10-NEXT:    v_fma_f64 v[2:3], v[4:5], v[10:11], v[14:15]
+; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
+; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GFX10-NEXT:    v_mul_f64 v[12:13], v[0:1], v[8:9]
+; GFX10-NEXT:    v_mul_f64 v[14:15], v[2:3], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
+; GFX10-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
+; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
+; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn <2 x double> %a, %b
   ret <2 x double> %fdiv
@@ -816,33 +804,29 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v30, v4
-; GFX10-NEXT:    v_mov_b32_e32 v31, v5
-; GFX10-NEXT:    v_mov_b32_e32 v4, v6
-; GFX10-NEXT:    v_mov_b32_e32 v5, v7
-; GFX10-NEXT:    v_div_scale_f64 v[26:27], s4, v[30:31], v[30:31], v[0:1]
-; GFX10-NEXT:    v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[30:31], v[0:1]
-; GFX10-NEXT:    v_div_scale_f64 v[24:25], s4, v[4:5], v[4:5], v[2:3]
-; GFX10-NEXT:    v_div_scale_f64 v[16:17], s4, v[2:3], v[4:5], v[2:3]
-; GFX10-NEXT:    v_rcp_f64_e32 v[12:13], v[26:27]
-; GFX10-NEXT:    v_rcp_f64_e32 v[14:15], v[24:25]
-; GFX10-NEXT:    v_fma_f64 v[10:11], -v[26:27], v[12:13], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
-; GFX10-NEXT:    v_fma_f64 v[12:13], v[12:13], v[10:11], v[12:13]
-; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[26:27], v[12:13], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
-; GFX10-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
-; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
-; GFX10-NEXT:    v_mul_f64 v[18:19], v[20:21], v[6:7]
+; GFX10-NEXT:    v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1]
+; GFX10-NEXT:    v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3]
+; GFX10-NEXT:    v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1]
+; GFX10-NEXT:    v_rcp_f64_e32 v[12:13], v[8:9]
+; GFX10-NEXT:    v_rcp_f64_e32 v[14:15], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX10-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
+; GFX10-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GFX10-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX10-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
+; GFX10-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GFX10-NEXT:    v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3]
+; GFX10-NEXT:    v_mul_f64 v[18:19], v[20:21], v[12:13]
 ; GFX10-NEXT:    v_mul_f64 v[22:23], v[16:17], v[14:15]
-; GFX10-NEXT:    v_fma_f64 v[8:9], -v[26:27], v[18:19], v[20:21]
-; GFX10-NEXT:    v_fma_f64 v[16:17], -v[24:25], v[22:23], v[16:17]
-; GFX10-NEXT:    v_div_fmas_f64 v[18:19], v[8:9], v[6:7], v[18:19]
+; GFX10-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21]
+; GFX10-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17]
+; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
 ; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
-; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[16:17], v[14:15], v[22:23]
-; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[18:19], v[30:31], v[0:1]
-; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[8:9], v[4:5], v[2:3]
+; GFX10-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
+; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
+; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x double> %a, %b, !fpmath !0
   ret <2 x double> %fdiv
@@ -943,29 +927,29 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_div_scale_f64 v[22:23], s4, v[0:1], v[0:1], 1.0
-; GFX10-NEXT:    v_div_scale_f64 v[20:21], s4, v[2:3], v[2:3], 1.0
+; GFX10-NEXT:    v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0
+; GFX10-NEXT:    v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0
 ; GFX10-NEXT:    v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
+; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
+; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX10-NEXT:    v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0
-; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[22:23]
-; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[20:21]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
-; GFX10-NEXT:    v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[6:7], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
-; GFX10-NEXT:    v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
-; GFX10-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
-; GFX10-NEXT:    v_mul_f64 v[14:15], v[16:17], v[6:7]
+; GFX10-NEXT:    v_mul_f64 v[14:15], v[16:17], v[8:9]
 ; GFX10-NEXT:    v_mul_f64 v[18:19], v[12:13], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[4:5], -v[22:23], v[14:15], v[16:17]
-; GFX10-NEXT:    v_fma_f64 v[12:13], -v[20:21], v[18:19], v[12:13]
-; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[4:5], v[6:7], v[14:15]
+; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
+; GFX10-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13]
+; GFX10-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
 ; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
-; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[12:13], v[10:11], v[18:19]
-; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
-; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[8:9], v[2:3], 1.0
+; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
+; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x double> <double 1.0, double 1.0>, %x
   ret <2 x double> %fdiv
@@ -1066,29 +1050,29 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_div_scale_f64 v[22:23], s4, v[0:1], v[0:1], 1.0
-; GFX10-NEXT:    v_div_scale_f64 v[20:21], s4, v[2:3], v[2:3], 1.0
+; GFX10-NEXT:    v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0
+; GFX10-NEXT:    v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0
 ; GFX10-NEXT:    v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
+; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
+; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX10-NEXT:    v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0
-; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[22:23]
-; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[20:21]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
-; GFX10-NEXT:    v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[6:7], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
-; GFX10-NEXT:    v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
-; GFX10-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
-; GFX10-NEXT:    v_mul_f64 v[14:15], v[16:17], v[6:7]
+; GFX10-NEXT:    v_mul_f64 v[14:15], v[16:17], v[8:9]
 ; GFX10-NEXT:    v_mul_f64 v[18:19], v[12:13], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[4:5], -v[22:23], v[14:15], v[16:17]
-; GFX10-NEXT:    v_fma_f64 v[12:13], -v[20:21], v[18:19], v[12:13]
-; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[4:5], v[6:7], v[14:15]
+; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
+; GFX10-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13]
+; GFX10-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
 ; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
-; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[12:13], v[10:11], v[18:19]
-; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
-; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[8:9], v[2:3], 1.0
+; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
+; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x double> <double 1.0, double 1.0>, %x
   ret <2 x double> %fdiv
@@ -1120,26 +1104,22 @@ define <2 x double> @v_rcp_v2f64_arcp_afn(<2 x double> %x) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v14, v0
-; GFX10-NEXT:    v_mov_b32_e32 v15, v1
-; GFX10-NEXT:    v_mov_b32_e32 v0, v2
-; GFX10-NEXT:    v_mov_b32_e32 v1, v3
-; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[14:15]
-; GFX10-NEXT:    v_rcp_f64_e32 v[6:7], v[0:1]
-; GFX10-NEXT:    v_fma_f64 v[2:3], -v[14:15], v[4:5], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[6:7], 1.0
-; GFX10-NEXT:    v_fma_f64 v[4:5], v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
-; GFX10-NEXT:    v_fma_f64 v[2:3], -v[14:15], v[4:5], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[6:7], 1.0
-; GFX10-NEXT:    v_fma_f64 v[4:5], v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
+; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
+; GFX10-NEXT:    v_rcp_f64_e32 v[6:7], v[2:3]
+; GFX10-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; GFX10-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; GFX10-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; GFX10-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; GFX10-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; GFX10-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; GFX10-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
 ; GFX10-NEXT:    v_mul_f64 v[8:9], 1.0, v[4:5]
 ; GFX10-NEXT:    v_mul_f64 v[10:11], 1.0, v[6:7]
-; GFX10-NEXT:    v_fma_f64 v[14:15], -v[14:15], v[8:9], 1.0
-; GFX10-NEXT:    v_fma_f64 v[12:13], -v[0:1], v[10:11], 1.0
-; GFX10-NEXT:    v_fma_f64 v[0:1], v[14:15], v[4:5], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[2:3], v[12:13], v[6:7], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
+; GFX10-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp afn <2 x double> <double 1.0, double 1.0>, %x
   ret <2 x double> %fdiv
@@ -1240,29 +1220,29 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_div_scale_f64 v[22:23], s4, v[0:1], v[0:1], 1.0
-; GFX10-NEXT:    v_div_scale_f64 v[20:21], s4, v[2:3], v[2:3], 1.0
+; GFX10-NEXT:    v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0
+; GFX10-NEXT:    v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0
 ; GFX10-NEXT:    v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
+; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
+; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX10-NEXT:    v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0
-; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[22:23]
-; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[20:21]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
-; GFX10-NEXT:    v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[6:7], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
-; GFX10-NEXT:    v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
-; GFX10-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
-; GFX10-NEXT:    v_mul_f64 v[14:15], v[16:17], v[6:7]
+; GFX10-NEXT:    v_mul_f64 v[14:15], v[16:17], v[8:9]
 ; GFX10-NEXT:    v_mul_f64 v[18:19], v[12:13], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[4:5], -v[22:23], v[14:15], v[16:17]
-; GFX10-NEXT:    v_fma_f64 v[12:13], -v[20:21], v[18:19], v[12:13]
-; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[4:5], v[6:7], v[14:15]
+; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
+; GFX10-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13]
+; GFX10-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
 ; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
-; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[12:13], v[10:11], v[18:19]
-; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
-; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[8:9], v[2:3], 1.0
+; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
+; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x double> <double 1.0, double 1.0>, %x, !fpmath !0
   ret <2 x double> %fdiv
@@ -1294,30 +1274,22 @@ define <2 x double> @v_fdiv_v2f64_afn_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v18, v4
-; GFX10-NEXT:    v_mov_b32_e32 v19, v5
-; GFX10-NEXT:    v_mov_b32_e32 v4, v6
-; GFX10-NEXT:    v_mov_b32_e32 v5, v7
-; GFX10-NEXT:    v_mov_b32_e32 v22, v0
-; GFX10-NEXT:    v_mov_b32_e32 v23, v1
-; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[18:19]
-; GFX10-NEXT:    v_mov_b32_e32 v0, v2
-; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v1, v3
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
-; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
-; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
-; GFX10-NEXT:    v_mul_f64 v[12:13], v[22:23], v[8:9]
-; GFX10-NEXT:    v_mul_f64 v[14:15], v[0:1], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[18:19], -v[18:19], v[12:13], v[22:23]
-; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[0:1]
-; GFX10-NEXT:    v_fma_f64 v[0:1], v[18:19], v[8:9], v[12:13]
-; GFX10-NEXT:    v_fma_f64 v[2:3], v[4:5], v[10:11], v[14:15]
+; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
+; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GFX10-NEXT:    v_mul_f64 v[12:13], v[0:1], v[8:9]
+; GFX10-NEXT:    v_mul_f64 v[14:15], v[2:3], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
+; GFX10-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
+; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
+; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn <2 x double> %a, %b, !fpmath !0
   ret <2 x double> %fdiv
@@ -1418,33 +1390,29 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v30, v4
-; GFX10-NEXT:    v_mov_b32_e32 v31, v5
-; GFX10-NEXT:    v_mov_b32_e32 v4, v6
-; GFX10-NEXT:    v_mov_b32_e32 v5, v7
-; GFX10-NEXT:    v_div_scale_f64 v[26:27], s4, v[30:31], v[30:31], v[0:1]
-; GFX10-NEXT:    v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[30:31], v[0:1]
-; GFX10-NEXT:    v_div_scale_f64 v[24:25], s4, v[4:5], v[4:5], v[2:3]
-; GFX10-NEXT:    v_div_scale_f64 v[16:17], s4, v[2:3], v[4:5], v[2:3]
-; GFX10-NEXT:    v_rcp_f64_e32 v[12:13], v[26:27]
-; GFX10-NEXT:    v_rcp_f64_e32 v[14:15], v[24:25]
-; GFX10-NEXT:    v_fma_f64 v[10:11], -v[26:27], v[12:13], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
-; GFX10-NEXT:    v_fma_f64 v[12:13], v[12:13], v[10:11], v[12:13]
-; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[26:27], v[12:13], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
-; GFX10-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
-; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
-; GFX10-NEXT:    v_mul_f64 v[18:19], v[20:21], v[6:7]
+; GFX10-NEXT:    v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1]
+; GFX10-NEXT:    v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3]
+; GFX10-NEXT:    v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1]
+; GFX10-NEXT:    v_rcp_f64_e32 v[12:13], v[8:9]
+; GFX10-NEXT:    v_rcp_f64_e32 v[14:15], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX10-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
+; GFX10-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GFX10-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX10-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
+; GFX10-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GFX10-NEXT:    v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3]
+; GFX10-NEXT:    v_mul_f64 v[18:19], v[20:21], v[12:13]
 ; GFX10-NEXT:    v_mul_f64 v[22:23], v[16:17], v[14:15]
-; GFX10-NEXT:    v_fma_f64 v[8:9], -v[26:27], v[18:19], v[20:21]
-; GFX10-NEXT:    v_fma_f64 v[16:17], -v[24:25], v[22:23], v[16:17]
-; GFX10-NEXT:    v_div_fmas_f64 v[18:19], v[8:9], v[6:7], v[18:19]
+; GFX10-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21]
+; GFX10-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17]
+; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
 ; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
-; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[16:17], v[14:15], v[22:23]
-; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[18:19], v[30:31], v[0:1]
-; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[8:9], v[4:5], v[2:3]
+; GFX10-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
+; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
+; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x double> %a, %b, !fpmath !0
   ret <2 x double> %fdiv
@@ -1476,30 +1444,22 @@ define <2 x double> @v_fdiv_v2f64_arcp_afn_ulp25(<2 x double> %a, <2 x double> %
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v18, v4
-; GFX10-NEXT:    v_mov_b32_e32 v19, v5
-; GFX10-NEXT:    v_mov_b32_e32 v4, v6
-; GFX10-NEXT:    v_mov_b32_e32 v5, v7
-; GFX10-NEXT:    v_mov_b32_e32 v22, v0
-; GFX10-NEXT:    v_mov_b32_e32 v23, v1
-; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[18:19]
-; GFX10-NEXT:    v_mov_b32_e32 v0, v2
-; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v1, v3
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
-; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
-; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
-; GFX10-NEXT:    v_mul_f64 v[12:13], v[22:23], v[8:9]
-; GFX10-NEXT:    v_mul_f64 v[14:15], v[0:1], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[18:19], -v[18:19], v[12:13], v[22:23]
-; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[0:1]
-; GFX10-NEXT:    v_fma_f64 v[0:1], v[18:19], v[8:9], v[12:13]
-; GFX10-NEXT:    v_fma_f64 v[2:3], v[4:5], v[10:11], v[14:15]
+; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
+; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GFX10-NEXT:    v_mul_f64 v[12:13], v[0:1], v[8:9]
+; GFX10-NEXT:    v_mul_f64 v[14:15], v[2:3], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
+; GFX10-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
+; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
+; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn arcp <2 x double> %a, %b, !fpmath !0
   ret <2 x double> %fdiv

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
index aad28b9b6cde..077f91302387 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
@@ -479,12 +479,8 @@ define <2 x double> @v_fma_v2f64(<2 x double> %x, <2 x double> %y, <2 x double>
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v14, v0
-; GFX10-NEXT:    v_mov_b32_e32 v15, v1
-; GFX10-NEXT:    v_mov_b32_e32 v12, v2
-; GFX10-NEXT:    v_mov_b32_e32 v13, v3
-; GFX10-NEXT:    v_fma_f64 v[0:1], v[14:15], v[4:5], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[2:3], v[12:13], v[6:7], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z)
   ret <2 x double> %fma

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index fdd450fea986..83ec29db8f8e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -1159,7 +1159,6 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10-NEXT:    v_and_b32_e32 v11, 7, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
@@ -1167,13 +1166,14 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v8
 ; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX10-NEXT:    v_mov_b32_e32 v15, 0xff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX10-NEXT:    v_mov_b32_e32 v13, 0xff
 ; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_xor_b32_e32 v13, -1, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX10-NEXT:    v_and_b32_e32 v12, s4, v1
 ; GFX10-NEXT:    v_and_b32_e32 v6, s4, v6
-; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_xor_b32_e32 v13, -1, v2
 ; GFX10-NEXT:    v_lshlrev_b16 v3, v8, v3
 ; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v9
 ; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
@@ -2190,13 +2190,13 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v7, 24
 ; GFX10-NEXT:    s_sub_i32 s4, 0, 24
-; GFX10-NEXT:    v_mov_b32_e32 v12, 0xffffff
+; GFX10-NEXT:    v_mov_b32_e32 v10, 0xffffff
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; GFX10-NEXT:    v_and_b32_e32 v5, v5, v12
-; GFX10-NEXT:    v_and_b32_e32 v2, v2, v12
-; GFX10-NEXT:    v_and_b32_e32 v3, v3, v12
+; GFX10-NEXT:    v_and_b32_e32 v5, v5, v10
+; GFX10-NEXT:    v_and_b32_e32 v2, v2, v10
+; GFX10-NEXT:    v_and_b32_e32 v3, v3, v10
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
@@ -2224,19 +2224,19 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v4
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 23, v15
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 23, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v11, v6, v12
+; GFX10-NEXT:    v_and_b32_e32 v4, v4, v10
+; GFX10-NEXT:    v_and_b32_e32 v6, v6, v10
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 23, v5
-; GFX10-NEXT:    v_and_b32_e32 v10, v5, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v11, v2
-; GFX10-NEXT:    v_and_b32_e32 v6, v7, v12
-; GFX10-NEXT:    v_and_b32_e32 v7, v15, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v6, v3
-; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v7, v2
-; GFX10-NEXT:    v_lshl_or_b32 v1, v1, v10, v3
+; GFX10-NEXT:    v_and_b32_e32 v5, v5, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, v7, v10
+; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v4, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v7, v3
+; GFX10-NEXT:    v_lshl_or_b32 v1, v1, v5, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i24> @llvm.fshl.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
   ret <2 x i24> %result
@@ -2617,13 +2617,13 @@ define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_alignbit_b32 v2, v0, v2, 1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 1, v0
-; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX10-NEXT:    v_alignbit_b32 v3, v1, v3, 1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 1, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v5, -1, v5
-; GFX10-NEXT:    v_alignbit_b32 v0, v7, v2, v4
-; GFX10-NEXT:    v_alignbit_b32 v1, v6, v3, v5
+; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
+; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt)
   ret <2 x i32> %result
@@ -2770,22 +2770,22 @@ define <4 x i32> @v_fshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_alignbit_b32 v22, v1, v5, 1
-; GFX10-NEXT:    v_alignbit_b32 v18, v0, v4, 1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 1, v0
+; GFX10-NEXT:    v_alignbit_b32 v4, v0, v4, 1
+; GFX10-NEXT:    v_alignbit_b32 v5, v1, v5, 1
+; GFX10-NEXT:    v_alignbit_b32 v6, v2, v6, 1
+; GFX10-NEXT:    v_alignbit_b32 v7, v3, v7, 1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 1, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v9
-; GFX10-NEXT:    v_alignbit_b32 v5, v2, v6, 1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 1, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v10
-; GFX10-NEXT:    v_alignbit_b32 v13, v3, v7, 1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 1, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
 ; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v11
-; GFX10-NEXT:    v_alignbit_b32 v0, v15, v18, v8
-; GFX10-NEXT:    v_alignbit_b32 v1, v19, v22, v9
-; GFX10-NEXT:    v_alignbit_b32 v2, v23, v5, v10
-; GFX10-NEXT:    v_alignbit_b32 v3, v14, v13, v11
+; GFX10-NEXT:    v_alignbit_b32 v0, v0, v4, v8
+; GFX10-NEXT:    v_alignbit_b32 v1, v1, v5, v9
+; GFX10-NEXT:    v_alignbit_b32 v2, v2, v6, v10
+; GFX10-NEXT:    v_alignbit_b32 v3, v3, v7, v11
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt)
   ret <4 x i32> %result
@@ -4176,15 +4176,15 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v5
 ; GFX10-NEXT:    s_mov_b32 s4, 0xf000f
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_e32 v11, s4, v4
-; GFX10-NEXT:    v_and_b32_e32 v15, s4, v6
-; GFX10-NEXT:    v_and_b32_e32 v19, s4, v5
-; GFX10-NEXT:    v_and_b32_e32 v6, s4, v7
+; GFX10-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX10-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX10-NEXT:    v_and_b32_e32 v7, s4, v7
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v11, v0
-; GFX10-NEXT:    v_pk_lshrrev_b16 v2, v15, v2
-; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v19, v1
-; GFX10-NEXT:    v_pk_lshrrev_b16 v3, v6, v3
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v4, v0
+; GFX10-NEXT:    v_pk_lshrrev_b16 v2, v6, v2
+; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v5, v1
+; GFX10-NEXT:    v_pk_lshrrev_b16 v3, v7, v3
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -4290,9 +4290,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_xor_b32_e32 v5, -1, v4
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT:    v_and_b32_e32 v7, 63, v4
+; GFX10-NEXT:    v_and_b32_e32 v4, 63, v4
 ; GFX10-NEXT:    v_and_b32_e32 v5, 63, v5
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -4703,18 +4703,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v10
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
 ; GFX10-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
-; GFX10-NEXT:    v_and_b32_e32 v15, 63, v8
-; GFX10-NEXT:    v_and_b32_e32 v19, 63, v9
-; GFX10-NEXT:    v_and_b32_e32 v13, 63, v11
-; GFX10-NEXT:    v_and_b32_e32 v9, 63, v10
-; GFX10-NEXT:    v_lshlrev_b64 v[11:12], v15, v[0:1]
-; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v19, v[4:5]
-; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v13, v[6:7]
-; GFX10-NEXT:    v_lshlrev_b64 v[15:16], v9, v[2:3]
-; GFX10-NEXT:    v_or_b32_e32 v0, v11, v4
-; GFX10-NEXT:    v_or_b32_e32 v1, v12, v5
-; GFX10-NEXT:    v_or_b32_e32 v2, v15, v6
-; GFX10-NEXT:    v_or_b32_e32 v3, v16, v7
+; GFX10-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX10-NEXT:    v_and_b32_e32 v9, 63, v9
+; GFX10-NEXT:    v_and_b32_e32 v10, 63, v10
+; GFX10-NEXT:    v_and_b32_e32 v11, 63, v11
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v9, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v10, v[2:3]
+; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v11, v[6:7]
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX10-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX10-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt)
   ret <2 x i64> %result
@@ -5178,16 +5178,14 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v8
 ; GFX10-NEXT:    s_movk_i32 s4, 0x7f
-; GFX10-NEXT:    v_mov_b32_e32 v27, v2
 ; GFX10-NEXT:    v_and_b32_e32 v18, s4, v8
-; GFX10-NEXT:    v_mov_b32_e32 v28, v3
 ; GFX10-NEXT:    v_and_b32_e32 v19, s4, v9
 ; GFX10-NEXT:    s_sub_i32 s4, 64, 1
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], 1, v[4:5]
 ; GFX10-NEXT:    v_lshlrev_b64 v[10:11], s4, v[6:7]
 ; GFX10-NEXT:    s_sub_i32 s4, 1, 64
 ; GFX10-NEXT:    s_cmp_lt_u32 1, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[15:16], s4, v[6:7]
+; GFX10-NEXT:    v_lshrrev_b64 v[12:13], s4, v[6:7]
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 1, 0
 ; GFX10-NEXT:    v_or_b32_e32 v8, v8, v10
@@ -5197,48 +5195,48 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX10-NEXT:    s_and_b32 s4, 1, s4
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v14, 64, v18
 ; GFX10-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v15, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v14, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v18, v[27:28]
+; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v18, v[2:3]
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v16, 64, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0, v6, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0, v7, s4
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v23, 64, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, v7, s4
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v20, 64, v18
 ; GFX10-NEXT:    v_or_b32_e32 v10, v8, v10
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v19
 ; GFX10-NEXT:    v_lshrrev_b64 v[14:15], v19, v[4:5]
-; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v16, v[21:22]
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v16, v[6:7]
 ; GFX10-NEXT:    v_lshlrev_b64 v[12:13], v18, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v23, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v20, v[0:1]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v18
 ; GFX10-NEXT:    v_or_b32_e32 v11, v9, v11
-; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v8, v[21:22]
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v8, v[6:7]
 ; GFX10-NEXT:    v_or_b32_e32 v14, v14, v16
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v19
 ; GFX10-NEXT:    v_or_b32_e32 v15, v15, v17
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v0, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v1, v11, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v19, v[21:22]
-; GFX10-NEXT:    v_cndmask_b32_e64 v23, v8, v14, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v19
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v19, v[6:7]
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v18
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v14, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v19
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v15, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v31, 0, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, 0, v12, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v13, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v23, v4, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, v10, v27, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, v1, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v28, s6
-; GFX10-NEXT:    v_or_b32_e32 v0, v31, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s6
+; GFX10-NEXT:    v_or_b32_e32 v0, v12, v4
 ; GFX10-NEXT:    v_or_b32_e32 v1, v7, v5
-; GFX10-NEXT:    v_or_b32_e32 v2, v15, v6
+; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX10-NEXT:    v_or_b32_e32 v3, v3, v8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
@@ -5473,7 +5471,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v14, 64, v13
 ; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v13, s[6:7]
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v0, s[8:9]
-; GFX10-NEXT:    v_lshlrev_b64 v[15:16], v10, s[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v10, s[0:1]
 ; GFX10-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v12
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v14, s[8:9]
@@ -5481,25 +5479,25 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX10-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 64, v13
 ; GFX10-NEXT:    v_or_b32_e32 v7, v7, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v15, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v16, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v11, v3, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v13, s[8:9]
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 0, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, 0, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v19, v8, s2, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 0, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s6, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s7, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, s2, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, s3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, s3, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
-; GFX10-NEXT:    v_or_b32_e32 v0, v11, v0
-; GFX10-NEXT:    v_or_b32_e32 v1, v15, v1
-; GFX10-NEXT:    v_or_b32_e32 v2, v19, v2
-; GFX10-NEXT:    v_or_b32_e32 v3, v6, v3
+; GFX10-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX10-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX10-NEXT:    v_or_b32_e32 v3, v7, v3
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
   %cast.result = bitcast i128 %result to <4 x float>
@@ -5756,7 +5754,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
 ; GFX10-NEXT:    s_sub_i32 s0, 1, 64
 ; GFX10-NEXT:    s_cmp_lt_u32 1, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[11:12], s0, v[2:3]
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s0, v[2:3]
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 1, 0
 ; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
@@ -5765,12 +5763,12 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX10-NEXT:    s_and_b32 s1, 1, vcc_lo
 ; GFX10-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v11, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v12, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
 ; GFX10-NEXT:    s_sub_i32 s0, 64, s4
@@ -5778,7 +5776,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
 ; GFX10-NEXT:    s_sub_i32 s0, s4, 64
 ; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[11:12], s0, v[2:3]
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s0, v[2:3]
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
@@ -5787,12 +5785,12 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX10-NEXT:    s_and_b32 s1, 1, vcc_lo
 ; GFX10-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], s4, v[2:3]
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v11, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v12, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
 ; GFX10-NEXT:    v_or_b32_e32 v0, s8, v0
@@ -6025,7 +6023,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s8, v[2:3]
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s8, v[0:1]
 ; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX10-NEXT:    v_lshlrev_b64 v[11:12], s5, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
 ; GFX10-NEXT:    s_and_b32 s5, 1, vcc_lo
 ; GFX10-NEXT:    s_and_b32 s12, 1, s6
 ; GFX10-NEXT:    s_sub_i32 s13, 1, 64
@@ -6045,10 +6043,10 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], s13
 ; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v11, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
 ; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
 ; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s12
@@ -6419,7 +6417,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_movk_i32 s4, 0x41
-; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 31, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 31, v5
 ; GFX10-NEXT:    s_sub_i32 s5, 64, s4
 ; GFX10-NEXT:    v_lshlrev_b64 v[10:11], s4, v[2:3]
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s5, v[0:1]
@@ -6431,39 +6429,39 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX10-NEXT:    s_sub_i32 s5, 64, 63
-; GFX10-NEXT:    v_or_b32_e32 v15, v9, v11
 ; GFX10-NEXT:    v_or_b32_e32 v14, v8, v10
+; GFX10-NEXT:    v_or_b32_e32 v15, v9, v11
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s5, v[6:7]
 ; GFX10-NEXT:    s_and_b32 s6, 1, vcc_lo
 ; GFX10-NEXT:    s_and_b32 s7, 1, s4
 ; GFX10-NEXT:    s_sub_i32 s4, 63, 64
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v27, 0, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, 0, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, 0, v13, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
-; GFX10-NEXT:    v_lshrrev_b64 v[23:24], s4, v[6:7]
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], s4, v[6:7]
 ; GFX10-NEXT:    s_cmp_lt_u32 63, 64
-; GFX10-NEXT:    v_or_b32_e32 v6, v19, v8
+; GFX10-NEXT:    v_or_b32_e32 v6, v16, v8
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 63, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s6, 0, s7
 ; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v23, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s4
 ; GFX10-NEXT:    s_and_b32 s5, 1, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v24, v9, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v11, v9, s4
 ; GFX10-NEXT:    s_and_b32 s4, 1, s4
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v19, v0, v2, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v0, v2, s6
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 31, v7
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v1, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v15, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v10, v3, s6
-; GFX10-NEXT:    v_or_b32_e32 v0, v11, v4
-; GFX10-NEXT:    v_or_b32_e32 v1, v27, v5
-; GFX10-NEXT:    v_or_b32_e32 v2, v19, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s6
+; GFX10-NEXT:    v_or_b32_e32 v0, v12, v4
+; GFX10-NEXT:    v_or_b32_e32 v1, v13, v5
+; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65)
   ret i128 %result
@@ -6810,21 +6808,19 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_movk_i32 s18, 0x7f
 ; GFX10-NEXT:    s_mov_b32 s19, 0
-; GFX10-NEXT:    s_mov_b32 s30, s0
 ; GFX10-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
 ; GFX10-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
 ; GFX10-NEXT:    s_sub_i32 s17, s22, 64
 ; GFX10-NEXT:    s_sub_i32 s23, 64, s22
 ; GFX10-NEXT:    s_cmp_lt_u32 s22, 64
-; GFX10-NEXT:    s_mov_b32 s31, s1
 ; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s22, 0
 ; GFX10-NEXT:    s_cselect_b32 s29, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[24:25], s[30:31], s23
+; GFX10-NEXT:    s_lshr_b64 s[24:25], s[0:1], s23
 ; GFX10-NEXT:    s_lshl_b64 s[26:27], s[2:3], s22
-; GFX10-NEXT:    s_lshl_b64 s[22:23], s[30:31], s22
+; GFX10-NEXT:    s_lshl_b64 s[22:23], s[0:1], s22
 ; GFX10-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX10-NEXT:    s_lshl_b64 s[0:1], s[30:31], s17
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
 ; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX10-NEXT:    s_cselect_b64 s[22:23], s[22:23], 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
@@ -6844,7 +6840,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
 ; GFX10-NEXT:    s_cmp_lg_u32 s30, 0
-; GFX10-NEXT:    s_cselect_b64 s[46:47], s[8:9], s[0:1]
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
 ; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX10-NEXT:    s_cselect_b64 s[8:9], s[26:27], 0
 ; GFX10-NEXT:    s_sub_i32 s26, s16, 64
@@ -6853,7 +6849,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_cselect_b32 s27, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s16, 0
 ; GFX10-NEXT:    s_cselect_b32 s30, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[10:11], s[46:47], s16
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[0:1], s16
 ; GFX10-NEXT:    s_lshl_b64 s[24:25], s[8:9], s17
 ; GFX10-NEXT:    s_lshr_b64 s[16:17], s[8:9], s16
 ; GFX10-NEXT:    s_or_b64 s[10:11], s[10:11], s[24:25]
@@ -6861,7 +6857,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
 ; GFX10-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
 ; GFX10-NEXT:    s_cmp_lg_u32 s30, 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[46:47], s[8:9]
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
 ; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
 ; GFX10-NEXT:    s_cselect_b64 s[8:9], s[16:17], 0
 ; GFX10-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
@@ -7329,8 +7325,6 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX10-NEXT:    s_and_b32 s5, 1, vcc_lo
 ; GFX10-NEXT:    s_and_b32 s4, 1, s4
-; GFX10-NEXT:    v_mov_b32_e32 v29, v2
-; GFX10-NEXT:    v_mov_b32_e32 v30, v3
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v23, 64, v27
 ; GFX10-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v16, vcc_lo
@@ -7338,20 +7332,20 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
 ; GFX10-NEXT:    v_lshrrev_b64 v[16:17], v23, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[18:19], v27, v[29:30]
+; GFX10-NEXT:    v_lshlrev_b64 v[18:19], v27, v[2:3]
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v25, 64, v28
-; GFX10-NEXT:    v_cndmask_b32_e32 v34, v21, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v35, v22, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v21, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v22, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, v10, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, v11, s4
 ; GFX10-NEXT:    v_or_b32_e32 v18, v16, v18
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v31, 64, v27
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v29, 64, v27
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v16, 64, v28
-; GFX10-NEXT:    v_lshrrev_b64 v[23:24], v28, v[34:35]
+; GFX10-NEXT:    v_lshrrev_b64 v[23:24], v28, v[8:9]
 ; GFX10-NEXT:    v_lshlrev_b64 v[25:26], v25, v[10:11]
 ; GFX10-NEXT:    v_lshlrev_b64 v[21:22], v27, v[0:1]
 ; GFX10-NEXT:    v_or_b32_e32 v19, v17, v19
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v31, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v29, v[0:1]
 ; GFX10-NEXT:    v_lshrrev_b64 v[16:17], v16, v[10:11]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v27
 ; GFX10-NEXT:    v_or_b32_e32 v23, v23, v25
@@ -7363,77 +7357,77 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v28, v[10:11]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, v16, v23, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, v17, v24, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v27
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v22, vcc_lo
-; GFX10-NEXT:    s_cmp_lt_u32 1, 64
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v16, v34, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, v10, v35, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v16, v8, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v10, v9, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v19, v3, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v23, 0, v1, s4
 ; GFX10-NEXT:    v_xor_b32_e32 v16, -1, v20
-; GFX10-NEXT:    v_or_b32_e32 v0, v21, v8
 ; GFX10-NEXT:    v_or_b32_e32 v1, v11, v9
+; GFX10-NEXT:    v_or_b32_e32 v0, v21, v8
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], 1, v[12:13]
 ; GFX10-NEXT:    v_lshlrev_b64 v[10:11], s8, v[14:15]
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v27
-; GFX10-NEXT:    v_and_b32_e32 v27, s7, v16
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    v_and_b32_e32 v25, s7, v16
+; GFX10-NEXT:    v_and_b32_e32 v24, s7, v20
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 1, 0
 ; GFX10-NEXT:    v_lshrrev_b64 v[16:17], s9, v[14:15]
-; GFX10-NEXT:    v_or_b32_e32 v11, v9, v11
 ; GFX10-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX10-NEXT:    v_or_b32_e32 v11, v9, v11
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX10-NEXT:    s_and_b32 s5, 1, vcc_lo
 ; GFX10-NEXT:    s_and_b32 s4, 1, s4
-; GFX10-NEXT:    v_and_b32_e32 v24, s7, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s6
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], 1, v[14:15]
-; GFX10-NEXT:    v_cndmask_b32_e64 v22, v19, v30, s6
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v16, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v31, v17, v11, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v18, 64, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v16, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v29, s6
-; GFX10-NEXT:    v_sub_nc_u32_e32 v18, 64, v24
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v18, v[4:5]
 ; GFX10-NEXT:    v_lshlrev_b64 v[14:15], v24, v[6:7]
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v31, v13, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v19, v12, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v20, 64, v25
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v17, v13, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, v9, s4
-; GFX10-NEXT:    v_sub_nc_u32_e32 v31, 64, v27
-; GFX10-NEXT:    v_lshrrev_b64 v[35:36], v18, v[4:5]
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 64, v24
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v27
 ; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v24, v[4:5]
-; GFX10-NEXT:    v_lshrrev_b64 v[18:19], v27, v[12:13]
-; GFX10-NEXT:    v_lshlrev_b64 v[20:21], v31, v[8:9]
+; GFX10-NEXT:    v_or_b32_e32 v14, v10, v14
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v25
+; GFX10-NEXT:    v_lshrrev_b64 v[18:19], v25, v[12:13]
+; GFX10-NEXT:    v_lshlrev_b64 v[20:21], v20, v[8:9]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v24
 ; GFX10-NEXT:    v_lshlrev_b64 v[3:4], v3, v[4:5]
-; GFX10-NEXT:    v_or_b32_e32 v5, v36, v15
-; GFX10-NEXT:    v_or_b32_e32 v14, v35, v14
+; GFX10-NEXT:    v_or_b32_e32 v5, v11, v15
 ; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v10, v[8:9]
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v25
 ; GFX10-NEXT:    v_cndmask_b32_e32 v15, 0, v16, vcc_lo
 ; GFX10-NEXT:    v_or_b32_e32 v16, v18, v20
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v27
 ; GFX10-NEXT:    v_or_b32_e32 v18, v19, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v31, v3, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v3, v14, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b64 v[3:4], v27, v[8:9]
+; GFX10-NEXT:    v_lshrrev_b64 v[3:4], v25, v[8:9]
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v24
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v11, v18, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v27
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v25
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v16, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0, v17, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, v5, v7, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, v4, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v19, v31, v6, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v10, v12, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, v3, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s5
 ; GFX10-NEXT:    v_or_b32_e32 v3, v22, v23
-; GFX10-NEXT:    v_or_b32_e32 v7, v14, v11
+; GFX10-NEXT:    v_or_b32_e32 v7, v7, v11
 ; GFX10-NEXT:    v_or_b32_e32 v4, v15, v5
-; GFX10-NEXT:    v_or_b32_e32 v6, v19, v10
+; GFX10-NEXT:    v_or_b32_e32 v6, v6, v10
 ; GFX10-NEXT:    v_or_b32_e32 v5, v9, v8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index d9abd3550960..35d17d88615a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -260,9 +260,9 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7f
 ; GFX10-NEXT:    v_sub_nc_u16 v4, 6, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, v2, v3
-; GFX10-NEXT:    v_and_b32_e32 v7, v4, v3
+; GFX10-NEXT:    v_and_b32_e32 v4, v4, v3
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
-; GFX10-NEXT:    v_lshlrev_b16 v0, v7, v0
+; GFX10-NEXT:    v_lshlrev_b16 v0, v4, v0
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt)
@@ -1158,38 +1158,38 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v6
+; GFX10-NEXT:    v_lshlrev_b16 v3, 1, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX10-NEXT:    v_and_b32_e32 v15, 7, v8
+; GFX10-NEXT:    v_xor_b32_e32 v14, -1, v12
+; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT:    v_and_b32_e32 v14, 7, v11
-; GFX10-NEXT:    v_lshlrev_b16 v3, 1, v3
+; GFX10-NEXT:    v_mov_b32_e32 v13, 0xff
+; GFX10-NEXT:    v_lshlrev_b16 v3, v11, v3
 ; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
-; GFX10-NEXT:    v_lshlrev_b16 v0, v15, v0
-; GFX10-NEXT:    v_mov_b32_e32 v15, 0xff
-; GFX10-NEXT:    v_lshlrev_b16 v3, v14, v3
-; GFX10-NEXT:    v_xor_b32_e32 v14, -1, v12
 ; GFX10-NEXT:    s_movk_i32 s4, 0xff
+; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
 ; GFX10-NEXT:    v_and_b32_e32 v8, s4, v1
-; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
 ; GFX10-NEXT:    v_and_b32_e32 v6, 7, v6
 ; GFX10-NEXT:    v_and_b32_e32 v7, s4, v7
 ; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
 ; GFX10-NEXT:    v_lshlrev_b16 v4, 1, v4
-; GFX10-NEXT:    v_and_b32_e32 v15, 7, v14
+; GFX10-NEXT:    v_and_b32_e32 v13, 7, v14
 ; GFX10-NEXT:    v_lshlrev_b16 v5, 1, v5
 ; GFX10-NEXT:    v_and_b32_e32 v12, 7, v12
 ; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX10-NEXT:    v_lshrrev_b16 v6, v6, v7
 ; GFX10-NEXT:    v_lshlrev_b16 v4, v11, v4
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v10, v1
-; GFX10-NEXT:    v_lshlrev_b16 v5, v15, v5
+; GFX10-NEXT:    v_lshlrev_b16 v5, v13, v5
 ; GFX10-NEXT:    v_lshrrev_b16 v7, v12, v9
 ; GFX10-NEXT:    v_lshrrev_b16 v2, v2, v8
 ; GFX10-NEXT:    v_or_b32_e32 v3, v3, v6
@@ -2190,14 +2190,14 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v7, 24
 ; GFX10-NEXT:    s_sub_i32 s4, 0, 24
-; GFX10-NEXT:    v_mov_b32_e32 v12, 0xffffff
+; GFX10-NEXT:    v_mov_b32_e32 v10, 0xffffff
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_and_b32_e32 v5, v5, v12
-; GFX10-NEXT:    v_and_b32_e32 v2, v2, v12
-; GFX10-NEXT:    v_and_b32_e32 v3, v3, v12
+; GFX10-NEXT:    v_and_b32_e32 v5, v5, v10
+; GFX10-NEXT:    v_and_b32_e32 v2, v2, v10
+; GFX10-NEXT:    v_and_b32_e32 v3, v3, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
@@ -2224,18 +2224,18 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v4
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
-; GFX10-NEXT:    v_and_b32_e32 v4, v11, v12
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 23, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 23, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
+; GFX10-NEXT:    v_and_b32_e32 v4, v4, v10
+; GFX10-NEXT:    v_and_b32_e32 v6, v6, v10
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 23, v5
-; GFX10-NEXT:    v_and_b32_e32 v5, v5, v12
-; GFX10-NEXT:    v_and_b32_e32 v11, v6, v12
-; GFX10-NEXT:    v_and_b32_e32 v4, v7, v12
+; GFX10-NEXT:    v_and_b32_e32 v5, v5, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
+; GFX10-NEXT:    v_and_b32_e32 v4, v7, v10
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v3
-; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v11, v2
+; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v6, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v1, v4, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
@@ -4424,9 +4424,9 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX10-NEXT:    v_xor_b32_e32 v5, -1, v4
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX10-NEXT:    v_and_b32_e32 v4, 63, v4
-; GFX10-NEXT:    v_and_b32_e32 v7, 63, v5
+; GFX10-NEXT:    v_and_b32_e32 v5, 63, v5
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -4833,18 +4833,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v10
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT:    v_and_b32_e32 v19, 63, v8
-; GFX10-NEXT:    v_and_b32_e32 v15, 63, v9
-; GFX10-NEXT:    v_and_b32_e32 v9, 63, v11
-; GFX10-NEXT:    v_and_b32_e32 v13, 63, v10
-; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v19, v[4:5]
-; GFX10-NEXT:    v_lshlrev_b64 v[11:12], v15, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[15:16], v9, v[2:3]
-; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v13, v[6:7]
-; GFX10-NEXT:    v_or_b32_e32 v0, v11, v4
-; GFX10-NEXT:    v_or_b32_e32 v1, v12, v5
-; GFX10-NEXT:    v_or_b32_e32 v2, v15, v6
-; GFX10-NEXT:    v_or_b32_e32 v3, v16, v7
+; GFX10-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX10-NEXT:    v_and_b32_e32 v9, 63, v9
+; GFX10-NEXT:    v_and_b32_e32 v11, 63, v11
+; GFX10-NEXT:    v_and_b32_e32 v10, 63, v10
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
+; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v10, v[6:7]
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX10-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX10-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt)
   ret <2 x i64> %result
@@ -5317,46 +5317,44 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s4
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s6
 ; GFX10-NEXT:    v_and_b32_e32 v19, s5, v15
-; GFX10-NEXT:    v_and_b32_e32 v20, s5, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0, v13, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0, v14, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v20, s5, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s4
-; GFX10-NEXT:    v_sub_nc_u32_e32 v11, 64, v19
-; GFX10-NEXT:    v_sub_nc_u32_e32 v17, 64, v20
-; GFX10-NEXT:    v_mov_b32_e32 v25, v4
-; GFX10-NEXT:    v_mov_b32_e32 v26, v5
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v19
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v19
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v11, v[9:10]
-; GFX10-NEXT:    v_lshlrev_b64 v[11:12], v19, v[0:1]
+; GFX10-NEXT:    v_sub_nc_u32_e32 v17, 64, v20
 ; GFX10-NEXT:    v_lshlrev_b64 v[13:14], v19, v[9:10]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v21, 64, v20
+; GFX10-NEXT:    v_lshlrev_b64 v[11:12], v19, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, v[9:10]
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v8, v[9:10]
-; GFX10-NEXT:    v_lshrrev_b64 v[15:16], v20, v[25:26]
+; GFX10-NEXT:    v_lshrrev_b64 v[15:16], v20, v[4:5]
 ; GFX10-NEXT:    v_lshlrev_b64 v[17:18], v17, v[6:7]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v19
-; GFX10-NEXT:    v_or_b32_e32 v10, v3, v12
-; GFX10-NEXT:    v_or_b32_e32 v11, v2, v11
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v21, v[6:7]
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, 0, v13, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v11, v2, v11
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 64, v20
+; GFX10-NEXT:    v_or_b32_e32 v10, v3, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, 0, v13, vcc_lo
 ; GFX10-NEXT:    v_or_b32_e32 v13, v15, v17
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v9, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, v[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc_lo
 ; GFX10-NEXT:    v_or_b32_e32 v10, v16, v18
 ; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v20, v[6:7]
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v8, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v20
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v13, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v20
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, v15, v1, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v11, v0, s6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0, v14, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v25, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, v26, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, v5, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s4
-; GFX10-NEXT:    v_or_b32_e32 v0, v23, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, v12, v0
 ; GFX10-NEXT:    v_or_b32_e32 v1, v10, v1
 ; GFX10-NEXT:    v_or_b32_e32 v2, v8, v2
 ; GFX10-NEXT:    v_or_b32_e32 v3, v9, v3
@@ -5591,31 +5589,31 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 64, v12
 ; GFX10-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, 64, v12
-; GFX10-NEXT:    v_lshlrev_b64 v[15:16], v10, s[10:11]
+; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v10, s[10:11]
 ; GFX10-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX10-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX10-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v0, s[6:7]
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v13, s[10:11]
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v15, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v16, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v11, v3, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v12, s[6:7]
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, 0, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v19, v8, s8, s2
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, s8, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, s9, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, s9, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
-; GFX10-NEXT:    v_or_b32_e32 v0, v11, v0
-; GFX10-NEXT:    v_or_b32_e32 v1, v15, v1
-; GFX10-NEXT:    v_or_b32_e32 v2, v19, v2
-; GFX10-NEXT:    v_or_b32_e32 v3, v6, v3
+; GFX10-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX10-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX10-NEXT:    v_or_b32_e32 v3, v7, v3
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
   %cast.result = bitcast i128 %result to <4 x float>
@@ -5870,7 +5868,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
 ; GFX10-NEXT:    s_sub_i32 s0, s8, 64
 ; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[11:12], s0, v[2:3]
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s0, v[2:3]
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
@@ -5879,12 +5877,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX10-NEXT:    s_and_b32 s1, 1, vcc_lo
 ; GFX10-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], s8, v[2:3]
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v11, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v12, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
 ; GFX10-NEXT:    v_or_b32_e32 v0, s4, v0
@@ -6128,10 +6126,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10-NEXT:    s_sub_i32 s5, 1, 64
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
 ; GFX10-NEXT:    s_cmp_lt_u32 1, 64
-; GFX10-NEXT:    v_lshlrev_b64 v[13:14], s5, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[8:9], 1, v[0:1]
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 1, 0
-; GFX10-NEXT:    v_lshlrev_b64 v[8:9], 1, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
 ; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX10-NEXT:    s_and_b32 s4, 1, vcc_lo
 ; GFX10-NEXT:    v_or_b32_e32 v6, v4, v6
@@ -6139,8 +6137,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX10-NEXT:    s_and_b32 s5, 1, s7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v13, v6, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v14, v5, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s4
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc_lo
 ; GFX10-NEXT:    s_sub_i32 s5, s6, 64
@@ -6148,7 +6146,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s4
 ; GFX10-NEXT:    s_sub_i32 s4, 64, s6
 ; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[11:12], s4, v[4:5]
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], s4, v[4:5]
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s6, v[0:1]
 ; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
@@ -6156,8 +6154,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX10-NEXT:    s_and_b32 s4, 1, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], s5, v[4:5]
-; GFX10-NEXT:    v_or_b32_e32 v2, v11, v6
-; GFX10-NEXT:    v_or_b32_e32 v3, v12, v7
+; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX10-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s4
 ; GFX10-NEXT:    s_sub_i32 s10, s8, 64
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc_lo
@@ -6538,22 +6536,22 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
 ; GFX10-NEXT:    s_cmp_eq_u32 63, 0
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s4, v[0:1]
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX10-NEXT:    v_lshlrev_b64 v[14:15], s5, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
 ; GFX10-NEXT:    s_and_b32 s5, 1, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 31, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, 0, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, 0, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
 ; GFX10-NEXT:    s_movk_i32 s6, 0x41
 ; GFX10-NEXT:    s_and_b32 s4, 1, s4
 ; GFX10-NEXT:    s_sub_i32 s5, 64, s6
 ; GFX10-NEXT:    v_or_b32_e32 v12, v9, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v14, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v0, v8, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b64 v[10:11], s5, v[6:7]
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s6, v[4:5]
 ; GFX10-NEXT:    s_sub_i32 s5, s6, 64
 ; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v15, v12, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b64 v[15:16], s5, v[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v1, v12, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], s5, v[6:7]
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX10-NEXT:    v_or_b32_e32 v8, v8, v10
@@ -6563,17 +6561,17 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
 ; GFX10-NEXT:    s_and_b32 s5, 1, s5
 ; GFX10-NEXT:    s_and_b32 s6, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v15, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, s5
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v19, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v12, v3, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v15, v5, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v11, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0, v7, vcc_lo
-; GFX10-NEXT:    v_or_b32_e32 v1, v23, v1
+; GFX10-NEXT:    v_or_b32_e32 v1, v13, v1
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX10-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -6921,10 +6919,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
 ; GFX10-NEXT:    s_sub_i32 s31, 64, 1
 ; GFX10-NEXT:    s_cmp_lt_u32 1, 64
-; GFX10-NEXT:    s_mov_b32 s62, s10
 ; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 1, 0
-; GFX10-NEXT:    s_mov_b32 s63, s11
 ; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[24:25], s[0:1], s31
 ; GFX10-NEXT:    s_lshl_b64 s[26:27], s[2:3], 1
@@ -6935,23 +6931,23 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_cselect_b64 s[26:27], s[28:29], 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
 ; GFX10-NEXT:    s_cmp_lg_u32 s23, 0
-; GFX10-NEXT:    s_cselect_b64 s[46:47], s[2:3], s[0:1]
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
 ; GFX10-NEXT:    s_sub_i32 s23, s16, 64
 ; GFX10-NEXT:    s_sub_i32 s2, 64, s16
 ; GFX10-NEXT:    s_cmp_lt_u32 s16, 64
 ; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s16, 0
 ; GFX10-NEXT:    s_cselect_b32 s29, 1, 0
-; GFX10-NEXT:    s_lshl_b64 s[24:25], s[46:47], s16
+; GFX10-NEXT:    s_lshl_b64 s[24:25], s[0:1], s16
 ; GFX10-NEXT:    s_lshr_b64 s[2:3], s[26:27], s2
 ; GFX10-NEXT:    s_lshl_b64 s[16:17], s[26:27], s16
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[24:25]
 ; GFX10-NEXT:    s_lshl_b64 s[24:25], s[26:27], s23
 ; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX10-NEXT:    s_cselect_b64 s[78:79], s[16:17], 0
+; GFX10-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[24:25]
 ; GFX10-NEXT:    s_cmp_lg_u32 s29, 0
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[46:47], s[2:3]
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
 ; GFX10-NEXT:    s_sub_i32 s26, s22, 64
 ; GFX10-NEXT:    s_sub_i32 s23, 64, s22
 ; GFX10-NEXT:    s_cmp_lt_u32 s22, 64
@@ -6959,17 +6955,17 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_cmp_eq_u32 s22, 0
 ; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[8:9], s22
-; GFX10-NEXT:    s_lshl_b64 s[24:25], s[62:63], s23
-; GFX10-NEXT:    s_lshr_b64 s[22:23], s[62:63], s22
+; GFX10-NEXT:    s_lshl_b64 s[24:25], s[10:11], s23
+; GFX10-NEXT:    s_lshr_b64 s[22:23], s[10:11], s22
 ; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[24:25]
-; GFX10-NEXT:    s_lshr_b64 s[10:11], s[62:63], s26
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[10:11], s26
 ; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
 ; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
 ; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
 ; GFX10-NEXT:    s_cselect_b64 s[8:9], s[22:23], 0
-; GFX10-NEXT:    s_or_b64 s[0:1], s[78:79], s[0:1]
+; GFX10-NEXT:    s_or_b64 s[0:1], s[16:17], s[0:1]
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX10-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
 ; GFX10-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
@@ -7413,7 +7409,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    s_sub_i32 s5, 64, 1
 ; GFX10-NEXT:    s_sub_i32 s6, 1, 64
 ; GFX10-NEXT:    s_cmp_lt_u32 1, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[27:28], s5, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[17:18], s5, v[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[21:22], 1, v[2:3]
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 1, 0
@@ -7421,117 +7417,115 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    s_and_b32 s4, 1, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s6, v[0:1]
-; GFX10-NEXT:    v_xor_b32_e32 v19, -1, v16
-; GFX10-NEXT:    v_or_b32_e32 v21, v27, v21
-; GFX10-NEXT:    v_or_b32_e32 v18, v28, v22
+; GFX10-NEXT:    v_or_b32_e32 v21, v17, v21
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s4
+; GFX10-NEXT:    v_or_b32_e32 v18, v18, v22
+; GFX10-NEXT:    v_xor_b32_e32 v19, -1, v16
 ; GFX10-NEXT:    s_movk_i32 s7, 0x7f
 ; GFX10-NEXT:    s_and_b32 s8, 1, s8
-; GFX10-NEXT:    v_and_b32_e32 v31, s7, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, 0, v23, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v18, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v21, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v18, s4
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s8
+; GFX10-NEXT:    v_and_b32_e32 v25, s7, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, 0, v23, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v18, 0, v24, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v19, 64, v31
 ; GFX10-NEXT:    v_and_b32_e32 v26, s7, v16
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v16, 64, v31
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s4
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v19, v[17:18]
-; GFX10-NEXT:    v_mov_b32_e32 v35, v10
-; GFX10-NEXT:    v_mov_b32_e32 v36, v11
-; GFX10-NEXT:    v_sub_nc_u32_e32 v25, 64, v26
-; GFX10-NEXT:    v_lshlrev_b64 v[21:22], v31, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[23:24], v31, v[17:18]
-; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v31
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v29, 64, v26
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v25
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v16, 64, v25
+; GFX10-NEXT:    v_sub_nc_u32_e32 v19, 64, v26
+; GFX10-NEXT:    v_lshlrev_b64 v[23:24], v25, v[17:18]
+; GFX10-NEXT:    v_lshlrev_b64 v[21:22], v25, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, v[17:18]
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v25
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v27, 64, v26
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v26
-; GFX10-NEXT:    v_lshrrev_b64 v[27:28], s5, v[4:5]
+; GFX10-NEXT:    s_cmp_lt_u32 1, 64
 ; GFX10-NEXT:    v_or_b32_e32 v21, v2, v21
 ; GFX10-NEXT:    v_or_b32_e32 v22, v3, v22
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v16, v[17:18]
-; GFX10-NEXT:    v_lshlrev_b64 v[18:19], v25, v[35:36]
 ; GFX10-NEXT:    v_lshrrev_b64 v[16:17], v26, v[8:9]
+; GFX10-NEXT:    v_lshlrev_b64 v[18:19], v19, v[10:11]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v23, 0, v23, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v39, 0, v24, vcc_lo
-; GFX10-NEXT:    s_cmp_lt_u32 1, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, 0, v24, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v21, v2, v21, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v22, v3, v22, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v29, v[35:36]
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v31
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v27, v[10:11]
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v25
 ; GFX10-NEXT:    v_or_b32_e32 v16, v16, v18
 ; GFX10-NEXT:    v_or_b32_e32 v17, v17, v19
-; GFX10-NEXT:    v_lshlrev_b64 v[10:11], 1, v[6:7]
-; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    v_xor_b32_e32 v25, -1, v20
 ; GFX10-NEXT:    v_cndmask_b32_e32 v18, v21, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v31, v22, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v16, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v26
-; GFX10-NEXT:    s_cmp_eq_u32 1, 0
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v26, v[10:11]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v19, v3, v17, s4
-; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10-NEXT:    v_lshlrev_b64 v[10:11], 1, v[6:7]
 ; GFX10-NEXT:    v_lshlrev_b64 v[16:17], 1, v[4:5]
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v2, v8, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], s5, v[4:5]
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 1, 0
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], s6, v[4:5]
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    s_and_b32 s6, 1, s5
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v2, v8, vcc_lo
-; GFX10-NEXT:    v_xor_b32_e32 v25, -1, v20
-; GFX10-NEXT:    v_or_b32_e32 v2, v27, v10
-; GFX10-NEXT:    v_or_b32_e32 v3, v28, v11
+; GFX10-NEXT:    v_or_b32_e32 v2, v2, v10
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s6, 0, s6
+; GFX10-NEXT:    v_or_b32_e32 v3, v3, v11
 ; GFX10-NEXT:    s_and_b32 s8, 1, s8
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v26, v[35:36]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, v16, s5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v16, v19, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v4, v2, s6
-; GFX10-NEXT:    v_and_b32_e32 v30, s7, v25
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s6
+; GFX10-NEXT:    v_and_b32_e32 v25, s7, v25
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, v3, s6
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, v17, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0, v0, s4
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v30
 ; GFX10-NEXT:    v_or_b32_e32 v0, v23, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v11, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v23, s7, v20
-; GFX10-NEXT:    v_lshrrev_b64 v[5:6], v2, v[8:9]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, 64, v30
-; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, v1, s4
-; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v30, v[3:4]
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v25
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, 64, v25
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v20, 64, v23
-; GFX10-NEXT:    v_or_b32_e32 v1, v39, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, v1, s4
+; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v25, v[3:4]
+; GFX10-NEXT:    v_lshrrev_b64 v[5:6], v2, v[8:9]
+; GFX10-NEXT:    v_or_b32_e32 v1, v24, v16
 ; GFX10-NEXT:    v_or_b32_e32 v2, v18, v19
-; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v30, v[8:9]
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v25, v[8:9]
 ; GFX10-NEXT:    v_lshrrev_b64 v[18:19], v23, v[12:13]
+; GFX10-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
 ; GFX10-NEXT:    v_or_b32_e32 v10, v5, v10
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, 64, v23
-; GFX10-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
-; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v30
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v25
 ; GFX10-NEXT:    v_lshlrev_b64 v[7:8], v7, v[8:9]
 ; GFX10-NEXT:    v_or_b32_e32 v9, v6, v11
-; GFX10-NEXT:    v_lshrrev_b64 v[34:35], v5, v[14:15]
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v23
+; GFX10-NEXT:    v_lshrrev_b64 v[5:6], v5, v[14:15]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v16, vcc_lo
 ; GFX10-NEXT:    v_or_b32_e32 v16, v18, v20
 ; GFX10-NEXT:    v_or_b32_e32 v18, v19, v21
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v7, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b64 v[7:8], v23, v[14:15]
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v30
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v34, v16, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v25
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v16, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v23
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v35, v18, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v18, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0, v17, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, v10, v3, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v3, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v4, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, v12, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, v13, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, v7, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, v8, s4
-; GFX10-NEXT:    v_or_b32_e32 v3, v31, v26
+; GFX10-NEXT:    v_or_b32_e32 v3, v22, v26
 ; GFX10-NEXT:    v_or_b32_e32 v4, v11, v4
 ; GFX10-NEXT:    v_or_b32_e32 v5, v14, v5
-; GFX10-NEXT:    v_or_b32_e32 v6, v15, v6
+; GFX10-NEXT:    v_or_b32_e32 v6, v10, v6
 ; GFX10-NEXT:    v_or_b32_e32 v7, v9, v7
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index 9502d23b4f8f..b4b0037ab677 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -2235,8 +2235,8 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v2, v5, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v11, v6, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s1
 ; GFX10-NEXT:    v_and_or_b32 v7, v2, v7, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s0
@@ -2482,8 +2482,8 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v8
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, v3, v6, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v10, v7, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
 ; GFX10-NEXT:    v_and_or_b32 v3, v3, v2, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
@@ -2902,21 +2902,21 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    s_and_b32 s9, s2, s8
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s7, 7
 ; GFX10-NEXT:    s_and_b32 s3, s3, 1
-; GFX10-NEXT:    v_mov_b32_e32 v13, 0
+; GFX10-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX10-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX10-NEXT:    s_lshl_b32 s8, s8, s3
 ; GFX10-NEXT:    s_lshl_b32 s3, s9, s3
 ; GFX10-NEXT:    s_not_b32 s8, s8
+; GFX10-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v2, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v11, v4, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v0, v7, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v11, v8, s6
-; GFX10-NEXT:    v_mov_b32_e32 v10, 0
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s2
 ; GFX10-NEXT:    v_and_or_b32 v12, v0, s8, s3
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, s7, 0
@@ -3822,19 +3822,19 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 5, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 6, v0
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v3, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v11, v5, s0
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v11, v2, s5
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v2, s6
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s1
 ; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v11
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s3
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, v1, v8, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v15, v9, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s5
 ; GFX10-NEXT:    v_and_or_b32 v13, v1, v11, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v11, 0
@@ -4020,16 +4020,16 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    s_lshl_b32 s7, s8, s7
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, s6, 0
 ; GFX10-NEXT:    s_not_b32 s7, s7
+; GFX10-NEXT:    v_mov_b32_e32 v11, 0
+; GFX10-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v0, v7, s3
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v11, v8, s4
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0
-; GFX10-NEXT:    v_mov_b32_e32 v12, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s3
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v10, s5
 ; GFX10-NEXT:    v_and_or_b32 v13, v0, s7, v1
@@ -4201,6 +4201,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
 ; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX10-NEXT:    s_mov_b32 s4, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v15, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v0
@@ -4220,9 +4221,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, v1, v8, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v15, v9, s3
-; GFX10-NEXT:    v_mov_b32_e32 v15, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v11, s5
 ; GFX10-NEXT:    v_and_or_b32 v14, v1, v3, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index 7ac27aff1eb9..adf7a49ae0c7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -1638,11 +1638,11 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    s_movk_i32 s0, 0xff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX10-NEXT:    v_and_b32_sdwa v4, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_or_b32 v6, v0, s0, v1
+; GFX10-NEXT:    v_and_or_b32 v0, v0, s0, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    s_lshl_b32 s1, s0, s1
 ; GFX10-NEXT:    s_not_b32 s1, s1
-; GFX10-NEXT:    v_or3_b32 v0, v6, v4, v3
+; GFX10-NEXT:    v_or3_b32 v0, v0, v4, v3
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
@@ -1794,9 +1794,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_and_or_b32 v3, v0, s1, v3
-; GFX10-NEXT:    v_or3_b32 v0, v3, v6, v4
+; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0xff
+; GFX10-NEXT:    v_or3_b32 v0, v0, v6, v4
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
@@ -1804,10 +1804,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_and_or_b32 v3, v0, v3, v1
+; GFX10-NEXT:    v_and_or_b32 v0, v0, v3, v1
+; GFX10-NEXT:    v_or3_b32 v2, v0, v2, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_or3_b32 v2, v3, v2, v4
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
@@ -2324,13 +2324,13 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_and_b32_sdwa v7, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX10-NEXT:    v_and_or_b32 v11, v0, s1, v2
-; GFX10-NEXT:    v_and_or_b32 v2, v1, s1, v3
+; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v2
+; GFX10-NEXT:    v_and_or_b32 v1, v1, s1, v3
 ; GFX10-NEXT:    s_lshr_b32 s0, s3, 2
 ; GFX10-NEXT:    s_and_b32 s3, s3, 3
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX10-NEXT:    v_or3_b32 v0, v11, v6, v4
-; GFX10-NEXT:    v_or3_b32 v1, v2, v7, v5
+; GFX10-NEXT:    v_or3_b32 v0, v0, v6, v4
+; GFX10-NEXT:    v_or3_b32 v1, v1, v7, v5
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 3
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s0, 0
 ; GFX10-NEXT:    s_lshl_b32 s4, s1, s3
@@ -2629,12 +2629,12 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX10-NEXT:    v_and_or_b32 v11, v0, s2, v2
-; GFX10-NEXT:    v_and_or_b32 v2, v1, s2, v3
-; GFX10-NEXT:    v_or3_b32 v0, v11, v6, v4
-; GFX10-NEXT:    v_or3_b32 v1, v2, v7, v5
+; GFX10-NEXT:    v_and_or_b32 v0, v0, s2, v2
+; GFX10-NEXT:    v_and_or_b32 v1, v1, s2, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_or3_b32 v0, v0, v6, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_or3_b32 v1, v1, v7, v5
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
@@ -2905,20 +2905,20 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX10-NEXT:    v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX10-NEXT:    v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX10-NEXT:    v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_and_or_b32 v11, v0, s2, v2
-; GFX10-NEXT:    v_and_or_b32 v2, v1, s2, v3
-; GFX10-NEXT:    v_or3_b32 v0, v11, v6, v4
-; GFX10-NEXT:    v_or3_b32 v1, v2, v7, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX10-NEXT:    v_and_or_b32 v0, v0, s2, v2
+; GFX10-NEXT:    v_and_or_b32 v1, v1, s2, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_or3_b32 v0, v0, v6, v4
+; GFX10-NEXT:    v_or3_b32 v1, v1, v7, v5
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
@@ -3186,20 +3186,20 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX10-NEXT:    v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX10-NEXT:    v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX10-NEXT:    v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_and_or_b32 v11, v0, s2, v2
-; GFX10-NEXT:    v_and_or_b32 v2, v1, s2, v3
-; GFX10-NEXT:    v_or3_b32 v0, v11, v6, v4
-; GFX10-NEXT:    v_or3_b32 v1, v2, v7, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX10-NEXT:    v_and_or_b32 v0, v0, s2, v2
+; GFX10-NEXT:    v_and_or_b32 v1, v1, s2, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_or3_b32 v0, v0, v6, v4
+; GFX10-NEXT:    v_or3_b32 v1, v1, v7, v5
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
@@ -3397,7 +3397,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ;
 ; GFX10-LABEL: insertelement_v_v8i8_s_v:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_load_dwordx2 v[11:12], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    v_and_b32_e32 v3, 3, v2
 ; GFX10-NEXT:    s_movk_i32 s1, 0xff
@@ -3405,22 +3405,22 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v12
-; GFX10-NEXT:    v_and_b32_sdwa v8, v11, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX10-NEXT:    v_and_b32_sdwa v8, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_b32_sdwa v9, v12, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v9, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-NEXT:    v_and_or_b32 v11, v11, s1, v4
-; GFX10-NEXT:    v_and_or_b32 v10, v12, s1, v5
+; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v4
+; GFX10-NEXT:    v_and_or_b32 v1, v1, s1, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, s1
 ; GFX10-NEXT:    s_and_b32 s0, s2, s1
-; GFX10-NEXT:    v_or3_b32 v0, v11, v8, v6
-; GFX10-NEXT:    v_or3_b32 v1, v10, v9, v7
+; GFX10-NEXT:    v_or3_b32 v0, v0, v8, v6
+; GFX10-NEXT:    v_or3_b32 v1, v1, v9, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
 ; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v2
@@ -3906,34 +3906,34 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_and_b32_sdwa v11, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX10-NEXT:    v_and_or_b32 v15, v0, s1, v6
-; GFX10-NEXT:    v_and_or_b32 v14, v1, s1, v7
+; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v6
+; GFX10-NEXT:    v_and_or_b32 v1, v1, s1, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, v4, v5
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v3
-; GFX10-NEXT:    v_or3_b32 v0, v15, v10, v8
-; GFX10-NEXT:    v_or3_b32 v1, v14, v11, v9
+; GFX10-NEXT:    v_or3_b32 v0, v0, v10, v8
+; GFX10-NEXT:    v_or3_b32 v1, v1, v11, v9
 ; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v2, v7, v4, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc_lo
+; GFX10-NEXT:    v_and_or_b32 v2, v6, v4, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v1, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX10-NEXT:    v_and_b32_sdwa v8, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_and_b32_sdwa v4, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT:    v_and_b32_sdwa v4, v11, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-NEXT:    v_and_or_b32 v3, v0, v5, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v11, v5, v2
-; GFX10-NEXT:    v_or3_b32 v0, v3, v8, v6
+; GFX10-NEXT:    v_and_or_b32 v0, v0, v5, v3
+; GFX10-NEXT:    v_and_or_b32 v1, v1, v5, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_or3_b32 v1, v1, v4, v7
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_or3_b32 v0, v0, v8, v6
+; GFX10-NEXT:    v_or3_b32 v1, v1, v4, v7
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
@@ -4820,60 +4820,60 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX10-NEXT:    v_and_or_b32 v1, v1, s4, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_b32_sdwa v15, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v10
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v13, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v10
 ; GFX10-NEXT:    v_and_or_b32 v2, v2, s4, v9
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_or3_b32 v7, v1, v14, v8
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_or3_b32 v1, v1, v14, v8
 ; GFX10-NEXT:    v_and_b32_sdwa v16, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v12
 ; GFX10-NEXT:    v_or3_b32 v2, v2, v15, v5
-; GFX10-NEXT:    v_and_or_b32 v3, v3, s4, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v0, v7, vcc_lo
+; GFX10-NEXT:    v_and_or_b32 v3, v3, s4, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s5, 2
 ; GFX10-NEXT:    v_or3_b32 v3, v3, v16, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s1
 ; GFX10-NEXT:    v_and_or_b32 v5, v5, s3, s2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s5, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s2
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
 ; GFX10-NEXT:    v_and_b32_sdwa v13, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v5
-; GFX10-NEXT:    v_and_b32_sdwa v15, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX10-NEXT:    v_and_or_b32 v5, v2, s4, v9
 ; GFX10-NEXT:    v_and_b32_sdwa v14, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    v_and_b32_sdwa v15, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_and_b32_sdwa v16, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v5
+; GFX10-NEXT:    v_and_or_b32 v3, v3, s4, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX10-NEXT:    v_and_or_b32 v1, v1, s4, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
-; GFX10-NEXT:    v_and_or_b32 v18, v3, s4, v4
-; GFX10-NEXT:    v_or3_b32 v2, v5, v15, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX10-NEXT:    v_and_or_b32 v2, v2, s4, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v13, v6
 ; GFX10-NEXT:    v_or3_b32 v1, v1, v14, v8
-; GFX10-NEXT:    v_or3_b32 v3, v18, v16, v11
+; GFX10-NEXT:    v_or3_b32 v3, v3, v16, v11
+; GFX10-NEXT:    v_or3_b32 v2, v2, v15, v10
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
@@ -5323,12 +5323,11 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-NEXT:    s_mov_b32 s0, 8
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v0, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s7, 1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v15
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s7, 2
-; GFX10-NEXT:    v_and_b32_sdwa v0, v15, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
@@ -5337,7 +5336,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v15
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
 ; GFX10-NEXT:    v_and_or_b32 v6, v1, s5, v6
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -5347,18 +5346,19 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_or_b32 v19, v15, s5, v4
+; GFX10-NEXT:    v_and_or_b32 v4, v0, s5, v4
+; GFX10-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_and_or_b32 v8, v2, s5, v8
-; GFX10-NEXT:    v_and_b32_sdwa v15, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v2, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX10-NEXT:    v_and_or_b32 v9, v3, s5, v9
-; GFX10-NEXT:    v_and_b32_sdwa v14, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
-; GFX10-NEXT:    v_or3_b32 v0, v19, v0, v5
+; GFX10-NEXT:    v_or3_b32 v0, v4, v0, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_or3_b32 v1, v6, v1, v7
-; GFX10-NEXT:    v_or3_b32 v2, v8, v15, v10
-; GFX10-NEXT:    v_or3_b32 v3, v9, v14, v11
+; GFX10-NEXT:    v_or3_b32 v2, v8, v2, v10
+; GFX10-NEXT:    v_or3_b32 v3, v9, v3, v11
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
@@ -5814,16 +5814,16 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s11
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, v0, v5, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s1
 ; GFX10-NEXT:    s_mov_b32 s2, 8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v15
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v15
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, s2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
@@ -5831,23 +5831,23 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
-; GFX10-NEXT:    v_and_or_b32 v19, v15, s5, v4
-; GFX10-NEXT:    v_and_b32_sdwa v0, v15, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_or_b32 v4, v0, s5, v4
+; GFX10-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX10-NEXT:    v_and_or_b32 v6, v1, s5, v6
+; GFX10-NEXT:    v_and_or_b32 v8, v2, s5, v8
+; GFX10-NEXT:    v_and_or_b32 v9, v3, s5, v9
 ; GFX10-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-NEXT:    v_and_or_b32 v8, v2, s5, v8
-; GFX10-NEXT:    v_and_b32_sdwa v15, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v2, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX10-NEXT:    v_and_or_b32 v9, v3, s5, v9
-; GFX10-NEXT:    v_and_b32_sdwa v14, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
-; GFX10-NEXT:    v_or3_b32 v0, v19, v0, v5
+; GFX10-NEXT:    v_or3_b32 v0, v4, v0, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_or3_b32 v1, v6, v1, v7
-; GFX10-NEXT:    v_or3_b32 v2, v8, v15, v10
-; GFX10-NEXT:    v_or3_b32 v3, v9, v14, v11
+; GFX10-NEXT:    v_or3_b32 v2, v8, v2, v10
+; GFX10-NEXT:    v_or3_b32 v3, v9, v3, v11
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
@@ -6300,16 +6300,16 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s7
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, v0, v5, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s1
 ; GFX10-NEXT:    s_mov_b32 s2, 8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v15
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v15
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, s2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
@@ -6317,23 +6317,23 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
-; GFX10-NEXT:    v_and_or_b32 v19, v15, s8, v4
-; GFX10-NEXT:    v_and_b32_sdwa v0, v15, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_or_b32 v4, v0, s8, v4
+; GFX10-NEXT:    v_and_b32_sdwa v0, v0, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX10-NEXT:    v_and_or_b32 v6, v1, s8, v6
+; GFX10-NEXT:    v_and_or_b32 v8, v2, s8, v8
+; GFX10-NEXT:    v_and_or_b32 v9, v3, s8, v9
 ; GFX10-NEXT:    v_and_b32_sdwa v1, v1, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-NEXT:    v_and_or_b32 v8, v2, s8, v8
-; GFX10-NEXT:    v_and_b32_sdwa v15, v2, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v2, v2, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX10-NEXT:    v_and_or_b32 v9, v3, s8, v9
-; GFX10-NEXT:    v_and_b32_sdwa v14, v3, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v3, v3, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
-; GFX10-NEXT:    v_or3_b32 v0, v19, v0, v5
+; GFX10-NEXT:    v_or3_b32 v0, v4, v0, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_or3_b32 v1, v6, v1, v7
-; GFX10-NEXT:    v_or3_b32 v2, v8, v15, v10
-; GFX10-NEXT:    v_or3_b32 v3, v9, v14, v11
+; GFX10-NEXT:    v_or3_b32 v2, v8, v2, v10
+; GFX10-NEXT:    v_or3_b32 v3, v9, v3, v11
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
@@ -6659,7 +6659,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-LABEL: insertelement_v_v16i8_s_v:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GFX10-NEXT:    v_mov_b32_e32 v22, 8
+; GFX10-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    s_movk_i32 s3, 0xff
 ; GFX10-NEXT:    v_and_b32_e32 v0, 3, v2
@@ -6669,76 +6669,76 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 8, v5
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v26, s0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 8, v6
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_or_b32 v19, v4, s3, v9
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_b32_sdwa v15, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_and_b32_sdwa v16, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v5
+; GFX10-NEXT:    v_and_or_b32 v3, v3, s3, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX10-NEXT:    v_and_or_b32 v26, v3, s3, v26
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 8, v6
+; GFX10-NEXT:    v_and_or_b32 v4, v4, s3, v9
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 24, v6
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_b32_sdwa v17, v5, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 24, v6
+; GFX10-NEXT:    v_or3_b32 v3, v3, v15, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v12
-; GFX10-NEXT:    v_and_or_b32 v30, v5, s3, v11
-; GFX10-NEXT:    v_or3_b32 v3, v26, v15, v8
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_or3_b32 v26, v19, v16, v10
+; GFX10-NEXT:    v_and_or_b32 v5, v5, s3, v11
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_or3_b32 v4, v4, v16, v10
 ; GFX10-NEXT:    v_and_b32_sdwa v18, v6, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_or3_b32 v5, v30, v17, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v14
-; GFX10-NEXT:    v_and_or_b32 v11, v6, s3, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v3, v26, vcc_lo
+; GFX10-NEXT:    v_or3_b32 v5, v5, v17, v7
+; GFX10-NEXT:    v_and_or_b32 v6, v6, s3, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v3, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v9, v0, s3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
-; GFX10-NEXT:    v_or3_b32 v6, v11, v18, v8
+; GFX10-NEXT:    v_or3_b32 v6, v6, v18, v8
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v5, s0
-; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v9
+; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s1
-; GFX10-NEXT:    v_and_or_b32 v0, v7, v10, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v26, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, v0, s0
+; GFX10-NEXT:    v_and_or_b32 v0, v7, v8, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v3, v0, s2
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, v0, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, v0, s1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v18
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 8, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v4
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v22, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_and_b32_sdwa v13, v2, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v14, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_or_b32 v19, v2, s3, v5
-; GFX10-NEXT:    v_and_b32_sdwa v14, v18, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_or_b32 v2, v2, s3, v5
 ; GFX10-NEXT:    v_and_b32_sdwa v16, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    v_and_or_b32 v5, v0, s3, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX10-NEXT:    v_and_or_b32 v3, v3, s3, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
-; GFX10-NEXT:    v_and_or_b32 v3, v18, s3, v7
-; GFX10-NEXT:    v_and_or_b32 v5, v0, s3, v1
 ; GFX10-NEXT:    v_and_or_b32 v4, v4, s3, v9
-; GFX10-NEXT:    v_and_b32_sdwa v13, v2, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    v_or3_b32 v0, v2, v13, v6
 ; GFX10-NEXT:    v_or3_b32 v1, v3, v14, v8
 ; GFX10-NEXT:    v_or3_b32 v3, v5, v16, v11
 ; GFX10-NEXT:    v_or3_b32 v2, v4, v15, v10
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    v_or3_b32 v0, v19, v13, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
@@ -7063,7 +7063,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-LABEL: insertelement_v_v16i8_v_s:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GFX10-NEXT:    v_mov_b32_e32 v18, 8
+; GFX10-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    s_movk_i32 s3, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s4, s2, 2
@@ -7079,69 +7079,69 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 8, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 8, v5
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v19, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 8, v6
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 24, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 8, v6
 ; GFX10-NEXT:    v_and_b32_sdwa v14, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_or_b32 v22, v4, s3, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX10-NEXT:    v_and_or_b32 v1, v3, s3, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX10-NEXT:    v_and_or_b32 v3, v4, s3, v8
+; GFX10-NEXT:    v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_b32_sdwa v23, v5, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_and_b32_sdwa v16, v5, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 24, v6
 ; GFX10-NEXT:    v_or3_b32 v1, v1, v14, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v11
 ; GFX10-NEXT:    v_and_or_b32 v5, v5, s3, v10
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_or3_b32 v3, v22, v15, v9
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_or3_b32 v3, v3, v15, v9
 ; GFX10-NEXT:    v_and_b32_sdwa v17, v6, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v13
-; GFX10-NEXT:    v_or3_b32 v4, v5, v23, v4
+; GFX10-NEXT:    v_or3_b32 v4, v5, v16, v4
 ; GFX10-NEXT:    v_and_or_b32 v6, v6, s3, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s4, 2
-; GFX10-NEXT:    v_or3_b32 v7, v6, v17, v7
+; GFX10-NEXT:    v_or3_b32 v6, v6, v17, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s1
 ; GFX10-NEXT:    v_and_or_b32 v2, v5, s2, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s4, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v19, v1, v2, s2
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v3, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v22
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 8, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v22
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v18, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v19
-; GFX10-NEXT:    v_and_b32_sdwa v13, v19, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_and_b32_sdwa v13, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v14, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_or_b32 v19, v19, s3, v5
-; GFX10-NEXT:    v_and_b32_sdwa v14, v22, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_or_b32 v1, v1, s3, v5
 ; GFX10-NEXT:    v_and_b32_sdwa v16, v2, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    v_and_or_b32 v5, v2, s3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX10-NEXT:    v_and_or_b32 v3, v22, s3, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
 ; GFX10-NEXT:    v_and_or_b32 v4, v4, s3, v9
-; GFX10-NEXT:    v_and_or_b32 v5, v2, s3, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT:    v_or3_b32 v1, v3, v14, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
+; GFX10-NEXT:    v_and_or_b32 v3, v3, s3, v7
+; GFX10-NEXT:    v_or3_b32 v0, v1, v13, v6
 ; GFX10-NEXT:    v_or3_b32 v2, v4, v15, v10
+; GFX10-NEXT:    v_or3_b32 v1, v3, v14, v8
 ; GFX10-NEXT:    v_or3_b32 v3, v5, v16, v11
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    v_or3_b32 v0, v19, v13, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
@@ -7489,66 +7489,66 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_and_b32_sdwa v18, v5, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 24, v6
 ; GFX10-NEXT:    v_and_or_b32 v4, v4, s1, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 8, v7
 ; GFX10-NEXT:    v_and_or_b32 v5, v5, s1, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 8, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, v8, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_b32_sdwa v19, v6, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v14
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 24, v7
+; GFX10-NEXT:    v_or3_b32 v4, v4, v17, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v14
 ; GFX10-NEXT:    v_and_or_b32 v6, v6, s1, v13
-; GFX10-NEXT:    v_or3_b32 v15, v4, v17, v10
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_or3_b32 v5, v5, v18, v12
 ; GFX10-NEXT:    v_and_b32_sdwa v20, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v16
-; GFX10-NEXT:    v_and_or_b32 v7, v7, v1, v14
 ; GFX10-NEXT:    v_or3_b32 v6, v6, v19, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v15, v5, vcc_lo
+; GFX10-NEXT:    v_and_or_b32 v7, v7, v1, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v4, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, v0, v1
-; GFX10-NEXT:    v_or3_b32 v7, v7, v20, v10
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v3
+; GFX10-NEXT:    v_or3_b32 v7, v7, v20, v10
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s0
 ; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v7, s1
 ; GFX10-NEXT:    v_and_or_b32 v0, v9, v2, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, v0, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v6, v0, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v27, v7, v0, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v15, v0, s2
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v5, v0, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 8, v27
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v5, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v7, v0, s1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 24, v27
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v23, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 24, v0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v8, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_b32_sdwa v21, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_b32_sdwa v15, v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_or_b32 v19, v2, v1, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v8, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_and_b32_sdwa v14, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v15, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_and_b32_sdwa v16, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_or_b32 v10, v4, v1, v10
-; GFX10-NEXT:    v_and_b32_sdwa v17, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_or_b32 v2, v2, v1, v5
+; GFX10-NEXT:    v_and_b32_sdwa v17, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    v_and_or_b32 v5, v0, v1, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
-; GFX10-NEXT:    v_and_or_b32 v3, v27, v1, v8
-; GFX10-NEXT:    v_and_or_b32 v2, v18, v1, v7
+; GFX10-NEXT:    v_and_or_b32 v3, v3, v1, v7
+; GFX10-NEXT:    v_and_or_b32 v4, v4, v1, v10
+; GFX10-NEXT:    v_or3_b32 v0, v2, v14, v6
+; GFX10-NEXT:    v_or3_b32 v1, v3, v15, v9
+; GFX10-NEXT:    v_or3_b32 v2, v4, v16, v11
+; GFX10-NEXT:    v_or3_b32 v3, v5, v17, v12
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    v_or3_b32 v0, v19, v21, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-NEXT:    v_or3_b32 v3, v3, v17, v12
-; GFX10-NEXT:    v_or3_b32 v1, v2, v15, v9
-; GFX10-NEXT:    v_or3_b32 v2, v10, v16, v11
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index df7299573590..b5ccf4708ae5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -989,8 +989,8 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
 ; MOVREL-NEXT:    s_mov_b32 s14, s16
 ; MOVREL-NEXT:    v_mov_b32_e32 v16, s15
 ; MOVREL-NEXT:    v_mov_b32_e32 v2, s1
-; MOVREL-NEXT:    v_mov_b32_e32 v1, s0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; MOVREL-NEXT:    v_mov_b32_e32 v1, s0
 ; MOVREL-NEXT:    v_mov_b32_e32 v15, s14
 ; MOVREL-NEXT:    v_mov_b32_e32 v14, s13
 ; MOVREL-NEXT:    v_mov_b32_e32 v13, s12
@@ -1005,30 +1005,28 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
 ; MOVREL-NEXT:    v_mov_b32_e32 v4, s3
 ; MOVREL-NEXT:    v_mov_b32_e32 v3, s2
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
-; MOVREL-NEXT:    s_mov_b32 s30, s18
-; MOVREL-NEXT:    s_mov_b32 s31, s19
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s1, 2, v0
-; MOVREL-NEXT:    v_cndmask_b32_e64 v1, v1, s30, vcc_lo
-; MOVREL-NEXT:    v_cndmask_b32_e64 v2, v2, s31, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e64 v1, v1, s18, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e64 v2, v2, s19, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s2, 5, v0
-; MOVREL-NEXT:    v_cndmask_b32_e64 v3, v3, s30, s0
-; MOVREL-NEXT:    v_cndmask_b32_e64 v4, v4, s31, s0
+; MOVREL-NEXT:    v_cndmask_b32_e64 v3, v3, s18, s0
+; MOVREL-NEXT:    v_cndmask_b32_e64 v4, v4, s19, s0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s0, 4, v0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s3, 6, v0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s4, 7, v0
-; MOVREL-NEXT:    v_cndmask_b32_e64 v5, v5, s30, s1
-; MOVREL-NEXT:    v_cndmask_b32_e64 v6, v6, s31, s1
-; MOVREL-NEXT:    v_cndmask_b32_e64 v7, v7, s30, vcc_lo
-; MOVREL-NEXT:    v_cndmask_b32_e64 v8, v8, s31, vcc_lo
-; MOVREL-NEXT:    v_cndmask_b32_e64 v9, v9, s30, s0
-; MOVREL-NEXT:    v_cndmask_b32_e64 v10, v10, s31, s0
-; MOVREL-NEXT:    v_cndmask_b32_e64 v11, v11, s30, s2
-; MOVREL-NEXT:    v_cndmask_b32_e64 v12, v12, s31, s2
-; MOVREL-NEXT:    v_cndmask_b32_e64 v13, v13, s30, s3
-; MOVREL-NEXT:    v_cndmask_b32_e64 v14, v14, s31, s3
-; MOVREL-NEXT:    v_cndmask_b32_e64 v15, v15, s30, s4
-; MOVREL-NEXT:    v_cndmask_b32_e64 v16, v16, s31, s4
+; MOVREL-NEXT:    v_cndmask_b32_e64 v5, v5, s18, s1
+; MOVREL-NEXT:    v_cndmask_b32_e64 v6, v6, s19, s1
+; MOVREL-NEXT:    v_cndmask_b32_e64 v7, v7, s18, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e64 v8, v8, s19, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e64 v9, v9, s18, s0
+; MOVREL-NEXT:    v_cndmask_b32_e64 v10, v10, s19, s0
+; MOVREL-NEXT:    v_cndmask_b32_e64 v11, v11, s18, s2
+; MOVREL-NEXT:    v_cndmask_b32_e64 v12, v12, s19, s2
+; MOVREL-NEXT:    v_cndmask_b32_e64 v13, v13, s18, s3
+; MOVREL-NEXT:    v_cndmask_b32_e64 v14, v14, s19, s3
+; MOVREL-NEXT:    v_cndmask_b32_e64 v15, v15, s18, s4
+; MOVREL-NEXT:    v_cndmask_b32_e64 v16, v16, s19, s4
 ; MOVREL-NEXT:    global_store_dwordx4 v[0:1], v[1:4], off
 ; MOVREL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; MOVREL-NEXT:    global_store_dwordx4 v[0:1], v[5:8], off
@@ -1525,19 +1523,17 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
 ;
 ; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v:
 ; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    v_cmp_eq_u32_e64 s0, 1, v18
-; MOVREL-NEXT:    v_mov_b32_e32 v19, v0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v18
-; MOVREL-NEXT:    v_mov_b32_e32 v23, v1
+; MOVREL-NEXT:    v_cmp_eq_u32_e64 s0, 1, v18
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s1, 2, v18
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s2, 3, v18
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s3, 4, v18
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s4, 5, v18
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s5, 7, v18
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s6, 6, v18
-; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v19, v16, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v2, v2, v16, s0
-; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v23, v17, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v3, v3, v17, s0
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v4, v4, v16, s1
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v6, v6, v16, s2
@@ -2161,8 +2157,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do
 ; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v_add_1:
 ; MOVREL:       ; %bb.0: ; %entry
 ; MOVREL-NEXT:    v_add_nc_u32_e32 v18, 1, v18
-; MOVREL-NEXT:    v_mov_b32_e32 v19, v0
-; MOVREL-NEXT:    v_mov_b32_e32 v23, v1
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v18
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s0, 1, v18
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s1, 2, v18
@@ -2171,9 +2165,9 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s4, 5, v18
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s5, 7, v18
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s6, 6, v18
-; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v19, v16, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v2, v2, v16, s0
-; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v23, v17, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v3, v3, v17, s0
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v4, v4, v16, s1
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v6, v6, v16, s2
@@ -3550,28 +3544,28 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v
 ; MOVREL-NEXT:    s_mov_b32 s4, s6
 ; MOVREL-NEXT:    s_mov_b32 s5, s7
 ; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    v_mov_b32_e32 v16, s7
-; MOVREL-NEXT:    v_mov_b32_e32 v9, s0
+; MOVREL-NEXT:    v_mov_b32_e32 v14, s7
+; MOVREL-NEXT:    v_mov_b32_e32 v7, s0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s9, 0
-; MOVREL-NEXT:    v_mov_b32_e32 v10, s1
-; MOVREL-NEXT:    v_mov_b32_e32 v11, s2
-; MOVREL-NEXT:    v_mov_b32_e32 v12, s3
-; MOVREL-NEXT:    v_mov_b32_e32 v13, s4
-; MOVREL-NEXT:    v_cndmask_b32_e32 v7, v9, v0, vcc_lo
+; MOVREL-NEXT:    v_mov_b32_e32 v8, s1
+; MOVREL-NEXT:    v_mov_b32_e32 v9, s2
+; MOVREL-NEXT:    v_mov_b32_e32 v10, s3
+; MOVREL-NEXT:    v_mov_b32_e32 v11, s4
+; MOVREL-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s9, 1
-; MOVREL-NEXT:    v_mov_b32_e32 v14, s5
-; MOVREL-NEXT:    v_mov_b32_e32 v15, s6
-; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v10, v0, vcc_lo
+; MOVREL-NEXT:    v_mov_b32_e32 v12, s5
+; MOVREL-NEXT:    v_mov_b32_e32 v13, s6
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v8, v0, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s9, 2
-; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v11, v0, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v9, v0, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s9, 3
-; MOVREL-NEXT:    v_cndmask_b32_e32 v3, v12, v0, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v3, v10, v0, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s9, 4
-; MOVREL-NEXT:    v_cndmask_b32_e32 v4, v13, v0, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v4, v11, v0, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s9, 5
-; MOVREL-NEXT:    v_cndmask_b32_e32 v5, v14, v0, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v5, v12, v0, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s9, 6
-; MOVREL-NEXT:    v_cndmask_b32_e32 v6, v15, v0, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v6, v13, v0, vcc_lo
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, v7
 ; MOVREL-NEXT:    ; return to shader part epilog
 entry:
@@ -3624,29 +3618,29 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v
 ; MOVREL-NEXT:    s_mov_b32 s4, s6
 ; MOVREL-NEXT:    s_mov_b32 s5, s7
 ; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    v_mov_b32_e32 v16, s7
-; MOVREL-NEXT:    v_mov_b32_e32 v9, s0
+; MOVREL-NEXT:    v_mov_b32_e32 v15, s7
+; MOVREL-NEXT:    v_mov_b32_e32 v8, s0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; MOVREL-NEXT:    v_mov_b32_e32 v10, s1
-; MOVREL-NEXT:    v_mov_b32_e32 v11, s2
-; MOVREL-NEXT:    v_mov_b32_e32 v12, s3
-; MOVREL-NEXT:    v_mov_b32_e32 v13, s4
-; MOVREL-NEXT:    v_cndmask_b32_e32 v8, v9, v0, vcc_lo
+; MOVREL-NEXT:    v_mov_b32_e32 v9, s1
+; MOVREL-NEXT:    v_mov_b32_e32 v10, s2
+; MOVREL-NEXT:    v_mov_b32_e32 v11, s3
+; MOVREL-NEXT:    v_mov_b32_e32 v12, s4
+; MOVREL-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; MOVREL-NEXT:    v_mov_b32_e32 v14, s5
-; MOVREL-NEXT:    v_mov_b32_e32 v15, s6
-; MOVREL-NEXT:    v_cndmask_b32_e32 v7, v10, v0, vcc_lo
+; MOVREL-NEXT:    v_mov_b32_e32 v13, s5
+; MOVREL-NEXT:    v_mov_b32_e32 v14, s6
+; MOVREL-NEXT:    v_cndmask_b32_e32 v7, v9, v0, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v1
-; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v11, v0, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v10, v0, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v1
-; MOVREL-NEXT:    v_cndmask_b32_e32 v3, v12, v0, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v3, v11, v0, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v1
-; MOVREL-NEXT:    v_cndmask_b32_e32 v4, v13, v0, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v4, v12, v0, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v1
-; MOVREL-NEXT:    v_cndmask_b32_e32 v5, v14, v0, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v5, v13, v0, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v1
 ; MOVREL-NEXT:    v_mov_b32_e32 v1, v7
-; MOVREL-NEXT:    v_cndmask_b32_e32 v6, v15, v0, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v6, v14, v0, vcc_lo
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, v8
 ; MOVREL-NEXT:    ; return to shader part epilog
 entry:
@@ -4128,23 +4122,21 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec,
 ; MOVREL-LABEL: dyn_insertelement_v7f64_v_v_v:
 ; MOVREL:       ; %bb.0: ; %entry
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v16
+; MOVREL-NEXT:    v_cmp_eq_u32_e64 s0, 1, v16
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s1, 2, v16
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s2, 3, v16
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s3, 4, v16
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s4, 5, v16
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s5, 6, v16
-; MOVREL-NEXT:    v_mov_b32_e32 v19, v2
-; MOVREL-NEXT:    v_cmp_eq_u32_e64 s0, 1, v16
-; MOVREL-NEXT:    v_mov_b32_e32 v18, v3
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s2
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v8, v8, v14, s3
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v10, v10, v14, s4
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s5
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s2
-; MOVREL-NEXT:    v_cndmask_b32_e64 v2, v19, v14, s0
+; MOVREL-NEXT:    v_cndmask_b32_e64 v2, v2, v14, s0
+; MOVREL-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s0
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s3
-; MOVREL-NEXT:    v_cndmask_b32_e64 v3, v18, v15, s0
-; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v11, v11, v15, s4
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v4, v4, v14, s1
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v13, v13, v15, s5
@@ -4271,38 +4263,38 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg
 ; MOVREL-NEXT:    s_mov_b32 s7, s9
 ; MOVREL-NEXT:    s_mov_b32 s8, s10
 ; MOVREL-NEXT:    s_mov_b32 s9, s11
-; MOVREL-NEXT:    v_mov_b32_e32 v20, s15
-; MOVREL-NEXT:    v_mov_b32_e32 v19, s14
-; MOVREL-NEXT:    v_mov_b32_e32 v18, s13
-; MOVREL-NEXT:    v_mov_b32_e32 v17, s12
-; MOVREL-NEXT:    v_mov_b32_e32 v16, s11
-; MOVREL-NEXT:    v_mov_b32_e32 v15, s10
-; MOVREL-NEXT:    v_mov_b32_e32 v14, s9
-; MOVREL-NEXT:    v_mov_b32_e32 v13, s8
-; MOVREL-NEXT:    v_mov_b32_e32 v12, s7
-; MOVREL-NEXT:    v_mov_b32_e32 v11, s6
-; MOVREL-NEXT:    v_mov_b32_e32 v10, s5
-; MOVREL-NEXT:    v_mov_b32_e32 v9, s4
-; MOVREL-NEXT:    v_mov_b32_e32 v8, s3
-; MOVREL-NEXT:    v_mov_b32_e32 v7, s2
-; MOVREL-NEXT:    v_mov_b32_e32 v6, s1
-; MOVREL-NEXT:    v_mov_b32_e32 v5, s0
+; MOVREL-NEXT:    v_mov_b32_e32 v17, s15
+; MOVREL-NEXT:    v_mov_b32_e32 v16, s14
+; MOVREL-NEXT:    v_mov_b32_e32 v15, s13
+; MOVREL-NEXT:    v_mov_b32_e32 v14, s12
+; MOVREL-NEXT:    v_mov_b32_e32 v13, s11
+; MOVREL-NEXT:    v_mov_b32_e32 v12, s10
+; MOVREL-NEXT:    v_mov_b32_e32 v11, s9
+; MOVREL-NEXT:    v_mov_b32_e32 v10, s8
+; MOVREL-NEXT:    v_mov_b32_e32 v9, s7
+; MOVREL-NEXT:    v_mov_b32_e32 v8, s6
+; MOVREL-NEXT:    v_mov_b32_e32 v7, s5
+; MOVREL-NEXT:    v_mov_b32_e32 v6, s4
+; MOVREL-NEXT:    v_mov_b32_e32 v5, s3
+; MOVREL-NEXT:    v_mov_b32_e32 v4, s2
+; MOVREL-NEXT:    v_mov_b32_e32 v3, s1
+; MOVREL-NEXT:    v_mov_b32_e32 v2, s0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s12, 0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s0, s12, 1
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s1, s12, 4
-; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v5, v0, vcc_lo
-; MOVREL-NEXT:    v_cndmask_b32_e32 v3, v6, v1, vcc_lo
-; MOVREL-NEXT:    v_cndmask_b32_e64 v4, v7, v0, s0
+; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s12, 2
-; MOVREL-NEXT:    v_cndmask_b32_e64 v5, v8, v1, s0
+; MOVREL-NEXT:    v_cndmask_b32_e64 v4, v4, v0, s0
+; MOVREL-NEXT:    v_cndmask_b32_e64 v5, v5, v1, s0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s0, s12, 3
+; MOVREL-NEXT:    v_cndmask_b32_e32 v6, v6, v0, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v7, v7, v1, vcc_lo
 ; MOVREL-NEXT:    v_readfirstlane_b32 s2, v4
-; MOVREL-NEXT:    v_cndmask_b32_e32 v6, v9, v0, vcc_lo
-; MOVREL-NEXT:    v_cndmask_b32_e32 v7, v10, v1, vcc_lo
-; MOVREL-NEXT:    v_cndmask_b32_e64 v8, v11, v0, s0
-; MOVREL-NEXT:    v_cndmask_b32_e64 v9, v12, v1, s0
-; MOVREL-NEXT:    v_cndmask_b32_e64 v0, v13, v0, s1
-; MOVREL-NEXT:    v_cndmask_b32_e64 v1, v14, v1, s1
+; MOVREL-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s0
+; MOVREL-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s0
+; MOVREL-NEXT:    v_cndmask_b32_e64 v0, v10, v0, s1
+; MOVREL-NEXT:    v_cndmask_b32_e64 v1, v11, v1, s1
 ; MOVREL-NEXT:    v_readfirstlane_b32 s0, v2
 ; MOVREL-NEXT:    v_readfirstlane_b32 s1, v3
 ; MOVREL-NEXT:    v_readfirstlane_b32 s3, v5
@@ -4466,15 +4458,13 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec,
 ; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_s:
 ; MOVREL:       ; %bb.0: ; %entry
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
-; MOVREL-NEXT:    v_mov_b32_e32 v15, v2
-; MOVREL-NEXT:    v_mov_b32_e32 v14, v3
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
 ; MOVREL-NEXT:    v_readfirstlane_b32 s0, v0
 ; MOVREL-NEXT:    v_readfirstlane_b32 s1, v1
-; MOVREL-NEXT:    v_cndmask_b32_e32 v3, v14, v11, vcc_lo
-; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v15, v10, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 2
 ; MOVREL-NEXT:    v_readfirstlane_b32 s3, v3
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
@@ -4531,15 +4521,13 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec,
 ; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_v:
 ; MOVREL:       ; %bb.0: ; %entry
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v12
-; MOVREL-NEXT:    v_mov_b32_e32 v15, v2
-; MOVREL-NEXT:    v_mov_b32_e32 v14, v3
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
 ; MOVREL-NEXT:    v_readfirstlane_b32 s0, v0
 ; MOVREL-NEXT:    v_readfirstlane_b32 s1, v1
-; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v15, v10, vcc_lo
-; MOVREL-NEXT:    v_cndmask_b32_e32 v3, v14, v11, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v12
 ; MOVREL-NEXT:    v_readfirstlane_b32 s2, v2
 ; MOVREL-NEXT:    v_readfirstlane_b32 s3, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
index f3bc0466b5f5..062c0ad91ea9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
@@ -1828,10 +1828,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v8, s3
-; GFX10-NEXT:    v_mov_b32_e32 v7, s2
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v7, v4
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v8, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 40
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
index b438719a47ae..9c01dda2b83c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -29,7 +29,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ;
 ; GFX10NSA-LABEL: gather4_2d:
 ; GFX10NSA:       ; %bb.0: ; %main_body
-; GFX10NSA-NEXT:    s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10NSA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -45,7 +45,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
 ; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, 0xffff, v1
-; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
@@ -83,7 +83,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ;
 ; GFX10NSA-LABEL: gather4_cube:
 ; GFX10NSA:       ; %bb.0: ; %main_body
-; GFX10NSA-NEXT:    s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10NSA-NEXT:    v_mov_b32_e32 v3, 0xffff
@@ -102,7 +102,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
 ; GFX10NSA-NEXT:    v_and_or_b32 v1, v2, v3, s12
-; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
@@ -140,7 +140,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ;
 ; GFX10NSA-LABEL: gather4_2darray:
 ; GFX10NSA:       ; %bb.0: ; %main_body
-; GFX10NSA-NEXT:    s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10NSA-NEXT:    v_mov_b32_e32 v3, 0xffff
@@ -159,7 +159,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
 ; GFX10NSA-NEXT:    v_and_or_b32 v1, v2, v3, s12
-; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
@@ -195,7 +195,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ;
 ; GFX10NSA-LABEL: gather4_c_2d:
 ; GFX10NSA:       ; %bb.0: ; %main_body
-; GFX10NSA-NEXT:    s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10NSA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
@@ -211,7 +211,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
 ; GFX10NSA-NEXT:    v_and_or_b32 v1, v1, 0xffff, v2
-; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
@@ -249,7 +249,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ;
 ; GFX10NSA-LABEL: gather4_cl_2d:
 ; GFX10NSA:       ; %bb.0: ; %main_body
-; GFX10NSA-NEXT:    s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10NSA-NEXT:    v_mov_b32_e32 v3, 0xffff
@@ -268,7 +268,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
 ; GFX10NSA-NEXT:    v_and_or_b32 v1, v2, v3, s12
-; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
@@ -306,7 +306,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ;
 ; GFX10NSA-LABEL: gather4_c_cl_2d:
 ; GFX10NSA:       ; %bb.0: ; %main_body
-; GFX10NSA-NEXT:    s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10NSA-NEXT:    v_mov_b32_e32 v4, 0xffff
@@ -325,7 +325,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
 ; GFX10NSA-NEXT:    v_and_or_b32 v2, v3, v4, s12
-; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
@@ -361,7 +361,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ;
 ; GFX10NSA-LABEL: gather4_b_2d:
 ; GFX10NSA:       ; %bb.0: ; %main_body
-; GFX10NSA-NEXT:    s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10NSA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
@@ -377,7 +377,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
 ; GFX10NSA-NEXT:    v_and_or_b32 v1, v1, 0xffff, v2
-; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
@@ -413,7 +413,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ;
 ; GFX10NSA-LABEL: gather4_c_b_2d:
 ; GFX10NSA:       ; %bb.0: ; %main_body
-; GFX10NSA-NEXT:    s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10NSA-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -429,7 +429,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
 ; GFX10NSA-NEXT:    v_and_or_b32 v2, v2, 0xffff, v3
-; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
@@ -467,7 +467,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ;
 ; GFX10NSA-LABEL: gather4_b_cl_2d:
 ; GFX10NSA:       ; %bb.0: ; %main_body
-; GFX10NSA-NEXT:    s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10NSA-NEXT:    v_mov_b32_e32 v4, 0xffff
@@ -486,7 +486,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
 ; GFX10NSA-NEXT:    v_and_or_b32 v2, v3, v4, s12
-; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
@@ -524,7 +524,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ;
 ; GFX10NSA-LABEL: gather4_c_b_cl_2d:
 ; GFX10NSA:       ; %bb.0: ; %main_body
-; GFX10NSA-NEXT:    s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10NSA-NEXT:    v_mov_b32_e32 v5, 0xffff
@@ -543,7 +543,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10NSA-NEXT:    v_and_or_b32 v2, v2, v5, v3
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
 ; GFX10NSA-NEXT:    v_and_or_b32 v3, v4, v5, s12
-; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
index 5e82ab8c6ab1..f597fa920032 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
@@ -80,7 +80,7 @@ define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> in
 ;
 ; GFX10NSA-LABEL: gather4_2d_tfe:
 ; GFX10NSA:       ; %bb.0: ; %main_body
-; GFX10NSA-NEXT:    s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10NSA-NEXT:    v_mov_b32_e32 v5, v0
@@ -101,7 +101,7 @@ define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10NSA-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10NSA-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10NSA-NEXT:    v_mov_b32_e32 v4, v0
-; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
index 5226382f691a..d19db8b76a68 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
@@ -65,16 +65,16 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
 ;
 ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v12, v11
-; GFX10-NEXT:    v_mov_b32_e32 v13, v11
-; GFX10-NEXT:    v_mov_b32_e32 v14, v11
-; GFX10-NEXT:    v_mov_b32_e32 v15, v11
-; GFX10-NEXT:    v_mov_b32_e32 v0, v11
+; GFX10-NEXT:    v_mov_b32_e32 v9, v8
+; GFX10-NEXT:    v_mov_b32_e32 v10, v8
+; GFX10-NEXT:    v_mov_b32_e32 v11, v8
+; GFX10-NEXT:    v_mov_b32_e32 v12, v8
+; GFX10-NEXT:    v_mov_b32_e32 v0, v8
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
@@ -82,13 +82,13 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v1, v12
-; GFX10-NEXT:    v_mov_b32_e32 v2, v13
-; GFX10-NEXT:    v_mov_b32_e32 v3, v14
-; GFX10-NEXT:    v_mov_b32_e32 v4, v15
+; GFX10-NEXT:    v_mov_b32_e32 v1, v9
+; GFX10-NEXT:    v_mov_b32_e32 v2, v10
+; GFX10-NEXT:    v_mov_b32_e32 v3, v11
+; GFX10-NEXT:    v_mov_b32_e32 v4, v12
 ; GFX10-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v11, v4, s[10:11]
+; GFX10-NEXT:    global_store_dword v8, v4, s[10:11]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 1, i32 0)
@@ -129,16 +129,16 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ;
 ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v12, v11
-; GFX10-NEXT:    v_mov_b32_e32 v13, v11
-; GFX10-NEXT:    v_mov_b32_e32 v14, v11
-; GFX10-NEXT:    v_mov_b32_e32 v15, v11
-; GFX10-NEXT:    v_mov_b32_e32 v0, v11
+; GFX10-NEXT:    v_mov_b32_e32 v9, v8
+; GFX10-NEXT:    v_mov_b32_e32 v10, v8
+; GFX10-NEXT:    v_mov_b32_e32 v11, v8
+; GFX10-NEXT:    v_mov_b32_e32 v12, v8
+; GFX10-NEXT:    v_mov_b32_e32 v0, v8
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
@@ -146,13 +146,13 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v1, v12
-; GFX10-NEXT:    v_mov_b32_e32 v2, v13
-; GFX10-NEXT:    v_mov_b32_e32 v3, v14
-; GFX10-NEXT:    v_mov_b32_e32 v4, v15
+; GFX10-NEXT:    v_mov_b32_e32 v1, v9
+; GFX10-NEXT:    v_mov_b32_e32 v2, v10
+; GFX10-NEXT:    v_mov_b32_e32 v3, v11
+; GFX10-NEXT:    v_mov_b32_e32 v4, v12
 ; GFX10-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v11, v4, s[10:11]
+; GFX10-NEXT:    global_store_dword v8, v4, s[10:11]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
index d4d526b26e86..1f1b34bcd736 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
@@ -22,9 +22,9 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_and_or_b32 v7, v0, v6, v1
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v6, v3
-; GFX10-NEXT:    image_sample_d_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    v_and_or_b32 v0, v0, v6, v1
+; GFX10-NEXT:    v_and_or_b32 v1, v2, v6, v3
+; GFX10-NEXT:    image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -35,14 +35,14 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) {
 ; GFX10-LABEL: sample_d_3d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v11, v1
-; GFX10-NEXT:    v_and_or_b32 v1, v2, v11, s12
-; GFX10-NEXT:    v_and_or_b32 v2, v3, v11, v4
-; GFX10-NEXT:    v_and_or_b32 v3, v5, v11, s12
+; GFX10-NEXT:    v_and_or_b32 v0, v0, v9, v1
+; GFX10-NEXT:    v_and_or_b32 v1, v2, v9, s12
+; GFX10-NEXT:    v_and_or_b32 v2, v3, v9, v4
+; GFX10-NEXT:    v_and_or_b32 v3, v5, v9, s12
 ; GFX10-NEXT:    image_sample_d_g16 v[0:3], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -72,9 +72,9 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_and_or_b32 v11, v1, v7, v2
+; GFX10-NEXT:    v_and_or_b32 v1, v1, v7, v2
 ; GFX10-NEXT:    v_and_or_b32 v2, v3, v7, v4
-; GFX10-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -85,10 +85,10 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
 ; GFX10-LABEL: sample_d_cl_1d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v7, s12
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v7, s12
+; GFX10-NEXT:    v_and_or_b32 v0, v0, v4, s12
+; GFX10-NEXT:    v_and_or_b32 v1, v1, v4, s12
 ; GFX10-NEXT:    image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -102,10 +102,10 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_and_or_b32 v11, v0, v7, v1
-; GFX10-NEXT:    v_and_or_b32 v1, v2, v7, v9
-; GFX10-NEXT:    image_sample_d_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_and_or_b32 v0, v0, v7, v1
+; GFX10-NEXT:    v_and_or_b32 v1, v2, v7, v3
+; GFX10-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -116,10 +116,10 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
 ; GFX10-LABEL: sample_c_d_cl_1d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v7, s12
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v7, s12
+; GFX10-NEXT:    v_and_or_b32 v1, v1, v5, s12
+; GFX10-NEXT:    v_and_or_b32 v2, v2, v5, s12
 ; GFX10-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -133,9 +133,9 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10-NEXT:    v_and_or_b32 v1, v1, v8, v2
-; GFX10-NEXT:    v_and_or_b32 v2, v3, v8, v10
+; GFX10-NEXT:    v_and_or_b32 v2, v3, v8, v4
 ; GFX10-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -165,9 +165,9 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_and_or_b32 v7, v0, v6, v1
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v6, v3
-; GFX10-NEXT:    image_sample_cd_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    v_and_or_b32 v0, v0, v6, v1
+; GFX10-NEXT:    v_and_or_b32 v1, v2, v6, v3
+; GFX10-NEXT:    image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -196,9 +196,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_and_or_b32 v11, v1, v7, v2
+; GFX10-NEXT:    v_and_or_b32 v1, v1, v7, v2
 ; GFX10-NEXT:    v_and_or_b32 v2, v3, v7, v4
-; GFX10-NEXT:    image_sample_c_cd_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -209,10 +209,10 @@ main_body:
 define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
 ; GFX10-LABEL: sample_cd_cl_1d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v7, s12
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v7, s12
+; GFX10-NEXT:    v_and_or_b32 v0, v0, v4, s12
+; GFX10-NEXT:    v_and_or_b32 v1, v1, v4, s12
 ; GFX10-NEXT:    image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -226,10 +226,10 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_and_or_b32 v11, v0, v7, v1
-; GFX10-NEXT:    v_and_or_b32 v1, v2, v7, v9
-; GFX10-NEXT:    image_sample_cd_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_and_or_b32 v0, v0, v7, v1
+; GFX10-NEXT:    v_and_or_b32 v1, v2, v7, v3
+; GFX10-NEXT:    image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -240,10 +240,10 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
 ; GFX10-LABEL: sample_c_cd_cl_1d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v7, s12
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v7, s12
+; GFX10-NEXT:    v_and_or_b32 v1, v1, v5, s12
+; GFX10-NEXT:    v_and_or_b32 v2, v2, v5, s12
 ; GFX10-NEXT:    image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -257,9 +257,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10-NEXT:    v_and_or_b32 v1, v1, v8, v2
-; GFX10-NEXT:    v_and_or_b32 v2, v3, v8, v10
+; GFX10-NEXT:    v_and_or_b32 v2, v3, v8, v4
 ; GFX10-NEXT:    image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -273,9 +273,9 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10-NEXT:    v_and_or_b32 v2, v2, v9, v3
-; GFX10-NEXT:    v_and_or_b32 v3, v4, v9, v11
+; GFX10-NEXT:    v_and_or_b32 v3, v4, v9, v5
 ; GFX10-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -289,9 +289,9 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10-NEXT:    v_and_or_b32 v2, v2, v9, v3
-; GFX10-NEXT:    v_and_or_b32 v3, v4, v9, v11
+; GFX10-NEXT:    v_and_or_b32 v3, v4, v9, v5
 ; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
index 72a9dbbcb232..866bae4b3400 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -79,9 +79,9 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX10-NEXT:    v_and_or_b32 v3, v4, s5, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX10-NEXT:    v_or3_b32 v7, v0, v1, v2
+; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-NEXT:    v_or3_b32 v1, v3, v4, v5
-; GFX10-NEXT:    v_dot4_i32_i8 v0, v7, v1, v8
+; GFX10-NEXT:    v_dot4_i32_i8 v0, v0, v1, v8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %a.cast = bitcast <4 x i8> %a to i32
   %b.cast = bitcast <4 x i8> %b to i32

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
index 70e4021ff4ad..ffcc4ed7d38f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -79,9 +79,9 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX10-NEXT:    v_and_or_b32 v3, v4, s5, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX10-NEXT:    v_or3_b32 v7, v0, v1, v2
+; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-NEXT:    v_or3_b32 v1, v3, v4, v5
-; GFX10-NEXT:    v_dot4_u32_u8 v0, v7, v1, v8
+; GFX10-NEXT:    v_dot4_u32_u8 v0, v0, v1, v8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %a.cast = bitcast <4 x i8> %a to i32
   %b.cast = bitcast <4 x i8> %b to i32

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
index 1b8689d10a1e..23cc4fb459d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
@@ -351,8 +351,8 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB3_4
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote
-; GFX10-32-NEXT:    s_wqm_b32 s28, s12
-; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10-32-NEXT:    s_wqm_b32 s14, s12
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10-32-NEXT:  BB3_3: ; %.continue
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
 ; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
@@ -374,7 +374,7 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-64-NEXT:    s_wqm_b64 exec, exec
 ; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
-; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
 ; GFX10-64-NEXT:    s_cbranch_execz BB3_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
@@ -383,7 +383,7 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
 ; GFX10-64-NEXT:  BB3_3: ; %.continue
-; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
@@ -487,8 +487,8 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB4_4
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote
-; GFX10-32-NEXT:    s_wqm_b32 s28, s12
-; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10-32-NEXT:    s_wqm_b32 s14, s12
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10-32-NEXT:  BB4_3: ; %.continue
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
 ; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
@@ -510,7 +510,7 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
-; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
 ; GFX10-64-NEXT:    s_cbranch_execz BB4_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
@@ -519,7 +519,7 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
 ; GFX10-64-NEXT:  BB4_3: ; %.continue
-; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
 ; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
@@ -632,8 +632,8 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32
 ; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB5_2
 ; GFX10-64-NEXT:  ; %bb.1: ; %.entry
-; GFX10-64-NEXT:    s_wqm_b64 s[28:29], s[12:13]
-; GFX10-64-NEXT:    s_and_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    s_wqm_b64 s[14:15], s[12:13]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[14:15]
 ; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
 ; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
index 38634ea10e5c..939b491ff08c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
@@ -192,7 +192,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    ds_read_u8 v10, v0 offset:8
 ; GFX10-NEXT:    ds_read_u8 v12, v0 offset:10
 ; GFX10-NEXT:    ds_read_u8 v13, v0 offset:11
-; GFX10-NEXT:    ds_read_u8 v25, v0 offset:12
+; GFX10-NEXT:    ds_read_u8 v14, v0 offset:12
 ; GFX10-NEXT:    ds_read_u8 v15, v0 offset:13
 ; GFX10-NEXT:    ds_read_u8 v16, v0 offset:14
 ; GFX10-NEXT:    ds_read_u8 v0, v0 offset:15
@@ -213,7 +213,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
 ; GFX10-NEXT:    v_and_b32_e32 v6, v6, v11
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(9)
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v21, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX10-NEXT:    v_and_or_b32 v1, v8, s4, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
@@ -221,7 +221,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    v_and_or_b32 v4, v9, s4, v4
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX10-NEXT:    v_and_b32_e32 v9, v13, v11
-; GFX10-NEXT:    v_and_or_b32 v7, v10, v11, v21
+; GFX10-NEXT:    v_and_or_b32 v7, v10, v11, v7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
@@ -230,7 +230,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    v_and_b32_e32 v0, v0, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT:    v_and_or_b32 v10, v25, v11, v10
+; GFX10-NEXT:    v_and_or_b32 v10, v14, v11, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
index 6dda1f4b2816..eeef6bcade9a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
@@ -158,11 +158,11 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    ds_read_u8 v5, v0 offset:6
 ; GFX10-NEXT:    ds_read_u8 v6, v0 offset:7
 ; GFX10-NEXT:    ds_read_u8 v7, v0 offset:9
-; GFX10-NEXT:    ds_read_u8 v15, v0 offset:10
+; GFX10-NEXT:    ds_read_u8 v8, v0 offset:10
 ; GFX10-NEXT:    ds_read_u8 v9, v0 offset:11
 ; GFX10-NEXT:    ds_read_u8 v10, v0
 ; GFX10-NEXT:    ds_read_u8 v11, v0 offset:4
-; GFX10-NEXT:    ds_read_u8 v14, v0 offset:8
+; GFX10-NEXT:    ds_read_u8 v0, v0 offset:8
 ; GFX10-NEXT:    v_mov_b32_e32 v12, 0xff
 ; GFX10-NEXT:    v_mov_b32_e32 v13, 8
 ; GFX10-NEXT:    s_movk_i32 s4, 0xff
@@ -182,19 +182,18 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX10-NEXT:    v_and_b32_e32 v8, v15, v12
+; GFX10-NEXT:    v_and_b32_e32 v8, v8, v12
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX10-NEXT:    v_and_b32_e32 v9, v9, v12
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX10-NEXT:    v_and_or_b32 v1, v10, s4, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_and_or_b32 v7, v0, v12, v7
 ; GFX10-NEXT:    v_and_or_b32 v4, v11, s4, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_and_or_b32 v7, v14, v12, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v3
@@ -266,9 +265,9 @@ define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    ds_read_u16 v1, v0 offset:2
 ; GFX10-NEXT:    ds_read_u16 v2, v0 offset:6
 ; GFX10-NEXT:    ds_read_u16 v3, v0 offset:10
-; GFX10-NEXT:    ds_read_u16 v7, v0
-; GFX10-NEXT:    ds_read_u16 v11, v0 offset:4
-; GFX10-NEXT:    ds_read_u16 v15, v0 offset:8
+; GFX10-NEXT:    ds_read_u16 v4, v0
+; GFX10-NEXT:    ds_read_u16 v5, v0 offset:4
+; GFX10-NEXT:    ds_read_u16 v6, v0 offset:8
 ; GFX10-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX10-NEXT:    v_and_b32_e32 v0, s4, v1
@@ -280,11 +279,11 @@ define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX10-NEXT:    v_and_or_b32 v0, v7, s4, v0
+; GFX10-NEXT:    v_and_or_b32 v0, v4, s4, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX10-NEXT:    v_and_or_b32 v1, v11, s4, v1
+; GFX10-NEXT:    v_and_or_b32 v1, v5, s4, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_and_or_b32 v2, v15, s4, v2
+; GFX10-NEXT:    v_and_or_b32 v2, v6, s4, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2
   ret <3 x i32> %load

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
index 73e1da080f19..0b8efd5e154d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -108,7 +108,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    ds_read_u8 v10, v0 offset:8
 ; GFX10-NEXT:    ds_read_u8 v12, v0 offset:10
 ; GFX10-NEXT:    ds_read_u8 v13, v0 offset:11
-; GFX10-NEXT:    ds_read_u8 v25, v0 offset:12
+; GFX10-NEXT:    ds_read_u8 v14, v0 offset:12
 ; GFX10-NEXT:    ds_read_u8 v15, v0 offset:13
 ; GFX10-NEXT:    ds_read_u8 v16, v0 offset:14
 ; GFX10-NEXT:    ds_read_u8 v0, v0 offset:15
@@ -129,7 +129,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
 ; GFX10-NEXT:    v_and_b32_e32 v6, v6, v11
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(9)
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v21, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX10-NEXT:    v_and_or_b32 v1, v8, s4, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
@@ -137,7 +137,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    v_and_or_b32 v4, v9, s4, v4
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX10-NEXT:    v_and_b32_e32 v9, v13, v11
-; GFX10-NEXT:    v_and_or_b32 v7, v10, v11, v21
+; GFX10-NEXT:    v_and_or_b32 v7, v10, v11, v7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
@@ -146,7 +146,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    v_and_b32_e32 v0, v0, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT:    v_and_or_b32 v10, v25, v11, v10
+; GFX10-NEXT:    v_and_or_b32 v10, v14, v11, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -242,11 +242,11 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    ds_read_u8 v5, v0 offset:6
 ; GFX10-NEXT:    ds_read_u8 v6, v0 offset:7
 ; GFX10-NEXT:    ds_read_u8 v7, v0 offset:9
-; GFX10-NEXT:    ds_read_u8 v15, v0 offset:10
+; GFX10-NEXT:    ds_read_u8 v8, v0 offset:10
 ; GFX10-NEXT:    ds_read_u8 v9, v0 offset:11
 ; GFX10-NEXT:    ds_read_u8 v10, v0
 ; GFX10-NEXT:    ds_read_u8 v11, v0 offset:4
-; GFX10-NEXT:    ds_read_u8 v14, v0 offset:8
+; GFX10-NEXT:    ds_read_u8 v0, v0 offset:8
 ; GFX10-NEXT:    v_mov_b32_e32 v12, 0xff
 ; GFX10-NEXT:    v_mov_b32_e32 v13, 8
 ; GFX10-NEXT:    s_movk_i32 s4, 0xff
@@ -266,19 +266,18 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX10-NEXT:    v_and_b32_e32 v8, v15, v12
+; GFX10-NEXT:    v_and_b32_e32 v8, v8, v12
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX10-NEXT:    v_and_b32_e32 v9, v9, v12
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX10-NEXT:    v_and_or_b32 v1, v10, s4, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_and_or_b32 v7, v0, v12, v7
 ; GFX10-NEXT:    v_and_or_b32 v4, v11, s4, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_and_or_b32 v7, v14, v12, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v3
@@ -410,27 +409,27 @@ define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x)
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
 ; GFX10-NEXT:    ds_write_b8 v0, v1
-; GFX10-NEXT:    ds_write_b8 v0, v7 offset:1
+; GFX10-NEXT:    ds_write_b8 v0, v4 offset:1
 ; GFX10-NEXT:    ds_write_b8 v0, v5 offset:2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
 ; GFX10-NEXT:    ds_write_b8 v0, v6 offset:3
 ; GFX10-NEXT:    ds_write_b8 v0, v2 offset:4
 ; GFX10-NEXT:    ds_write_b8 v0, v1 offset:5
-; GFX10-NEXT:    ds_write_b8 v0, v7 offset:6
+; GFX10-NEXT:    ds_write_b8 v0, v4 offset:6
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
 ; GFX10-NEXT:    ds_write_b8 v0, v5 offset:7
 ; GFX10-NEXT:    ds_write_b8 v0, v3 offset:8
 ; GFX10-NEXT:    ds_write_b8 v0, v1 offset:9
 ; GFX10-NEXT:    ds_write_b8 v0, v2 offset:10
-; GFX10-NEXT:    ds_write_b8 v0, v7 offset:11
+; GFX10-NEXT:    ds_write_b8 v0, v4 offset:11
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 494593ea3554..b390c736a22c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1608,12 +1608,8 @@ define <2 x i64> @v_lshr_v2i64(<2 x i64> %value, <2 x i64> %amount) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v7, v2
-; GFX10-NEXT:    v_mov_b32_e32 v8, v3
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v4, v[10:11]
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v6, v[7:8]
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v4, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v6, v[2:3]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr <2 x i64> %value, %amount
   ret <2 x i64> %result

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index fbf6d90e624b..dddad69df467 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -585,12 +585,12 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v3
 ; GFX10-NEXT:    v_add_co_u32 v6, s4, v6, v7
 ; GFX10-NEXT:    v_mul_hi_u32 v7, v1, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v9
 ; GFX10-NEXT:    v_add_co_u32 v1, s4, v6, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s4
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v5, v7
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v11, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v10, v6
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v4, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i96 %num, %den
@@ -997,24 +997,24 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
 ; GFX10-NEXT:    v_add_co_u32 v8, s5, v9, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v18, s4, v13, v11
+; GFX10-NEXT:    v_add_co_u32 v11, s4, v13, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, v10, v9
 ; GFX10-NEXT:    v_mul_lo_u32 v10, v2, v5
-; GFX10-NEXT:    v_add_co_u32 v11, s4, v18, v15
+; GFX10-NEXT:    v_add_co_u32 v11, s4, v11, v15
 ; GFX10-NEXT:    v_mul_hi_u32 v15, v2, v4
 ; GFX10-NEXT:    v_add3_u32 v12, v14, v12, v13
 ; GFX10-NEXT:    v_mul_lo_u32 v13, v1, v6
 ; GFX10-NEXT:    v_mul_hi_u32 v1, v1, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v2, s4, v11, v9
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, v3, v10
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v0, v6
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v4
-; GFX10-NEXT:    v_add3_u32 v10, v10, v13, v7
+; GFX10-NEXT:    v_add3_u32 v3, v3, v13, v7
 ; GFX10-NEXT:    v_add3_u32 v4, v12, v14, v5
-; GFX10-NEXT:    v_add3_u32 v1, v10, v15, v1
+; GFX10-NEXT:    v_add3_u32 v1, v3, v15, v1
 ; GFX10-NEXT:    v_add3_u32 v3, v1, v6, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -2758,13 +2758,15 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX10-NEXT:    v_add_co_u32 v16, s4, v16, v17
 ; GFX10-NEXT:    v_mul_hi_u32 v27, v0, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s4
-; GFX10-NEXT:    v_mul_lo_u32 v7, v7, v8
-; GFX10-NEXT:    v_mul_lo_u32 v15, v0, v15
+; GFX10-NEXT:    v_mul_hi_u32 v29, v3, v9
+; GFX10-NEXT:    v_mul_hi_u32 v31, v4, v9
 ; GFX10-NEXT:    v_add_co_u32 v16, s4, v16, v18
+; GFX10-NEXT:    v_mul_lo_u32 v7, v7, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v19, s4, v19, v20
 ; GFX10-NEXT:    v_mul_lo_u32 v20, v2, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s4
+; GFX10-NEXT:    v_mul_lo_u32 v15, v0, v15
 ; GFX10-NEXT:    v_add_nc_u32_e32 v17, v17, v18
 ; GFX10-NEXT:    v_mul_lo_u32 v18, v0, v10
 ; GFX10-NEXT:    v_add_co_u32 v18, s4, v19, v18
@@ -2781,7 +2783,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX10-NEXT:    v_mul_hi_u32 v21, v2, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v20, s4, v20, v22
-; GFX10-NEXT:    v_cndmask_b32_e64 v29, 0, 1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v17, s5, v18, v17
 ; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s5
@@ -2791,11 +2793,10 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX10-NEXT:    v_mul_lo_u32 v25, v4, v8
 ; GFX10-NEXT:    v_mul_lo_u32 v26, v3, v9
 ; GFX10-NEXT:    v_add_co_u32 v20, s4, v20, v23
-; GFX10-NEXT:    v_add3_u32 v18, v19, v29, v18
+; GFX10-NEXT:    v_add3_u32 v18, v19, v22, v18
 ; GFX10-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s4
-; GFX10-NEXT:    v_mul_hi_u32 v29, v3, v9
 ; GFX10-NEXT:    v_add_co_u32 v20, s5, v20, v27
-; GFX10-NEXT:    v_add3_u32 v30, v21, v24, v23
+; GFX10-NEXT:    v_add3_u32 v19, v21, v24, v23
 ; GFX10-NEXT:    v_mul_lo_u32 v21, v2, v10
 ; GFX10-NEXT:    v_add_co_u32 v22, s4, v25, v26
 ; GFX10-NEXT:    v_mul_lo_u32 v24, v1, v11
@@ -2813,7 +2814,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX10-NEXT:    v_mul_hi_u32 v22, v2, v9
 ; GFX10-NEXT:    v_add3_u32 v24, v25, v27, v24
 ; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
-; GFX10-NEXT:    v_add3_u32 v19, v30, v23, v20
+; GFX10-NEXT:    v_add3_u32 v19, v19, v23, v20
 ; GFX10-NEXT:    v_add_co_u32 v21, s4, v21, v26
 ; GFX10-NEXT:    v_mul_hi_u32 v20, v1, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s4
@@ -2822,120 +2823,119 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX10-NEXT:    v_mul_lo_u32 v22, v5, v8
 ; GFX10-NEXT:    v_add3_u32 v23, v24, v25, v26
 ; GFX10-NEXT:    v_mul_lo_u32 v24, v4, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v30, 0, 1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v20, s4, v21, v20
 ; GFX10-NEXT:    v_mul_lo_u32 v26, v3, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v20, s5, v20, v27
-; GFX10-NEXT:    v_add_co_u32 v31, s4, v22, v24
-; GFX10-NEXT:    v_add3_u32 v35, v23, v30, v21
+; GFX10-NEXT:    v_add_co_u32 v22, s4, v22, v24
+; GFX10-NEXT:    v_add3_u32 v21, v23, v25, v21
 ; GFX10-NEXT:    v_mul_lo_u32 v23, v2, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v34, 0, 1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v22, s4, v31, v26
+; GFX10-NEXT:    v_add_co_u32 v22, s4, v22, v26
 ; GFX10-NEXT:    v_mul_lo_u32 v26, v1, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v27, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v19, s5, v20, v19
-; GFX10-NEXT:    v_add_co_u32 v31, s4, v22, v23
+; GFX10-NEXT:    v_add_co_u32 v22, s4, v22, v23
 ; GFX10-NEXT:    v_mul_lo_u32 v23, v0, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v28, 0, 1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v22, s4, v31, v26
+; GFX10-NEXT:    v_add_co_u32 v22, s4, v22, v26
 ; GFX10-NEXT:    v_mul_hi_u32 v26, v4, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v30, 0, 1, s4
-; GFX10-NEXT:    v_add3_u32 v20, v35, v25, v20
-; GFX10-NEXT:    v_add_co_u32 v31, s4, v22, v23
-; GFX10-NEXT:    v_add3_u32 v23, v34, v27, v28
+; GFX10-NEXT:    v_add3_u32 v20, v21, v25, v20
+; GFX10-NEXT:    v_add_co_u32 v22, s4, v22, v23
+; GFX10-NEXT:    v_add3_u32 v23, v24, v27, v28
 ; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s4
-; GFX10-NEXT:    v_mul_hi_u32 v22, v2, v10
+; GFX10-NEXT:    v_mul_lo_u32 v27, v6, v8
 ; GFX10-NEXT:    v_mul_lo_u32 v28, v5, v9
-; GFX10-NEXT:    v_add_co_u32 v27, s4, v31, v26
-; GFX10-NEXT:    v_mul_hi_u32 v26, v1, v11
+; GFX10-NEXT:    v_add_co_u32 v21, s4, v22, v26
+; GFX10-NEXT:    v_mul_hi_u32 v22, v2, v10
 ; GFX10-NEXT:    v_add3_u32 v23, v23, v30, v24
 ; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s4
-; GFX10-NEXT:    v_add_co_u32 v21, s4, v27, v29
-; GFX10-NEXT:    v_mul_lo_u32 v27, v6, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
+; GFX10-NEXT:    v_mul_hi_u32 v26, v1, v11
+; GFX10-NEXT:    v_add_co_u32 v21, s4, v21, v29
 ; GFX10-NEXT:    v_mul_hi_u32 v29, v0, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v21, s4, v21, v22
 ; GFX10-NEXT:    v_add3_u32 v23, v23, v24, v25
 ; GFX10-NEXT:    v_mul_lo_u32 v24, v4, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v33, 0, 1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v25, s4, v27, v28
-; GFX10-NEXT:    v_add_co_u32 v31, s5, v21, v26
+; GFX10-NEXT:    v_add_co_u32 v21, s5, v21, v26
 ; GFX10-NEXT:    v_mul_lo_u32 v27, v3, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v28, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v24, s4, v25, v24
-; GFX10-NEXT:    v_add_co_u32 v21, s5, v31, v29
-; GFX10-NEXT:    v_add3_u32 v39, v23, v33, v26
+; GFX10-NEXT:    v_add_co_u32 v21, s5, v21, v29
+; GFX10-NEXT:    v_add3_u32 v22, v23, v22, v26
 ; GFX10-NEXT:    v_mul_lo_u32 v23, v2, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v35, 0, 1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v24, s4, v24, v27
 ; GFX10-NEXT:    v_mul_lo_u32 v27, v1, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v29, 0, 1, s4
-; GFX10-NEXT:    v_mul_hi_u32 v31, v4, v9
-; GFX10-NEXT:    v_mul_hi_u32 v25, v3, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v34, s4, v24, v23
+; GFX10-NEXT:    v_add_co_u32 v20, s5, v21, v20
+; GFX10-NEXT:    v_add_co_u32 v23, s4, v24, v23
 ; GFX10-NEXT:    v_mul_lo_u32 v24, v0, v14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v30, 0, 1, s4
-; GFX10-NEXT:    v_add3_u32 v35, v28, v35, v29
-; GFX10-NEXT:    v_add_co_u32 v20, s5, v21, v20
-; GFX10-NEXT:    v_add_co_u32 v23, s4, v34, v27
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v23, s4, v23, v27
 ; GFX10-NEXT:    v_mul_hi_u32 v27, v5, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v32, 0, 1, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s5
-; GFX10-NEXT:    v_mul_lo_u32 v29, v3, v12
-; GFX10-NEXT:    v_add_co_u32 v34, s4, v23, v24
-; GFX10-NEXT:    v_mul_hi_u32 v3, v3, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v28, 0, 1, s4
-; GFX10-NEXT:    v_add3_u32 v22, v35, v30, v32
-; GFX10-NEXT:    v_add3_u32 v21, v39, v26, v21
-; GFX10-NEXT:    v_add_co_u32 v34, s4, v34, v27
+; GFX10-NEXT:    v_add3_u32 v21, v22, v26, v21
 ; GFX10-NEXT:    v_mul_hi_u32 v26, v2, v11
+; GFX10-NEXT:    v_add_co_u32 v23, s4, v23, v24
+; GFX10-NEXT:    v_add3_u32 v24, v28, v25, v29
+; GFX10-NEXT:    v_cndmask_b32_e64 v28, 0, 1, s4
+; GFX10-NEXT:    v_mul_hi_u32 v25, v3, v10
+; GFX10-NEXT:    v_mul_lo_u32 v29, v3, v12
+; GFX10-NEXT:    v_add_co_u32 v23, s4, v23, v27
+; GFX10-NEXT:    v_add3_u32 v24, v24, v30, v32
 ; GFX10-NEXT:    v_cndmask_b32_e64 v27, 0, 1, s4
-; GFX10-NEXT:    v_add_co_u32 v23, s4, v34, v31
+; GFX10-NEXT:    v_mul_hi_u32 v3, v3, v11
+; GFX10-NEXT:    v_add_co_u32 v23, s4, v23, v31
+; GFX10-NEXT:    v_add3_u32 v22, v24, v28, v27
 ; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s4
-; GFX10-NEXT:    v_add3_u32 v22, v22, v28, v27
 ; GFX10-NEXT:    v_mul_lo_u32 v28, v6, v9
 ; GFX10-NEXT:    v_add_co_u32 v23, s4, v23, v25
 ; GFX10-NEXT:    v_mul_hi_u32 v27, v1, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v6, v8
-; GFX10-NEXT:    v_add_co_u32 v30, s4, v23, v26
-; GFX10-NEXT:    v_add3_u32 v33, v22, v24, v25
+; GFX10-NEXT:    v_add_co_u32 v23, s4, v23, v26
+; GFX10-NEXT:    v_add3_u32 v22, v22, v24, v25
 ; GFX10-NEXT:    v_mul_lo_u32 v24, v5, v10
 ; GFX10-NEXT:    v_mul_lo_u32 v25, v4, v11
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v7, v28
 ; GFX10-NEXT:    v_mul_lo_u32 v28, v2, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s4
-; GFX10-NEXT:    v_add_co_u32 v23, s4, v30, v27
 ; GFX10-NEXT:    v_mul_hi_u32 v5, v5, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v27, 0, 1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s4
+; GFX10-NEXT:    v_add_co_u32 v23, s4, v23, v27
 ; GFX10-NEXT:    v_mul_hi_u32 v4, v4, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v27, 0, 1, s4
 ; GFX10-NEXT:    v_add3_u32 v7, v7, v24, v25
 ; GFX10-NEXT:    v_mul_lo_u32 v24, v1, v14
 ; GFX10-NEXT:    v_mul_hi_u32 v25, v0, v13
-; GFX10-NEXT:    v_add3_u32 v33, v33, v26, v27
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v2, v12
-; GFX10-NEXT:    v_add3_u32 v26, v7, v29, v28
 ; GFX10-NEXT:    v_mul_hi_u32 v1, v1, v13
-; GFX10-NEXT:    v_add3_u32 v7, v26, v24, v15
-; GFX10-NEXT:    v_add_co_u32 v11, s4, v23, v25
+; GFX10-NEXT:    v_add3_u32 v7, v7, v29, v28
+; GFX10-NEXT:    v_add3_u32 v22, v22, v26, v27
+; GFX10-NEXT:    v_add3_u32 v7, v7, v24, v15
+; GFX10-NEXT:    v_add_co_u32 v9, s4, v23, v25
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s4
 ; GFX10-NEXT:    v_add3_u32 v5, v7, v6, v5
-; GFX10-NEXT:    v_add_co_u32 v6, s4, v11, v21
+; GFX10-NEXT:    v_add_co_u32 v6, s4, v9, v21
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s4
 ; GFX10-NEXT:    v_add3_u32 v3, v5, v4, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v4, v0, v14
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v8
-; GFX10-NEXT:    v_add3_u32 v5, v33, v10, v7
-; GFX10-NEXT:    v_add3_u32 v3, v3, v2, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v16
+; GFX10-NEXT:    v_add3_u32 v5, v22, v10, v7
+; GFX10-NEXT:    v_add3_u32 v1, v3, v2, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v17
-; GFX10-NEXT:    v_add3_u32 v7, v3, v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v18
+; GFX10-NEXT:    v_add3_u32 v7, v1, v4, v5
+; GFX10-NEXT:    v_mov_b32_e32 v1, v16
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v19
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v20
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
index 1e0d7e88bc27..16c48719bf1c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
@@ -413,12 +413,12 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_rndne_f16_e32 v3, v1
 ; GFX10-NEXT:    v_rndne_f16_e32 v2, v0
-; GFX10-NEXT:    v_rndne_f16_sdwa v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_rndne_f16_e32 v3, v1
+; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_and_or_b32 v0, v2, v4, v7
+; GFX10-NEXT:    v_and_or_b32 v0, v2, v4, v0
 ; GFX10-NEXT:    v_and_or_b32 v1, v3, v4, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 50fa5c749844..12b3b5409b62 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -4199,16 +4199,16 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v0, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[10:11], v[0:1]
+; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
 ; GFX10-NEXT:    v_add_co_u32 v0, s5, v6, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
@@ -4543,30 +4543,26 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v14, v0
-; GFX10-NEXT:    v_mov_b32_e32 v15, v1
-; GFX10-NEXT:    v_mov_b32_e32 v17, v2
-; GFX10-NEXT:    v_mov_b32_e32 v18, v3
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v4
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[4:5]
-; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v14, v4
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v2, v6
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s6, 0, v[6:7]
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v15, v5, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v19, vcc_lo, v17, v6
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v20, vcc_lo, v18, v7, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 31, v9
-; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[14:15]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v20
+; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
+; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v11
 ; GFX10-NEXT:    v_add_co_u32 v1, s5, v12, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[19:20], v[17:18]
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
 ; GFX10-NEXT:    v_add_co_u32 v2, s7, v0, 0
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc_lo
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s6, s5
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
   ret <2 x i64> %result
@@ -5327,7 +5323,6 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5]
 ; GFX10-NEXT:    s_movk_i32 s0, 0x7f
 ; GFX10-NEXT:    s_sub_i32 s1, 64, s0
-; GFX10-NEXT:    v_lshrrev_b64 v[15:16], s0, v[4:5]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
@@ -5335,33 +5330,34 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s1, v[6:7]
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], s0, v[4:5]
 ; GFX10-NEXT:    s_sub_i32 s1, s0, 64
 ; GFX10-NEXT:    s_cmp_lt_u32 s0, 64
-; GFX10-NEXT:    v_ashrrev_i64 v[0:1], s0, v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
-; GFX10-NEXT:    v_or_b32_e32 v8, v15, v8
-; GFX10-NEXT:    v_or_b32_e32 v9, v16, v9
-; GFX10-NEXT:    v_ashrrev_i32_e32 v15, 31, v7
+; GFX10-NEXT:    v_or_b32_e32 v8, v0, v8
+; GFX10-NEXT:    v_or_b32_e32 v9, v1, v9
+; GFX10-NEXT:    v_ashrrev_i64 v[0:1], s0, v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i64 v[2:3], s1, v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc_lo
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s0, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc_lo
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc_lo
 ; GFX10-NEXT:    s_and_b32 s0, 1, s1
 ; GFX10-NEXT:    s_and_b32 s1, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
 ; GFX10-NEXT:    v_xor_b32_e32 v9, v11, v10
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v15, v0, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s0
 ; GFX10-NEXT:    v_and_b32_e32 v8, 1, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v15, v1, s0
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v8
@@ -5569,64 +5565,60 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ;
 ; GFX10-LABEL: saddsat_i128_vs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0
-; GFX10-NEXT:    v_mov_b32_e32 v6, v1
-; GFX10-NEXT:    v_mov_b32_e32 v9, v2
-; GFX10-NEXT:    v_mov_b32_e32 v10, v3
-; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], 0
-; GFX10-NEXT:    v_add_co_u32 v15, vcc_lo, v5, s0
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, s0
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[0:1], 0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v16, vcc_lo, s1, v6, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v19, vcc_lo, s2, v9, vcc_lo
-; GFX10-NEXT:    s_and_b32 s1, 1, s4
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v20, vcc_lo, s3, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[15:16], v[5:6]
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[2:3], 0
-; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v20
+; GFX10-NEXT:    s_and_b32 s1, 1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[19:20], v[9:10]
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[19:20], v[9:10]
+; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
 ; GFX10-NEXT:    s_movk_i32 s0, 0x7f
 ; GFX10-NEXT:    s_sub_i32 s2, 64, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], s2, v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v1, v0, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], s0, v[4:5]
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], s0, v[15:16]
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], s2, v[19:20]
 ; GFX10-NEXT:    s_sub_i32 s1, s0, 64
 ; GFX10-NEXT:    s_cmp_lt_u32 s0, 64
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v9, v8, vcc_lo
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
-; GFX10-NEXT:    v_ashrrev_i64 v[8:9], s1, v[19:20]
+; GFX10-NEXT:    v_ashrrev_i64 v[8:9], s1, v[6:7]
 ; GFX10-NEXT:    s_cmp_eq_u32 s0, 0
 ; GFX10-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    v_ashrrev_i64 v[0:1], s0, v[19:20]
+; GFX10-NEXT:    v_ashrrev_i64 v[0:1], s0, v[6:7]
 ; GFX10-NEXT:    s_and_b32 s0, 1, s1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
 ; GFX10-NEXT:    s_and_b32 s1, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
 ; GFX10-NEXT:    v_xor_b32_e32 v9, v11, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v15, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v16, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v7, v0, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s0
 ; GFX10-NEXT:    v_and_b32_e32 v8, 1, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v7, v1, s0
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v0, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v15, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v16, v3, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v19, v8, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v20, v9, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v3, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
@@ -5959,28 +5951,20 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v22, v0
-; GFX10-NEXT:    v_mov_b32_e32 v23, v1
-; GFX10-NEXT:    v_mov_b32_e32 v20, v2
-; GFX10-NEXT:    v_mov_b32_e32 v21, v3
+; GFX10-NEXT:    v_add_co_u32 v16, vcc_lo, v0, v8
 ; GFX10-NEXT:    s_movk_i32 s5, 0x7f
-; GFX10-NEXT:    v_add_co_u32 v16, vcc_lo, v22, v8
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo
 ; GFX10-NEXT:    s_sub_i32 s6, 64, s5
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, v23, v9, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo
 ; GFX10-NEXT:    s_sub_i32 s7, s5, 64
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, v20, v10, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1]
 ; GFX10-NEXT:    s_cmp_lt_u32 s5, 64
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v19, vcc_lo, v21, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[22:23]
-; GFX10-NEXT:    v_mov_b32_e32 v26, v4
-; GFX10-NEXT:    v_mov_b32_e32 v27, v5
-; GFX10-NEXT:    v_mov_b32_e32 v24, v6
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], s6, v[18:19]
-; GFX10-NEXT:    v_mov_b32_e32 v25, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[20:21]
+; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[20:21]
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], s6, v[18:19]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v20, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, 0, v[8:9]
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], s5, v[16:17]
@@ -5991,7 +5975,6 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i64 v[0:1], s5, v[18:19]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 31, v19
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc_lo
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
 ; GFX10-NEXT:    v_ashrrev_i64 v[8:9], s7, v[18:19]
@@ -5999,33 +5982,34 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX10-NEXT:    s_and_b32 s8, 1, vcc_lo
 ; GFX10-NEXT:    s_and_b32 s4, 1, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v19
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s8
 ; GFX10-NEXT:    v_xor_b32_e32 v9, v10, v20
 ; GFX10-NEXT:    s_cmp_lt_u32 s5, 64
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v16, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v17, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v11, v0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v11, v1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s4
 ; GFX10-NEXT:    v_and_b32_e32 v8, 1, v9
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v20, vcc_lo, 0, v0, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v21, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_add_co_u32 v8, s4, v26, v12
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s4, v27, v13, s4
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v10, s4, v24, v14, s4
+; GFX10-NEXT:    v_add_co_u32 v8, s4, v4, v12
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s4, v5, v13, s4
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v10, s4, v6, v14, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v11, s4, v25, v15, s4
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, v[8:9], v[26:27]
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v11, s4, v7, v15, s4
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, v[8:9], v[4:5]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v3, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v18, v20, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b64 v[3:4], s5, v[8:9]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[10:11], v[24:25]
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[10:11], v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s4
 ; GFX10-NEXT:    v_cmp_gt_u64_e64 s4, 0, v[12:13]
 ; GFX10-NEXT:    v_lshlrev_b64 v[12:13], s6, v[10:11]
@@ -6035,7 +6019,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_or_b32_e32 v13, v4, v13
 ; GFX10-NEXT:    v_ashrrev_i64 v[3:4], s5, v[10:11]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s4
-; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, v[10:11], v[24:25]
+; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, v[10:11], v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v16, v5, s4
 ; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[14:15]
 ; GFX10-NEXT:    v_ashrrev_i64 v[5:6], s7, v[10:11]
@@ -6049,13 +6033,13 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    s_and_b32 s6, 1, s4
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
 ; GFX10-NEXT:    v_xor_b32_e32 v7, v14, v7
-; GFX10-NEXT:    v_ashrrev_i32_e32 v18, 31, v11
+; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s4
 ; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v18, v3, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v18, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v12, v3, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s5
 ; GFX10-NEXT:    v_add_co_u32 v5, s4, v5, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s4, 0, v6, s4
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, v7
@@ -6592,23 +6576,21 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s16, s0, s8
 ; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX10-NEXT:    s_mov_b32 s46, s0
 ; GFX10-NEXT:    s_and_b32 s17, s17, 1
-; GFX10-NEXT:    s_mov_b32 s47, s1
 ; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX10-NEXT:    s_addc_u32 s17, s1, s9
 ; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[16:17], s[46:47]
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
 ; GFX10-NEXT:    s_and_b32 s18, s18, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX10-NEXT:    s_addc_u32 s30, s2, s10
+; GFX10-NEXT:    s_addc_u32 s18, s2, s10
 ; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s19, s19, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX10-NEXT:    s_addc_u32 s31, s3, s11
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[30:31], s[2:3]
-; GFX10-NEXT:    s_cmp_eq_u64 s[30:31], s[2:3]
+; GFX10-NEXT:    s_addc_u32 s19, s3, s11
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
+; GFX10-NEXT:    s_cmp_eq_u64 s[18:19], s[2:3]
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s2, s[8:9], 0
 ; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
@@ -6628,13 +6610,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
 ; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[2:3], s[16:17], s20
-; GFX10-NEXT:    s_lshl_b64 s[8:9], s[30:31], s22
+; GFX10-NEXT:    s_lshl_b64 s[8:9], s[18:19], s22
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX10-NEXT:    s_ashr_i32 s10, s31, 31
+; GFX10-NEXT:    s_ashr_i32 s10, s19, 31
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX10-NEXT:    s_ashr_i64 s[0:1], s[30:31], s20
-; GFX10-NEXT:    s_ashr_i64 s[8:9], s[30:31], s21
+; GFX10-NEXT:    s_ashr_i64 s[0:1], s[18:19], s20
+; GFX10-NEXT:    s_ashr_i64 s[8:9], s[18:19], s21
 ; GFX10-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX10-NEXT:    s_mov_b32 s11, s10
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[8:9]
@@ -6655,7 +6637,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    v_mov_b32_e32 v3, s31
+; GFX10-NEXT:    v_mov_b32_e32 v3, s19
 ; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_addc_u32 s0, s0, 0
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
@@ -6669,7 +6651,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s1, vcc_lo
 ; GFX10-NEXT:    s_and_b32 s3, s3, 1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s30
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10-NEXT:    s_addc_u32 s3, s5, s13
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index 174df2d5a832..4dcbd7c9e092 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -1571,12 +1571,8 @@ define <2 x i64> @v_shl_v2i64(<2 x i64> %value, <2 x i64> %amount) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v7, v2
-; GFX10-NEXT:    v_mov_b32_e32 v8, v3
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v4, v[10:11]
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v6, v[7:8]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v6, v[2:3]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = shl <2 x i64> %value, %amount
   ret <2 x i64> %result

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index e4858b872ee6..4e99dacabf41 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -4185,16 +4185,16 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_sub_co_u32 v10, vcc_lo, v0, v2
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[10:11], v[0:1]
+; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
 ; GFX10-NEXT:    v_add_co_u32 v0, s5, v6, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
@@ -4529,30 +4529,26 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v14, v0
-; GFX10-NEXT:    v_mov_b32_e32 v15, v1
-; GFX10-NEXT:    v_mov_b32_e32 v17, v2
-; GFX10-NEXT:    v_mov_b32_e32 v18, v3
+; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v0, v4
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[4:5]
-; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v14, v4
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v10, vcc_lo, v2, v6
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, 0, v[6:7]
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v15, v5, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v19, vcc_lo, v17, v6
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v20, vcc_lo, v18, v7, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 31, v9
-; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[14:15]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v20
+; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
+; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v11
 ; GFX10-NEXT:    v_add_co_u32 v1, s5, v12, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[19:20], v[17:18]
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
 ; GFX10-NEXT:    v_add_co_u32 v2, s7, v0, 0
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc_lo
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s6, s5
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v20, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
   ret <2 x i64> %result
@@ -5313,7 +5309,6 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5]
 ; GFX10-NEXT:    s_movk_i32 s0, 0x7f
 ; GFX10-NEXT:    s_sub_i32 s1, 64, s0
-; GFX10-NEXT:    v_lshrrev_b64 v[15:16], s0, v[4:5]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
@@ -5321,33 +5316,34 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s1, v[6:7]
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], s0, v[4:5]
 ; GFX10-NEXT:    s_sub_i32 s1, s0, 64
 ; GFX10-NEXT:    s_cmp_lt_u32 s0, 64
-; GFX10-NEXT:    v_ashrrev_i64 v[0:1], s0, v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
-; GFX10-NEXT:    v_or_b32_e32 v8, v15, v8
-; GFX10-NEXT:    v_or_b32_e32 v9, v16, v9
-; GFX10-NEXT:    v_ashrrev_i32_e32 v15, 31, v7
+; GFX10-NEXT:    v_or_b32_e32 v8, v0, v8
+; GFX10-NEXT:    v_or_b32_e32 v9, v1, v9
+; GFX10-NEXT:    v_ashrrev_i64 v[0:1], s0, v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i64 v[2:3], s1, v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc_lo
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s0, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc_lo
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc_lo
 ; GFX10-NEXT:    s_and_b32 s0, 1, s1
 ; GFX10-NEXT:    s_and_b32 s1, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
 ; GFX10-NEXT:    v_xor_b32_e32 v9, v11, v10
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v15, v0, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s0
 ; GFX10-NEXT:    v_and_b32_e32 v8, 1, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v15, v1, s0
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v8
@@ -5555,64 +5551,60 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ;
 ; GFX10-LABEL: ssubsat_i128_vs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0
-; GFX10-NEXT:    v_mov_b32_e32 v6, v1
-; GFX10-NEXT:    v_mov_b32_e32 v9, v2
-; GFX10-NEXT:    v_mov_b32_e32 v10, v3
-; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], 0
-; GFX10-NEXT:    v_sub_co_u32 v15, vcc_lo, v5, s0
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, s0
 ; GFX10-NEXT:    v_cmp_gt_u64_e64 s0, s[0:1], 0
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v16, vcc_lo, s1, v6, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v19, vcc_lo, s2, v9, vcc_lo
-; GFX10-NEXT:    s_and_b32 s1, 1, s4
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v20, vcc_lo, s3, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[15:16], v[5:6]
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[2:3], 0
-; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v20
+; GFX10-NEXT:    s_and_b32 s1, 1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[19:20], v[9:10]
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[19:20], v[9:10]
+; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
 ; GFX10-NEXT:    s_movk_i32 s0, 0x7f
 ; GFX10-NEXT:    s_sub_i32 s2, 64, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], s2, v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v1, v0, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], s0, v[4:5]
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], s0, v[15:16]
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], s2, v[19:20]
 ; GFX10-NEXT:    s_sub_i32 s1, s0, 64
 ; GFX10-NEXT:    s_cmp_lt_u32 s0, 64
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v9, v8, vcc_lo
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
-; GFX10-NEXT:    v_ashrrev_i64 v[8:9], s1, v[19:20]
+; GFX10-NEXT:    v_ashrrev_i64 v[8:9], s1, v[6:7]
 ; GFX10-NEXT:    s_cmp_eq_u32 s0, 0
 ; GFX10-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    v_ashrrev_i64 v[0:1], s0, v[19:20]
+; GFX10-NEXT:    v_ashrrev_i64 v[0:1], s0, v[6:7]
 ; GFX10-NEXT:    s_and_b32 s0, 1, s1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
 ; GFX10-NEXT:    s_and_b32 s1, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
 ; GFX10-NEXT:    v_xor_b32_e32 v9, v11, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v15, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v16, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v7, v0, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s0
 ; GFX10-NEXT:    v_and_b32_e32 v8, 1, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v7, v1, s0
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v0, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v15, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v16, v3, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v19, v8, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v20, v9, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v3, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
@@ -5945,28 +5937,20 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v22, v0
-; GFX10-NEXT:    v_mov_b32_e32 v23, v1
-; GFX10-NEXT:    v_mov_b32_e32 v20, v2
-; GFX10-NEXT:    v_mov_b32_e32 v21, v3
+; GFX10-NEXT:    v_sub_co_u32 v16, vcc_lo, v0, v8
 ; GFX10-NEXT:    s_movk_i32 s5, 0x7f
-; GFX10-NEXT:    v_sub_co_u32 v16, vcc_lo, v22, v8
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo
 ; GFX10-NEXT:    s_sub_i32 s6, 64, s5
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v17, vcc_lo, v23, v9, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo
 ; GFX10-NEXT:    s_sub_i32 s7, s5, 64
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v18, vcc_lo, v20, v10, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1]
 ; GFX10-NEXT:    s_cmp_lt_u32 s5, 64
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v19, vcc_lo, v21, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[22:23]
-; GFX10-NEXT:    v_mov_b32_e32 v26, v4
-; GFX10-NEXT:    v_mov_b32_e32 v27, v5
-; GFX10-NEXT:    v_mov_b32_e32 v24, v6
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], s6, v[18:19]
-; GFX10-NEXT:    v_mov_b32_e32 v25, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[20:21]
+; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[20:21]
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], s6, v[18:19]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v20, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9]
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], s5, v[16:17]
@@ -5977,7 +5961,6 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i64 v[0:1], s5, v[18:19]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 31, v19
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc_lo
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
 ; GFX10-NEXT:    v_ashrrev_i64 v[8:9], s7, v[18:19]
@@ -5985,33 +5968,34 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX10-NEXT:    s_and_b32 s8, 1, vcc_lo
 ; GFX10-NEXT:    s_and_b32 s4, 1, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v19
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s8
 ; GFX10-NEXT:    v_xor_b32_e32 v9, v10, v20
 ; GFX10-NEXT:    s_cmp_lt_u32 s5, 64
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v16, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v17, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v11, v0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v11, v1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s4
 ; GFX10-NEXT:    v_and_b32_e32 v8, 1, v9
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v20, vcc_lo, 0, v0, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v21, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_sub_co_u32 v8, s4, v26, v12
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v9, s4, v27, v13, s4
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v10, s4, v24, v14, s4
+; GFX10-NEXT:    v_sub_co_u32 v8, s4, v4, v12
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v9, s4, v5, v13, s4
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v10, s4, v6, v14, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v2, vcc_lo
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v11, s4, v25, v15, s4
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, v[8:9], v[26:27]
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v11, s4, v7, v15, s4
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, v[8:9], v[4:5]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v3, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v18, v20, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b64 v[3:4], s5, v[8:9]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[10:11], v[24:25]
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[10:11], v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s4
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, 0, v[12:13]
 ; GFX10-NEXT:    v_lshlrev_b64 v[12:13], s6, v[10:11]
@@ -6021,7 +6005,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_or_b32_e32 v13, v4, v13
 ; GFX10-NEXT:    v_ashrrev_i64 v[3:4], s5, v[10:11]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s4
-; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, v[10:11], v[24:25]
+; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, v[10:11], v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v16, v5, s4
 ; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[14:15]
 ; GFX10-NEXT:    v_ashrrev_i64 v[5:6], s7, v[10:11]
@@ -6035,13 +6019,13 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    s_and_b32 s6, 1, s4
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
 ; GFX10-NEXT:    v_xor_b32_e32 v7, v14, v7
-; GFX10-NEXT:    v_ashrrev_i32_e32 v18, 31, v11
+; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s4
 ; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v18, v3, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v18, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v12, v3, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s5
 ; GFX10-NEXT:    v_add_co_u32 v5, s4, v5, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s4, 0, v6, s4
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, v7
@@ -6578,23 +6562,21 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_sub_u32 s16, s0, s8
 ; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX10-NEXT:    s_mov_b32 s46, s0
 ; GFX10-NEXT:    s_and_b32 s17, s17, 1
-; GFX10-NEXT:    s_mov_b32 s47, s1
 ; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX10-NEXT:    s_subb_u32 s17, s1, s9
 ; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[16:17], s[46:47]
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
 ; GFX10-NEXT:    s_and_b32 s18, s18, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX10-NEXT:    s_subb_u32 s30, s2, s10
+; GFX10-NEXT:    s_subb_u32 s18, s2, s10
 ; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s19, s19, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX10-NEXT:    s_subb_u32 s31, s3, s11
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[30:31], s[2:3]
-; GFX10-NEXT:    s_cmp_eq_u64 s[30:31], s[2:3]
+; GFX10-NEXT:    s_subb_u32 s19, s3, s11
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
+; GFX10-NEXT:    s_cmp_eq_u64 s[18:19], s[2:3]
 ; GFX10-NEXT:    v_cmp_gt_u64_e64 s2, s[8:9], 0
 ; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
@@ -6614,13 +6596,13 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
 ; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[2:3], s[16:17], s20
-; GFX10-NEXT:    s_lshl_b64 s[8:9], s[30:31], s22
+; GFX10-NEXT:    s_lshl_b64 s[8:9], s[18:19], s22
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX10-NEXT:    s_ashr_i32 s10, s31, 31
+; GFX10-NEXT:    s_ashr_i32 s10, s19, 31
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX10-NEXT:    s_ashr_i64 s[0:1], s[30:31], s20
-; GFX10-NEXT:    s_ashr_i64 s[8:9], s[30:31], s21
+; GFX10-NEXT:    s_ashr_i64 s[0:1], s[18:19], s20
+; GFX10-NEXT:    s_ashr_i64 s[8:9], s[18:19], s21
 ; GFX10-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX10-NEXT:    s_mov_b32 s11, s10
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[8:9]
@@ -6641,7 +6623,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    v_mov_b32_e32 v3, s31
+; GFX10-NEXT:    v_mov_b32_e32 v3, s19
 ; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_addc_u32 s0, s0, 0
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
@@ -6655,7 +6637,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s1, vcc_lo
 ; GFX10-NEXT:    s_and_b32 s3, s3, 1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s30
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10-NEXT:    s_subb_u32 s3, s5, s13
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index f6fc451f8060..8c1bc5fb57ca 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -176,22 +176,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    s_lshr_b32 s9, s6, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s6
-; GFX10-NEXT:    v_mov_b32_e32 v15, s1
+; GFX10-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v10, s5
 ; GFX10-NEXT:    s_lshr_b32 s0, s6, 24
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v11, s9
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v8, s4
-; GFX10-NEXT:    v_mov_b32_e32 v19, s8
+; GFX10-NEXT:    v_mov_b32_e32 v9, s8
 ; GFX10-NEXT:    ds_write_b8 v1, v0
 ; GFX10-NEXT:    ds_write_b8 v1, v2 offset:4
 ; GFX10-NEXT:    ds_write_b8 v1, v4 offset:1
-; GFX10-NEXT:    ds_write_b8 v1, v15 offset:2
+; GFX10-NEXT:    ds_write_b8 v1, v5 offset:2
 ; GFX10-NEXT:    ds_write_b8 v1, v6 offset:3
 ; GFX10-NEXT:    ds_write_b8 v1, v7 offset:5
 ; GFX10-NEXT:    ds_write_b8 v1, v8 offset:6
-; GFX10-NEXT:    ds_write_b8 v1, v19 offset:7
+; GFX10-NEXT:    ds_write_b8 v1, v9 offset:7
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    s_lshr_b32 s0, s7, 8
 ; GFX10-NEXT:    s_lshr_b32 s1, s7, 16
@@ -202,12 +202,12 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    s_lshr_b32 s2, s7, 24
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s1
-; GFX10-NEXT:    v_mov_b32_e32 v7, s2
+; GFX10-NEXT:    v_mov_b32_e32 v5, s2
 ; GFX10-NEXT:    ds_write_b8 v1, v0 offset:11
 ; GFX10-NEXT:    ds_write_b8 v1, v2 offset:12
 ; GFX10-NEXT:    ds_write_b8 v1, v3 offset:13
 ; GFX10-NEXT:    ds_write_b8 v1, v4 offset:14
-; GFX10-NEXT:    ds_write_b8 v1, v7 offset:15
+; GFX10-NEXT:    ds_write_b8 v1, v5 offset:15
 ; GFX10-NEXT:    s_endpgm
   store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
   ret void
@@ -286,7 +286,7 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    s_lshr_b32 s2, s6, 16
 ; GFX10-NEXT:    s_lshr_b32 s3, s7, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s7
-; GFX10-NEXT:    v_mov_b32_e32 v11, s0
+; GFX10-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v8, s3
@@ -294,7 +294,7 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    ds_write_b16 v1, v2 offset:4
 ; GFX10-NEXT:    ds_write_b16 v1, v3 offset:8
 ; GFX10-NEXT:    ds_write_b16 v1, v4 offset:12
-; GFX10-NEXT:    ds_write_b16 v1, v11 offset:2
+; GFX10-NEXT:    ds_write_b16 v1, v5 offset:2
 ; GFX10-NEXT:    ds_write_b16 v1, v6 offset:6
 ; GFX10-NEXT:    ds_write_b16 v1, v7 offset:10
 ; GFX10-NEXT:    ds_write_b16 v1, v8 offset:14

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
index 88277f4d2bdf..c96a98fe631f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
@@ -147,12 +147,12 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    s_lshr_b32 s3, s12, 24
 ; GFX10-NEXT:    s_lshr_b32 s6, s14, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s0
-; GFX10-NEXT:    v_mov_b32_e32 v15, s5
+; GFX10-NEXT:    v_mov_b32_e32 v9, s5
 ; GFX10-NEXT:    s_lshr_b32 s2, s13, 8
 ; GFX10-NEXT:    s_lshr_b32 s4, s13, 16
 ; GFX10-NEXT:    s_lshr_b32 s7, s14, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s14
-; GFX10-NEXT:    v_mov_b32_e32 v11, s1
+; GFX10-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX10-NEXT:    s_lshr_b32 s8, s14, 24
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v10, s6
@@ -161,13 +161,13 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    ds_write_b8 v1, v0
 ; GFX10-NEXT:    ds_write_b8 v1, v2 offset:4
 ; GFX10-NEXT:    ds_write_b8 v1, v4 offset:1
-; GFX10-NEXT:    ds_write_b8 v1, v11 offset:2
+; GFX10-NEXT:    ds_write_b8 v1, v5 offset:2
 ; GFX10-NEXT:    ds_write_b8 v1, v6 offset:3
 ; GFX10-NEXT:    ds_write_b8 v1, v7 offset:5
 ; GFX10-NEXT:    ds_write_b8 v1, v8 offset:6
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s8
-; GFX10-NEXT:    ds_write_b8 v1, v15 offset:7
+; GFX10-NEXT:    ds_write_b8 v1, v9 offset:7
 ; GFX10-NEXT:    ds_write_b8 v1, v3 offset:8
 ; GFX10-NEXT:    ds_write_b8 v1, v10 offset:9
 ; GFX10-NEXT:    ds_write_b8 v1, v0 offset:10
@@ -239,13 +239,13 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s14
 ; GFX10-NEXT:    s_lshr_b32 s2, s14, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s0
-; GFX10-NEXT:    v_mov_b32_e32 v7, s1
+; GFX10-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX10-NEXT:    ds_write_b16 v1, v0
 ; GFX10-NEXT:    ds_write_b16 v1, v2 offset:4
 ; GFX10-NEXT:    ds_write_b16 v1, v3 offset:8
 ; GFX10-NEXT:    ds_write_b16 v1, v4 offset:2
-; GFX10-NEXT:    ds_write_b16 v1, v7 offset:6
+; GFX10-NEXT:    ds_write_b16 v1, v5 offset:6
 ; GFX10-NEXT:    ds_write_b16 v1, v6 offset:10
 ; GFX10-NEXT:    s_endpgm
   store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 0e23a1675782..681b8f0d1286 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -2819,20 +2819,16 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v10, v4
-; GFX10-NEXT:    v_mov_b32_e32 v11, v5
-; GFX10-NEXT:    v_mov_b32_e32 v15, v6
-; GFX10-NEXT:    v_mov_b32_e32 v16, v7
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v10
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v11, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v5, vcc_lo, v2, v15
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v3, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[10:11]
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, v[5:6], v[15:16]
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v6
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, v[2:3], v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, -1, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, -1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, -1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, -1, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
   ret <2 x i64> %result
@@ -3203,22 +3199,22 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ;
 ; GFX10-LABEL: uaddsat_i128_sv:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, s0, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, s1, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[0:1]
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, s0, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[2:3]
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[2:3]
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v10, -1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v11, -1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, -1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v5, -1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, -1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, -1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, -1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, -1, vcc_lo
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
@@ -3435,33 +3431,25 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v18, v8
-; GFX10-NEXT:    v_mov_b32_e32 v19, v9
-; GFX10-NEXT:    v_mov_b32_e32 v16, v10
-; GFX10-NEXT:    v_mov_b32_e32 v17, v11
-; GFX10-NEXT:    v_mov_b32_e32 v10, v12
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v18
-; GFX10-NEXT:    v_mov_b32_e32 v11, v13
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v19, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v20, v14
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v16, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v21, v15
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v17, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[18:19]
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v8
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v10
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v11, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v6, v20, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v7, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[16:17]
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v12
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[10:11]
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[20:21]
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[16:17]
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[20:21]
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[14:15]
 ; GFX10-NEXT:    v_and_b32_e32 v8, 1, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, v13, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index f5c9bb56e780..b71703a2abcc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -2689,16 +2689,12 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v0, v2
-; GFX10-NEXT:    v_mov_b32_e32 v1, v3
-; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v10, v4
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v11, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[4:5]
-; GFX10-NEXT:    v_sub_co_u32 v4, s4, v0, v6
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v5, s4, v1, v7, s4
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, v[0:1], v[6:7]
+; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v0, v4
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX10-NEXT:    v_sub_co_u32 v4, s4, v2, v6
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v5, s4, v3, v7, s4
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, v[2:3], v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, 0, s4
@@ -2974,7 +2970,7 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s10, s10, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX10-NEXT:    s_subb_u32 s14, s2, s6
+; GFX10-NEXT:    s_subb_u32 s10, s2, s6
 ; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX10-NEXT:    s_and_b32 s11, s11, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
@@ -2989,7 +2985,7 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, s8, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, s14, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, s10, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, s1, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, s9, 0, vcc_lo
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
@@ -3305,41 +3301,33 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v22, v0
-; GFX10-NEXT:    v_mov_b32_e32 v23, v1
-; GFX10-NEXT:    v_mov_b32_e32 v20, v2
-; GFX10-NEXT:    v_mov_b32_e32 v21, v3
-; GFX10-NEXT:    v_mov_b32_e32 v26, v4
-; GFX10-NEXT:    v_mov_b32_e32 v27, v5
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[22:23], v[8:9]
-; GFX10-NEXT:    v_mov_b32_e32 v24, v6
-; GFX10-NEXT:    v_mov_b32_e32 v25, v7
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT:    v_cmp_eq_u64_e64 s5, v[6:7], v[14:15]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[20:21], v[10:11]
-; GFX10-NEXT:    v_cmp_eq_u64_e64 s5, v[24:25], v[14:15]
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[10:11]
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[26:27], v[12:13]
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13]
 ; GFX10-NEXT:    v_and_b32_e32 v16, 1, v16
 ; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[24:25], v[14:15]
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15]
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v16
 ; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v22, v8
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v23, v9, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v8
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v18, v17, s5
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, v20, v10, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s4
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v21, v11, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v26, v12
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v4, v12
 ; GFX10-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v27, v13, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s4
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v6, vcc_lo, v24, v14, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s4
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, v8
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v7, vcc_lo, v25, v15, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s5
@@ -3630,7 +3618,7 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_and_b32 s1, s1, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s1, s[4:5], s[12:13]
-; GFX10-NEXT:    s_subb_u32 s30, s6, s14
+; GFX10-NEXT:    s_subb_u32 s10, s6, s14
 ; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_and_b32 s0, s0, 1
@@ -3656,7 +3644,7 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, s3, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, s8, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, s30, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, s10, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, s9, 0, vcc_lo
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v4
 ; GFX10-NEXT:    v_readfirstlane_b32 s5, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 921e0b35a8ae..5f28f31aff64 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -500,12 +500,12 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB2_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
 ; GFX1064-NEXT:    s_mov_b32 s3, s7
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_add_rtn_u32 v0, v7, v4
+; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v4
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  BB2_2:
@@ -551,11 +551,11 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB2_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_add_rtn_u32 v0, v7, v4
+; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  BB2_2:
@@ -1680,12 +1680,12 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB9_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
 ; GFX1064-NEXT:    s_mov_b32 s3, s7
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v7, v4
+; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v0, v4
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  BB9_2:
@@ -1731,11 +1731,11 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB9_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v7, v4
+; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v0, v4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  BB9_2:
@@ -2534,12 +2534,12 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB14_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
 ; GFX1064-NEXT:    s_mov_b32 s3, s7
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_and_rtn_b32 v0, v7, v4
+; GFX1064-NEXT:    ds_and_rtn_b32 v0, v0, v4
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  BB14_2:
@@ -2585,11 +2585,11 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB14_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_and_rtn_b32 v0, v7, v4
+; GFX1032-NEXT:    ds_and_rtn_b32 v0, v0, v4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  BB14_2:
@@ -2768,12 +2768,12 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB15_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
 ; GFX1064-NEXT:    s_mov_b32 s3, s7
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_or_rtn_b32 v0, v7, v4
+; GFX1064-NEXT:    ds_or_rtn_b32 v0, v0, v4
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  BB15_2:
@@ -2819,11 +2819,11 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB15_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_or_rtn_b32 v0, v7, v4
+; GFX1032-NEXT:    ds_or_rtn_b32 v0, v0, v4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  BB15_2:
@@ -3002,12 +3002,12 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB16_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
 ; GFX1064-NEXT:    s_mov_b32 s3, s7
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v7, v4
+; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v0, v4
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  BB16_2:
@@ -3053,11 +3053,11 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB16_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v7, v4
+; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v0, v4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  BB16_2:
@@ -3238,12 +3238,12 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB17_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
 ; GFX1064-NEXT:    s_mov_b32 s3, s7
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_max_rtn_i32 v0, v7, v4
+; GFX1064-NEXT:    ds_max_rtn_i32 v0, v0, v4
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  BB17_2:
@@ -3291,11 +3291,11 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB17_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_max_rtn_i32 v0, v7, v4
+; GFX1032-NEXT:    ds_max_rtn_i32 v0, v0, v4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  BB17_2:
@@ -3655,12 +3655,12 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB19_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
 ; GFX1064-NEXT:    s_mov_b32 s3, s7
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_min_rtn_i32 v0, v7, v4
+; GFX1064-NEXT:    ds_min_rtn_i32 v0, v0, v4
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  BB19_2:
@@ -3708,11 +3708,11 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB19_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_min_rtn_i32 v0, v7, v4
+; GFX1032-NEXT:    ds_min_rtn_i32 v0, v0, v4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  BB19_2:
@@ -4070,12 +4070,12 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB21_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
 ; GFX1064-NEXT:    s_mov_b32 s3, s7
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_max_rtn_u32 v0, v7, v4
+; GFX1064-NEXT:    ds_max_rtn_u32 v0, v0, v4
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  BB21_2:
@@ -4121,11 +4121,11 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB21_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_max_rtn_u32 v0, v7, v4
+; GFX1032-NEXT:    ds_max_rtn_u32 v0, v0, v4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  BB21_2:
@@ -4480,12 +4480,12 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB23_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
 ; GFX1064-NEXT:    s_mov_b32 s3, s7
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_min_rtn_u32 v0, v7, v4
+; GFX1064-NEXT:    ds_min_rtn_u32 v0, v0, v4
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  BB23_2:
@@ -4531,11 +4531,11 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB23_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32 at abs32@lo
+; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32 at abs32@lo
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_min_rtn_u32 v0, v7, v4
+; GFX1032-NEXT:    ds_min_rtn_u32 v0, v0, v4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  BB23_2:

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 2781993221e0..765a68198216 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -92,7 +92,7 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[28:29], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[10:11], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB0_3
 ; GFX1064-NEXT:  ; %bb.2:
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
@@ -101,7 +101,7 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i
 ; GFX1064-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
 ; GFX1064-NEXT:  BB0_3:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[28:29]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
@@ -328,14 +328,14 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in
 ; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[28:29], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[10:11], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB1_3
 ; GFX1064-NEXT:  ; %bb.2:
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX1064-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
 ; GFX1064-NEXT:  BB1_3:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[28:29]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index ab20b16624c0..8213a3700225 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -233,7 +233,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -243,7 +243,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v7, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid

diff  --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 27c5fa4b1d66..e0b30adc0627 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1121,7 +1121,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x5
 ; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:2
@@ -1144,9 +1144,9 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v7
-; GFX10-NEXT:    global_store_dword v11, v6, s[0:1] offset:24
-; GFX10-NEXT:    global_store_dwordx2 v11, v[4:5], s[0:1] offset:16
-; GFX10-NEXT:    global_store_dwordx4 v11, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dword v8, v6, s[0:1] offset:24
+; GFX10-NEXT:    global_store_dwordx2 v8, v[4:5], s[0:1] offset:16
+; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid

diff  --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
index a1c8e48917d0..9dcffcdb7ca1 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -90,8 +90,8 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ds_write_b32 v3, v2 offset:12
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_fmas_f32 v7, s0, s0, s0
-; GFX10-NEXT:    global_store_dword v[0:1], v7, off
+; GFX10-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v4, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm
 entry:
@@ -340,8 +340,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ds_write2_b32 v4, v2, v3 offset1:1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_fmas_f32 v7, s0, s0, s0
-; GFX10-NEXT:    global_store_dword v[0:1], v7, off
+; GFX10-NEXT:    v_div_fmas_f32 v5, s0, s0, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v5, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0

diff  --git a/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll b/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll
index 8c126869820b..1ec14d119dba 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll
@@ -20,9 +20,7 @@ define { double, double } @testfn(double %arg, double %arg1, double %arg2) {
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v7, v5
-; GFX10-NEXT:    v_mov_b32_e32 v6, v4
-; GFX10-NEXT:    v_add_f64 v[4:5], v[6:7], -v[0:1]
+; GFX10-NEXT:    v_add_f64 v[4:5], v[4:5], -v[0:1]
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[4:5], -v[2:3]
 ; GFX10-NEXT:    v_add_f64 v[2:3], -v[2:3], -v[4:5]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 39af8c11e6d2..e465320da0bd 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -970,11 +970,11 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT:    v_mov_b32_e32 v15, 0
+; GFX10-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v15, s[6:7]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v15, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v12, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v12, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1]
 ; GFX10-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
@@ -989,7 +989,7 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(
 ; GFX10-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
 ; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
 ; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; GFX10-NEXT:    global_store_dwordx2 v15, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v12, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
                       double addrspace(1)* %in2) #0 {
    %r0 = load double, double addrspace(1)* %in1, align 8
@@ -1141,10 +1141,10 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
 ; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
 ; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX10-NEXT:    v_fma_f64 v[14:15], v[6:7], v[4:5], v[4:5]
-; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[14:15]
+; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
 ; GFX10-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
-; GFX10-NEXT:    v_fma_f64 v[4:5], v[8:9], v[14:15], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
 ; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
 ; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
@@ -1299,10 +1299,10 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
 ; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
 ; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX10-NEXT:    v_fma_f64 v[14:15], v[6:7], v[4:5], v[4:5]
-; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[14:15]
+; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
 ; GFX10-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
-; GFX10-NEXT:    v_fma_f64 v[4:5], v[8:9], v[14:15], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
 ; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
 ; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
@@ -1893,49 +1893,49 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v11, s[6:7]
-; GFX10-NEXT:    global_load_dwordx2 v[15:16], v11, s[2:3] offset:32
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v15
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v3
 ; GFX10-NEXT:    v_rcp_f32_e32 v6, v6
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v7, v3
-; GFX10-NEXT:    v_rcp_f32_e32 v7, v7
 ; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v6
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v6, v5
-; GFX10-NEXT:    v_div_fixup_f16 v5, v6, v16, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_trunc_f16_e32 v5, v5
-; GFX10-NEXT:    v_fmac_f16_e64 v6, -v5, v16
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v4
-; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v7
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v7, v2
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v10, v5
+; GFX10-NEXT:    v_fmac_f16_e64 v6, -v5, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v7, v3
 ; GFX10-NEXT:    v_rcp_f32_e32 v7, v7
-; GFX10-NEXT:    v_div_fixup_f16 v5, v10, v3, v4
-; GFX10-NEXT:    v_trunc_f16_e32 v10, v5
-; GFX10-NEXT:    v_fmac_f16_e64 v4, -v10, v3
+; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v7
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
+; GFX10-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX10-NEXT:    v_fmac_f16_e64 v1, -v5, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX10-NEXT:    v_and_b32_e32 v5, v3, v6
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v15
-; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v5
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v2
+; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v5
 ; GFX10-NEXT:    v_rcp_f32_e32 v6, v6
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v15, v0
+; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v2, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_trunc_f16_e32 v5, v5
-; GFX10-NEXT:    v_fmac_f16_e64 v6, -v5, v15
+; GFX10-NEXT:    v_fmac_f16_e64 v6, -v5, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v0
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX10-NEXT:    v_rcp_f32_e32 v7, v7
 ; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v7
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v2, v0
@@ -1943,7 +1943,7 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
 ; GFX10-NEXT:    v_fmac_f16_e64 v0, -v5, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, v3, v6
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
-; GFX10-NEXT:    global_store_dwordx2 v11, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
                         <4 x half> addrspace(1)* %in2) #0 {
    %gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4
@@ -2161,11 +2161,11 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v11, s[6:7]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v11, s[2:3] offset:32
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v6, s0, v3, v3, v1
 ; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v1, v3, v1
@@ -2178,26 +2178,26 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
 ; GFX10-NEXT:    v_fma_f32 v8, v9, v7, v8
 ; GFX10-NEXT:    v_fma_f32 v5, -v6, v8, v5
 ; GFX10-NEXT:    s_denorm_mode 12
-; GFX10-NEXT:    v_div_fmas_f32 v6, v5, v7, v8
-; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v2, v0
-; GFX10-NEXT:    v_div_fixup_f32 v5, v6, v3, v1
-; GFX10-NEXT:    v_trunc_f32_e32 v6, v5
+; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
+; GFX10-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
+; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX10-NEXT:    v_fma_f32 v1, v3, -v5, v1
 ; GFX10-NEXT:    v_div_scale_f32 v5, s0, v2, v2, v0
-; GFX10-NEXT:    v_fma_f32 v1, v3, -v6, v1
+; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, v2, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v6, v5
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
 ; GFX10-NEXT:    v_fma_f32 v6, v7, v6, v6
-; GFX10-NEXT:    v_mul_f32_e32 v7, v4, v6
-; GFX10-NEXT:    v_fma_f32 v8, -v5, v7, v4
+; GFX10-NEXT:    v_mul_f32_e32 v7, v3, v6
+; GFX10-NEXT:    v_fma_f32 v8, -v5, v7, v3
 ; GFX10-NEXT:    v_fma_f32 v7, v8, v6, v7
-; GFX10-NEXT:    v_fma_f32 v5, -v5, v7, v4
+; GFX10-NEXT:    v_fma_f32 v3, -v5, v7, v3
 ; GFX10-NEXT:    s_denorm_mode 12
-; GFX10-NEXT:    v_div_fmas_f32 v3, v5, v6, v7
+; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
 ; GFX10-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
 ; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX10-NEXT:    v_fmac_f32_e64 v0, -v3, v2
-; GFX10-NEXT:    global_store_dwordx2 v11, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
                         <2 x float> addrspace(1)* %in2) #0 {
    %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4
@@ -2538,11 +2538,11 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[15:18], v8, s[6:7]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7]
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:64
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v10, s0, v7, v7, v18
-; GFX10-NEXT:    v_div_scale_f32 v9, vcc_lo, v18, v7, v18
+; GFX10-NEXT:    v_div_scale_f32 v10, s0, v7, v7, v3
+; GFX10-NEXT:    v_div_scale_f32 v9, vcc_lo, v3, v7, v3
 ; GFX10-NEXT:    v_rcp_f32_e32 v11, v10
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
@@ -2553,55 +2553,55 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; GFX10-NEXT:    v_fma_f32 v9, -v10, v12, v9
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
-; GFX10-NEXT:    v_div_scale_f32 v0, vcc_lo, v17, v6, v17
-; GFX10-NEXT:    v_div_fixup_f32 v9, v9, v7, v18
+; GFX10-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
 ; GFX10-NEXT:    v_trunc_f32_e32 v9, v9
-; GFX10-NEXT:    v_fma_f32 v18, v7, -v9, v18
-; GFX10-NEXT:    v_div_scale_f32 v9, s0, v6, v6, v17
+; GFX10-NEXT:    v_fma_f32 v3, v7, -v9, v3
+; GFX10-NEXT:    v_div_scale_f32 v9, s0, v6, v6, v2
+; GFX10-NEXT:    v_div_scale_f32 v7, vcc_lo, v2, v6, v2
 ; GFX10-NEXT:    v_rcp_f32_e32 v10, v9
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
 ; GFX10-NEXT:    v_fma_f32 v10, v11, v10, v10
-; GFX10-NEXT:    v_mul_f32_e32 v11, v0, v10
-; GFX10-NEXT:    v_fma_f32 v12, -v9, v11, v0
+; GFX10-NEXT:    v_mul_f32_e32 v11, v7, v10
+; GFX10-NEXT:    v_fma_f32 v12, -v9, v11, v7
 ; GFX10-NEXT:    v_fma_f32 v11, v12, v10, v11
-; GFX10-NEXT:    v_fma_f32 v1, -v9, v11, v0
+; GFX10-NEXT:    v_fma_f32 v7, -v9, v11, v7
 ; GFX10-NEXT:    s_denorm_mode 12
-; GFX10-NEXT:    v_div_fmas_f32 v7, v1, v10, v11
-; GFX10-NEXT:    v_div_fixup_f32 v7, v7, v6, v17
+; GFX10-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
+; GFX10-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
 ; GFX10-NEXT:    v_trunc_f32_e32 v7, v7
-; GFX10-NEXT:    v_fma_f32 v17, v6, -v7, v17
-; GFX10-NEXT:    v_div_scale_f32 v7, s0, v5, v5, v16
-; GFX10-NEXT:    v_div_scale_f32 v6, vcc_lo, v16, v5, v16
+; GFX10-NEXT:    v_fma_f32 v2, v6, -v7, v2
+; GFX10-NEXT:    v_div_scale_f32 v7, s0, v5, v5, v1
+; GFX10-NEXT:    v_div_scale_f32 v6, vcc_lo, v1, v5, v1
 ; GFX10-NEXT:    v_rcp_f32_e32 v9, v7
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
 ; GFX10-NEXT:    v_fma_f32 v9, v10, v9, v9
-; GFX10-NEXT:    v_mul_f32_e32 v0, v6, v9
-; GFX10-NEXT:    v_fma_f32 v11, -v7, v0, v6
-; GFX10-NEXT:    v_fma_f32 v0, v11, v9, v0
-; GFX10-NEXT:    v_fma_f32 v6, -v7, v0, v6
+; GFX10-NEXT:    v_mul_f32_e32 v10, v6, v9
+; GFX10-NEXT:    v_fma_f32 v11, -v7, v10, v6
+; GFX10-NEXT:    v_fma_f32 v10, v11, v9, v10
+; GFX10-NEXT:    v_fma_f32 v6, -v7, v10, v6
 ; GFX10-NEXT:    s_denorm_mode 12
-; GFX10-NEXT:    v_div_fmas_f32 v6, v6, v9, v0
-; GFX10-NEXT:    v_div_fixup_f32 v6, v6, v5, v16
+; GFX10-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
+; GFX10-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
 ; GFX10-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX10-NEXT:    v_fma_f32 v16, v5, -v6, v16
-; GFX10-NEXT:    v_div_scale_f32 v6, s0, v4, v4, v15
-; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v15, v4, v15
+; GFX10-NEXT:    v_fma_f32 v1, v5, -v6, v1
+; GFX10-NEXT:    v_div_scale_f32 v6, s0, v4, v4, v0
+; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v4, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
 ; GFX10-NEXT:    v_fma_f32 v7, v9, v7, v7
-; GFX10-NEXT:    v_mul_f32_e32 v0, v5, v7
-; GFX10-NEXT:    v_fma_f32 v10, -v6, v0, v5
-; GFX10-NEXT:    v_fma_f32 v0, v10, v7, v0
-; GFX10-NEXT:    v_fma_f32 v5, -v6, v0, v5
+; GFX10-NEXT:    v_mul_f32_e32 v9, v5, v7
+; GFX10-NEXT:    v_fma_f32 v10, -v6, v9, v5
+; GFX10-NEXT:    v_fma_f32 v9, v10, v7, v9
+; GFX10-NEXT:    v_fma_f32 v5, -v6, v9, v5
 ; GFX10-NEXT:    s_denorm_mode 12
-; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v7, v0
-; GFX10-NEXT:    v_div_fixup_f32 v5, v5, v4, v15
+; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
+; GFX10-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
 ; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX10-NEXT:    v_fmac_f32_e64 v15, -v5, v4
-; GFX10-NEXT:    global_store_dwordx4 v8, v[15:18], s[4:5]
+; GFX10-NEXT:    v_fmac_f32_e64 v0, -v5, v4
+; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
                         <4 x float> addrspace(1)* %in2) #0 {
    %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4
@@ -2842,34 +2842,34 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
-; GFX10-NEXT:    global_load_dwordx4 v[18:21], v16, s[2:3] offset:64
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v16, s[2:3] offset:64
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f64 v[8:9], s0, v[20:21], v[20:21], v[2:3]
-; GFX10-NEXT:    v_div_scale_f64 v[6:7], s0, v[18:19], v[18:19], v[0:1]
+; GFX10-NEXT:    v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3]
 ; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
 ; GFX10-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
 ; GFX10-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX10-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[20:21], v[2:3]
+; GFX10-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
 ; GFX10-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
 ; GFX10-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
 ; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
-; GFX10-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[20:21], v[2:3]
+; GFX10-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
 ; GFX10-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[20:21], v[2:3]
+; GFX10-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
+; GFX10-NEXT:    v_div_scale_f64 v[6:7], s0, v[4:5], v[4:5], v[0:1]
 ; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
 ; GFX10-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
 ; GFX10-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; GFX10-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[18:19], v[0:1]
+; GFX10-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
 ; GFX10-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
 ; GFX10-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
 ; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
-; GFX10-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[18:19], v[0:1]
+; GFX10-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
 ; GFX10-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
-; GFX10-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[18:19], v[0:1]
+; GFX10-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
 ; GFX10-NEXT:    global_store_dwordx4 v16, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
                         <2 x double> addrspace(1)* %in2) #0 {

diff  --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 550fa502b1bf..2648fde7b6c9 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -843,31 +843,31 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX10-NEXT:    v_and_b32_e32 v8, 15, v8
 ; GFX10-NEXT:    v_and_b32_e32 v9, 15, v6
 ; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v6
-; GFX10-NEXT:    v_and_b32_e32 v15, 15, v8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
 ; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
+; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
 ; GFX10-NEXT:    v_lshlrev_b16 v10, 1, v10
-; GFX10-NEXT:    v_and_b32_e32 v19, 15, v6
-; GFX10-NEXT:    v_lshlrev_b16 v0, v15, v0
+; GFX10-NEXT:    v_and_b32_e32 v6, 15, v6
 ; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v5
 ; GFX10-NEXT:    v_lshrrev_b16 v4, v9, v7
-; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
-; GFX10-NEXT:    v_lshlrev_b16 v6, v19, v10
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_b32_e32 v7, 15, v11
 ; GFX10-NEXT:    v_and_b32_e32 v2, 15, v5
-; GFX10-NEXT:    v_or_b32_e32 v11, v6, v4
+; GFX10-NEXT:    v_lshlrev_b16 v6, v6, v10
+; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
+; GFX10-NEXT:    v_and_b32_e32 v7, 15, v11
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_lshlrev_b16 v1, v7, v1
 ; GFX10-NEXT:    v_lshrrev_b16 v2, v2, v3
-; GFX10-NEXT:    v_lshl_or_b32 v0, v11, 16, v0
+; GFX10-NEXT:    v_or_b32_e32 v4, v6, v4
+; GFX10-NEXT:    v_lshlrev_b16 v1, v7, v1
+; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
@@ -1005,28 +1005,28 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b16 v11, 1, v11
 ; GFX10-NEXT:    v_lshlrev_b16 v7, v9, v8
+; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v4
 ; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v10
 ; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v5
-; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v4
 ; GFX10-NEXT:    v_and_b32_e32 v4, 15, v4
 ; GFX10-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX10-NEXT:    v_and_b32_e32 v8, 15, v8
 ; GFX10-NEXT:    v_and_b32_e32 v9, 15, v9
 ; GFX10-NEXT:    v_and_b32_e32 v10, 15, v10
-; GFX10-NEXT:    v_and_b32_e32 v15, 15, v8
 ; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
 ; GFX10-NEXT:    v_lshrrev_b16 v3, v5, v3
+; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
 ; GFX10-NEXT:    v_lshrrev_b16 v4, v13, v12
 ; GFX10-NEXT:    v_lshlrev_b16 v1, v10, v1
-; GFX10-NEXT:    v_lshlrev_b16 v0, v15, v0
 ; GFX10-NEXT:    v_lshlrev_b16 v5, v9, v11
-; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_or_b32_e32 v3, v7, v6
-; GFX10-NEXT:    v_or_b32_e32 v7, v5, v4
+; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, v2, v1
-; GFX10-NEXT:    v_lshl_or_b32 v0, v7, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
@@ -1085,9 +1085,9 @@ define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
 ; GFX10-NEXT:    v_not_b32_e32 v5, v4
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX10-NEXT:    v_and_b32_e32 v4, 63, v4
-; GFX10-NEXT:    v_and_b32_e32 v7, 63, v5
+; GFX10-NEXT:    v_and_b32_e32 v5, 63, v5
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1172,18 +1172,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2
 ; GFX10-NEXT:    v_not_b32_e32 v11, v10
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT:    v_and_b32_e32 v15, 63, v8
-; GFX10-NEXT:    v_and_b32_e32 v19, 63, v9
-; GFX10-NEXT:    v_and_b32_e32 v9, 63, v10
-; GFX10-NEXT:    v_and_b32_e32 v13, 63, v11
-; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v15, v[4:5]
-; GFX10-NEXT:    v_lshlrev_b64 v[11:12], v19, v[0:1]
-; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v9, v[6:7]
-; GFX10-NEXT:    v_lshlrev_b64 v[15:16], v13, v[2:3]
-; GFX10-NEXT:    v_or_b32_e32 v0, v11, v4
-; GFX10-NEXT:    v_or_b32_e32 v1, v12, v5
-; GFX10-NEXT:    v_or_b32_e32 v2, v15, v6
-; GFX10-NEXT:    v_or_b32_e32 v3, v16, v7
+; GFX10-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX10-NEXT:    v_and_b32_e32 v9, 63, v9
+; GFX10-NEXT:    v_and_b32_e32 v10, 63, v10
+; GFX10-NEXT:    v_and_b32_e32 v11, 63, v11
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v10, v[6:7]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX10-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX10-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
   ret <2 x i64> %ret
@@ -1331,10 +1331,10 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; GFX10-NEXT:    v_mul_lo_u32 v7, v7, 24
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v5, v5, v7
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, 8, v4
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, 8, v5
-; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v7
-; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, 8, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, 8, v5
+; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
+; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
   ret <2 x i24> %ret

diff  --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 2d5588379947..32e4f58df884 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -466,8 +466,8 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    v_mad_f32 v7, -v2, v0, v7
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX10-NEXT:    v_cmp_ge_f32_e64 s0, |v7|, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v7, s0, 0, v2, s0
-; GFX10-NEXT:    global_store_short v[5:6], v7, off
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, 0, v2, s0
+; GFX10-NEXT:    global_store_short v[5:6], v2, off
 ; GFX10-NEXT:    s_cbranch_vccz BB4_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
@@ -546,16 +546,16 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    v_mul_f32_e32 v8, v7, v1
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, s2, v5
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
-; GFX10-NEXT:    v_trunc_f32_e32 v10, v8
-; GFX10-NEXT:    v_mad_f32 v7, -v10, v0, v7
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v8, v10
+; GFX10-NEXT:    v_trunc_f32_e32 v8, v8
+; GFX10-NEXT:    v_mad_f32 v7, -v8, v0, v7
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v7|, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
 ; GFX10-NEXT:    v_mul_lo_u32 v7, v7, s4
 ; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v7, v2, v7
-; GFX10-NEXT:    global_store_short v[5:6], v7, off
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v7
+; GFX10-NEXT:    global_store_short v[5:6], v2, off
 ; GFX10-NEXT:    s_cbranch_vccz BB5_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
@@ -646,8 +646,8 @@ define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX10-NEXT:    v_cmp_ge_f32_e64 s1, |v7|, |v0|
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, v8, s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v2, v7
-; GFX10-NEXT:    global_store_short v[5:6], v7, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v7
+; GFX10-NEXT:    global_store_short v[5:6], v2, off
 ; GFX10-NEXT:    s_cbranch_vccz BB6_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
@@ -725,14 +725,14 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    v_bfe_i32 v7, v4, 0, 16
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v4
 ; GFX10-NEXT:    v_add_nc_u16 v4, v4, 1
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v11, v7
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v5, v7
 ; GFX10-NEXT:    v_xor_b32_e32 v6, s1, v7
-; GFX10-NEXT:    v_mul_f32_e32 v8, v11, v1
+; GFX10-NEXT:    v_mul_f32_e32 v8, v5, v1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 30, v6
-; GFX10-NEXT:    v_trunc_f32_e32 v10, v8
+; GFX10-NEXT:    v_trunc_f32_e32 v8, v8
 ; GFX10-NEXT:    v_or_b32_e32 v6, 1, v6
-; GFX10-NEXT:    v_mad_f32 v5, -v10, v0, v11
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v8, v10
+; GFX10-NEXT:    v_mad_f32 v5, -v8, v0, v5
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v8, v8
 ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v5|, |v0|
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0, v6, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 1, v[2:3]
@@ -742,8 +742,8 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v2, s1
-; GFX10-NEXT:    v_sub_nc_u32_e32 v7, v7, v2
-; GFX10-NEXT:    global_store_short v[5:6], v7, off
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v7, v2
+; GFX10-NEXT:    global_store_short v[5:6], v2, off
 ; GFX10-NEXT:    s_cbranch_vccz BB7_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index 36951b7f5929..7a3fea9b85d9 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -2732,11 +2732,11 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v0, v7, v5, v4
+; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index e3b5f81b19c3..6f44f2aa7080 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -341,21 +341,21 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
 ; GFX10-DL-NEXT:    v_bfe_i32 v4, v1, 0, 8
-; GFX10-DL-NEXT:    v_bfe_i32 v10, v2, 0, 8
+; GFX10-DL-NEXT:    v_bfe_i32 v7, v2, 0, 8
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
 ; GFX10-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
 ; GFX10-DL-NEXT:    v_bfe_i32 v6, v6, 0, 8
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v10, v3
+; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX10-DL-NEXT:    v_bfe_i32 v4, v8, 0, 8
-; GFX10-DL-NEXT:    v_bfe_i32 v10, v9, 0, 8
+; GFX10-DL-NEXT:    v_bfe_i32 v7, v9, 0, 8
 ; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
 ; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX10-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v10, v3
+; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
 ; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
@@ -534,7 +534,7 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v11, 8, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -543,7 +543,7 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v4
+; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
@@ -718,14 +718,14 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_bfe_i32 v0, v1, 0, 8
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_bfe_i32 v3, v2, 0, 8
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v7, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX10-DL-NEXT:    v_mul_i32_i24_e32 v5, v0, v3
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v0, v3, s2
 ; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
 ; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v7, v0, v5
+; GFX10-DL-NEXT:    v_add3_u32 v0, v4, v0, v5
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
@@ -908,13 +908,13 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_lshrrev_b16 v0, 8, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_lshrrev_b16 v3, 8, v2
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v7, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, sext(v0), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
 ; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_add3_u32 v0, v7, s2, v0
+; GFX10-DL-NEXT:    v_add3_u32 v0, v4, s2, v0
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 5747d4f437b6..ad5a0a5bd65f 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -327,17 +327,17 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v4, s0, v1
-; GFX10-DL-NEXT:    v_and_b32_e32 v10, s0, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, s0, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v5, s0, v5
 ; GFX10-DL-NEXT:    v_and_b32_e32 v6, s0, v6
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v10, v3
+; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
 ; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
-; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v10, v3
+; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
 ; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
@@ -517,7 +517,7 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v11, 8, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -526,7 +526,7 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v4
+; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
@@ -841,7 +841,7 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v11, 8, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -850,7 +850,7 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v11, v4
+; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v0, v4
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v7, v6, v0
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
@@ -1025,17 +1025,17 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v7, v4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v0, v4
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v7, v0
+; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v4, v0
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
@@ -1215,14 +1215,14 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_and_b32_e32 v0, s3, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_and_b32_e32 v3, s3, v2
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v5, v0, v3
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
 ; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
 ; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v7, v0, v5
+; GFX10-DL-NEXT:    v_add3_u32 v0, v4, v0, v5
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
@@ -1412,11 +1412,11 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_bfe_u32 v3, v2, 8, 8
 ; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v6, v0, v3, s2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
 ; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
 ; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, s2, v6
-; GFX10-DL-NEXT:    v_add3_u32 v0, v6, v4, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, s2, v0
+; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v4, v3
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v2
 ; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
@@ -1622,7 +1622,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX10-DL-NEXT:    v_bfe_i32 v6, v1, 0, 8
-; GFX10-DL-NEXT:    v_bfe_i32 v9, v2, 0, 8
+; GFX10-DL-NEXT:    v_bfe_i32 v7, v2, 0, 8
 ; GFX10-DL-NEXT:    v_and_b32_e32 v4, s0, v4
 ; GFX10-DL-NEXT:    v_and_b32_e32 v5, s0, v5
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -1631,7 +1631,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_mad_u16 v3, v6, v9, v3
+; GFX10-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
 ; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
 ; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
@@ -1809,13 +1809,13 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_and_b32_sdwa v0, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_and_b32_sdwa v3, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v0, v0, v3
 ; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
 ; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_add3_u32 v0, v7, s2, v0
+; GFX10-DL-NEXT:    v_add3_u32 v0, v4, s2, v0
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
@@ -2230,7 +2230,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_lshrrev_b16 v5, 8, v1
 ; GFX10-DL-NEXT:    v_mul_lo_u16 v9, v6, v7
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v3, v1, v2, v3
+; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
 ; GFX10-DL-NEXT:    v_lshlrev_b16 v4, 8, v4
 ; GFX10-DL-NEXT:    v_mul_lo_u16 v5, v5, v8
 ; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -2239,7 +2239,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v4
 ; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
-; GFX10-DL-NEXT:    v_add_nc_u16 v1, v3, v5
+; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v5
 ; GFX10-DL-NEXT:    v_mad_u16 v1, v6, v7, v1
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v2
 ; GFX10-DL-NEXT:    global_store_byte v0, v1, s[0:1]

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index 84a21ad4be4e..d0cde94b098c 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -644,26 +644,26 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
 ; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v2
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v18, 12, v1
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v18, v17, v3
+; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
@@ -672,13 +672,13 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v15, 12, v9
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v3, v2, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v7
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
-; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v8, v15, v1
+; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v8, v9, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
@@ -686,13 +686,13 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v5
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v6, v7, v1
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v5
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
-; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v4, v7, v1
+; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v4, v5, v1
 ; GFX10-DL-XNACK-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DL-XNACK-NEXT:    s_endpgm
 ;
@@ -722,55 +722,55 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v18, 12, v1
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v18, v17, v3
+; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v8
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v3, v0, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v7
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
-; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v15, v9, v0
+; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v8, v9, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v5
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v6, v7, v0
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
-; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v7, v5, v0
+; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v4, v5, v0
 ; GFX10-DL-NOXNACK-NEXT:    global_store_short v2, v0, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT:    s_endpgm
 ; GFX10-DL-LABEL: idot8_acc16:
@@ -1218,26 +1218,26 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
 ; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v2
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v18, 12, v1
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v18, v17, v3
+; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
@@ -1246,13 +1246,13 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v15, 12, v9
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v3, v2, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v7
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
-; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v8, v15, v1
+; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v8, v9, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
@@ -1260,13 +1260,13 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v5
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v6, v7, v1
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v5
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
-; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v4, v7, v1
+; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v4, v5, v1
 ; GFX10-DL-XNACK-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-DL-XNACK-NEXT:    s_endpgm
 ;
@@ -1296,55 +1296,55 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v18, 12, v1
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v18, v17, v3
+; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v8
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v3, v0, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v7
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
-; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v15, v9, v0
+; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v8, v9, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v5
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v6, v7, v0
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
-; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v7, v5, v0
+; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v4, v5, v0
 ; GFX10-DL-NOXNACK-NEXT:    global_store_byte v2, v0, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT:    s_endpgm
 ; GFX10-DL-LABEL: idot8_acc8:
@@ -1713,25 +1713,25 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
 ; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-XNACK-NEXT:    v_mad_i32_i24 v5, v0, v7, s2
-; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v15, v1, 16, 4
+; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v6, v1, 16, 4
 ; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v10, v2, 16, 4
 ; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v11, v1, 20, 4
 ; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v12, v2, 20, 4
-; GFX10-DL-XNACK-NEXT:    v_mad_i32_i24 v14, v0, v7, v5
+; GFX10-DL-XNACK-NEXT:    v_mad_i32_i24 v0, v0, v7, v5
 ; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v7, v1, 24, 4
 ; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v13, v2, 24, 4
-; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v6, v15, v10
 ; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v8, v8, v9
-; GFX10-DL-XNACK-NEXT:    v_add3_u32 v15, v14, v3, v4
+; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v6, v6, v10
+; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v3, v4
 ; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v3, v11, v12
 ; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v4, v7, v13
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
-; GFX10-DL-XNACK-NEXT:    v_add3_u32 v6, v15, v8, v6
-; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v7, v1, v2
-; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v6, v3, v4
+; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v8, v6
+; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
+; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v3, v4
 ; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v7, v5
+; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v1, v5
 ; GFX10-DL-XNACK-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-XNACK-NEXT:    s_endpgm
 ;
@@ -1765,25 +1765,25 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_i32_i24 v5, v2, v7, s2
-; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v15, v1, 16, 4
+; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v6, v1, 16, 4
 ; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v10, v0, 16, 4
 ; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v11, v1, 20, 4
 ; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v12, v0, 20, 4
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_i32_i24 v2, v2, v7, v5
 ; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v7, v1, 24, 4
 ; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v13, v0, 24, 4
-; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v6, v15, v10
 ; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v8, v8, v9
-; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v15, v2, v3, v4
+; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v6, v6, v10
+; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v2, v2, v3, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v3, v11, v12
 ; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v4, v7, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
-; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v2, v15, v8, v6
+; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v2, v2, v8, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v0, v1, v0
-; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v3, v2, v3, v4
+; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v1, v2, v3, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v0, v3, v0, v5
+; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v0, v1, v0, v5
 ; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT:    s_endpgm
 ; GFX10-DL-LABEL: idot8_multiuses_mul1:
@@ -2550,7 +2550,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v1, v1, 16, v11
 ; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v7, v4, v7
 ; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v11, v16, 16, v13
-; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v19, v2, 24, 4
+; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v12, v2, 24, 4
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 28, v2
 ; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v15, v2, 16, 4
@@ -2577,7 +2577,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v3, v4, v5
 ; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1]
-; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v4, v4, v19
+; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v4, v4, v12
 ; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v2, v9, v2
 ; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v10
 ; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
@@ -2592,9 +2592,9 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v7, v1, v4
+; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v4
 ; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v2, v2, v3
-; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v7, v5
+; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v5
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v2
 ; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v3
@@ -2638,7 +2638,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v1, v1, 16, v11
 ; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v7, v4, v7
 ; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v11, v16, 16, v13
-; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v19, v0, 24, 4
+; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v12, v0, 24, 4
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 28, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v15, v0, 16, 4
@@ -2665,7 +2665,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v3, v4, v5
 ; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1]
-; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v4, v4, v19
+; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v4, v4, v12
 ; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v0, v9, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v1, v1, v10
 ; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
@@ -2676,11 +2676,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v1, 12, v3 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v4, v7, v5
-; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v7, v0, v6
+; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v7, v4
+; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
 ; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v5
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
@@ -3196,7 +3196,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v19, 0
+; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
@@ -3207,7 +3207,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    s_clause 0x1
 ; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX10-DL-XNACK-NEXT:    global_load_ubyte v3, v19, s[0:1]
+; GFX10-DL-XNACK-NEXT:    global_load_ubyte v3, v4, s[0:1]
 ; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
 ; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
@@ -3250,8 +3250,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v9, v9, v16
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
-; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
+; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
@@ -3262,13 +3262,13 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
 ; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v9, v0, v11
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v23, 12, v12
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v12, 12, v12
 ; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v11, v7, v14
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 8, v6
 ; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v1, v1, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 8, v10
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
-; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v2, v5, v23
+; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v2, v5, v12
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 8, v9
 ; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v6, v11, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -3284,12 +3284,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v9, v8
 ; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX10-DL-XNACK-NEXT:    v_mad_u16 v0, v5, v23, v0
+; GFX10-DL-XNACK-NEXT:    v_mad_u16 v0, v5, v12, v0
 ; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v6
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v0, v7, v14, v0
 ; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX10-DL-XNACK-NEXT:    global_store_byte v19, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT:    global_store_byte v4, v0, s[0:1]
 ; GFX10-DL-XNACK-NEXT:    s_endpgm
 ;
 ; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul:
@@ -3297,7 +3297,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v19, 0
+; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
@@ -3308,7 +3308,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
 ; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT:    global_load_ubyte v2, v19, s[0:1]
+; GFX10-DL-NOXNACK-NEXT:    global_load_ubyte v2, v4, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
@@ -3347,7 +3347,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
-; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v23, v9, v0
+; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v0, v9, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v17
@@ -3360,7 +3360,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v10, v10, v15
-; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v3, v3, v9
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v11, 12, v12
@@ -3390,7 +3390,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v7, v14, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX10-DL-NOXNACK-NEXT:    global_store_byte v19, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT:    global_store_byte v4, v0, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT:    s_endpgm
 ; GFX10-DL-LABEL: idot8_acc8_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index aa8fc5513980..d3bb2a4981de 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -494,31 +494,31 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_and_b32_e32 v11, 15, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v4
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 8, 4
+; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
+; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 16, 4
+; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 24, 4
+; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 24, 4
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v0
+; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
@@ -812,31 +812,31 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_and_b32_e32 v11, 15, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v4
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 8, 4
+; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
+; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 16, 4
+; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 24, 4
+; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 24, 4
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v0
+; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
@@ -1134,31 +1134,31 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_and_b32_e32 v11, 15, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v4
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 8, 4
+; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
+; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 16, 4
+; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v0
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v3
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v0
+; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
@@ -1441,31 +1441,31 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_and_b32_e32 v11, 15, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v4
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 8, 4
+; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
+; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 16, 4
+; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v0
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v3
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v11, v5, v0
+; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
@@ -2373,49 +2373,49 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v2
 ; GFX10-DL-NEXT:    v_bfe_u32 v9, v1, 4, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v15, v2, 4, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v13, v2, 8, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 4, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v12, v1, 8, 4
 ; GFX10-DL-NEXT:    v_and_b32_e32 v7, v4, v7
 ; GFX10-DL-NEXT:    v_and_b32_e32 v6, v4, v6
-; GFX10-DL-NEXT:    v_bfe_u32 v19, v1, 8, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 12, 4
-; GFX10-DL-NEXT:    v_and_b32_e32 v13, v4, v13
+; GFX10-DL-NEXT:    v_bfe_u32 v13, v2, 8, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v11, v1, 16, 4
+; GFX10-DL-NEXT:    v_and_b32_e32 v12, v4, v12
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v7, v9, 16, v7
-; GFX10-DL-NEXT:    v_lshl_or_b32 v6, v15, 16, v6
+; GFX10-DL-NEXT:    v_lshl_or_b32 v6, v10, 16, v6
 ; GFX10-DL-NEXT:    v_bfe_u32 v9, v1, 12, 4
-; GFX10-DL-NEXT:    v_and_b32_e32 v12, v4, v19
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v1, 16, 4
-; GFX10-DL-NEXT:    v_lshl_or_b32 v10, v10, 16, v13
+; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 12, 4
+; GFX10-DL-NEXT:    v_and_b32_e32 v13, v4, v13
+; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 24, 4
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v6, v7, v6
 ; GFX10-DL-NEXT:    v_bfe_u32 v7, v2, 16, 4
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v9, v9, 16, v12
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 24, 4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v23, 28, v1
+; GFX10-DL-NEXT:    v_lshl_or_b32 v10, v10, 16, v13
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_add_nc_u16 v3, v6, v3
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v9, v9, v10
 ; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 20, 4
 ; GFX10-DL-NEXT:    v_and_b32_e32 v11, v4, v11
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v9, v9, v10
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
-; GFX10-DL-NEXT:    v_and_b32_e32 v7, v4, v7
 ; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v12
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, v4, v7
 ; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 24, 4
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v1, 16, v11
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v9
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v9
-; GFX10-DL-NEXT:    v_add_nc_u16 v14, v3, v9
 ; GFX10-DL-NEXT:    v_and_b32_e32 v9, v4, v10
 ; GFX10-DL-NEXT:    v_and_b32_e32 v4, v4, v5
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v6
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v14, v7
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v7
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v9
-; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v23, 16, v4
+; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v8, 16, v4
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v1
+; GFX10-DL-NEXT:    v_add_nc_u16 v1, v3, v1
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v4, v2
-; GFX10-DL-NEXT:    v_add_nc_u16 v1, v3, v5
+; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v5
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v2
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
@@ -2762,7 +2762,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    v_mov_b32_e32 v19, 0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s10, -1
@@ -2773,7 +2773,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT:    global_load_ubyte v3, v19, s[0:1]
+; GFX10-DL-NEXT:    global_load_ubyte v3, v4, s[0:1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-DL-NEXT:    v_bfe_u32 v9, v1, 12, 4
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
@@ -2794,7 +2794,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mul_lo_u16 v7, v7, v14
 ; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 20, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v13, v2, 24, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v23, v2, 16, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v12, v2, 16, 4
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-DL-NEXT:    v_mul_lo_u16 v1, v1, v15
 ; GFX10-DL-NEXT:    v_or_b32_e32 v8, v8, v9
@@ -2804,7 +2804,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mul_lo_u16 v2, v11, v2
 ; GFX10-DL-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
-; GFX10-DL-NEXT:    v_mul_lo_u16 v11, v5, v23
+; GFX10-DL-NEXT:    v_mul_lo_u16 v11, v5, v12
 ; GFX10-DL-NEXT:    v_or_b32_e32 v7, v10, v7
 ; GFX10-DL-NEXT:    v_lshlrev_b16 v9, 8, v9
 ; GFX10-DL-NEXT:    v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -2820,12 +2820,12 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX10-DL-NEXT:    v_add_nc_u16 v0, v9, v8
 ; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v23, v0
+; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v12, v0
 ; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v7
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v13, v0
 ; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX10-DL-NEXT:    global_store_byte v19, v0, s[0:1]
+; GFX10-DL-NEXT:    global_store_byte v4, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
                                              <8 x i4> addrspace(1)* %src2,
                                              i8 addrspace(1)* nocapture %dst) {
@@ -3115,7 +3115,6 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 4, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v7, v2, 4, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v8, v2, 8, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v1, 24, 4
 ; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v4, v4, v5
 ; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 8, 4
 ; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v6, v6, v7
@@ -3133,12 +3132,13 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 20, 4
 ; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v6, v6, v8
 ; GFX10-DL-NEXT:    v_bfe_u32 v8, v2, 24, 4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
 ; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v4
+; GFX10-DL-NEXT:    v_bfe_u32 v4, v1, 24, 4
 ; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v5, v5, v7
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
-; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v4, v11, v8
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
 ; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v6
+; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v4, v4, v8
 ; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
 ; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v5
 ; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
index dedda14bf8d5..b4d0399831bb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -448,22 +448,22 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspa
 ;
 ; GFX10-LABEL: load_3d_tfe_lwe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; encoding: [0x10,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
+; GFX10-NEXT:    global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -579,22 +579,22 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace
 ;
 ; GFX10-LABEL: load_cube_lwe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; encoding: [0x18,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
+; GFX10-NEXT:    global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -837,22 +837,22 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrsp
 ;
 ; GFX10-LABEL: load_2darray_lwe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; encoding: [0x28,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
+; GFX10-NEXT:    global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -968,22 +968,22 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrsp
 ;
 ; GFX10-LABEL: load_2dmsaa_both:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x30,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
+; GFX10-NEXT:    global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -1361,22 +1361,22 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspa
 ;
 ; GFX10-LABEL: load_mip_2d_tfe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x05,0xf0,0x05,0x00,0x00,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
+; GFX10-NEXT:    global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
index 0e0ea50e96e7..e39324874b33 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
@@ -566,10 +566,10 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ;
 ; GFX10-LABEL: sample_d_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v4, v7, v4
-; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v4, v6, v4
+; GFX10-NEXT:    v_and_b32_e32 v2, v6, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, v6, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v1, 16, v0
@@ -650,14 +650,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ;
 ; GFX10-LABEL: sample_c_d_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v10, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v5, v10, v5
-; GFX10-NEXT:    v_and_b32_e32 v3, v10, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v10, v1
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v5, v7, v5
+; GFX10-NEXT:    v_and_b32_e32 v3, v7, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, v7, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 16, v1
-; GFX10-NEXT:    image_sample_c_d v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT:    image_sample_c_d v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -707,9 +707,9 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT:    v_lshl_or_b32 v5, v3, 16, v2
-; GFX10-NEXT:    v_lshl_or_b32 v3, v1, 16, v0
-; GFX10-NEXT:    image_sample_d_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT:    image_sample_d_cl v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -762,8 +762,8 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT:    v_and_b32_e32 v1, v8, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 16, v1
-; GFX10-NEXT:    image_sample_c_d_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT:    image_sample_c_d_cl v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -804,10 +804,10 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ;
 ; GFX10-LABEL: sample_cd_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v4, v7, v4
-; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v4, v6, v4
+; GFX10-NEXT:    v_and_b32_e32 v2, v6, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, v6, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v1, 16, v0
@@ -854,14 +854,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ;
 ; GFX10-LABEL: sample_c_cd_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v10, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v5, v10, v5
-; GFX10-NEXT:    v_and_b32_e32 v3, v10, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v10, v1
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v5, v7, v5
+; GFX10-NEXT:    v_and_b32_e32 v3, v7, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, v7, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 16, v1
-; GFX10-NEXT:    image_sample_c_cd v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT:    image_sample_c_cd v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -911,9 +911,9 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT:    v_lshl_or_b32 v5, v3, 16, v2
-; GFX10-NEXT:    v_lshl_or_b32 v3, v1, 16, v0
-; GFX10-NEXT:    image_sample_cd_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT:    image_sample_cd_cl v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -966,8 +966,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT:    v_and_b32_e32 v1, v8, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 16, v1
-; GFX10-NEXT:    image_sample_c_cd_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT:    image_sample_c_cd_cl v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -1162,8 +1162,8 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT:    v_and_b32_e32 v2, v9, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
-; GFX10-NEXT:    image_sample_c_d_o v0, [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT:    image_sample_c_d_o v0, [v0, v1, v2, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -1196,8 +1196,8 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX10-NEXT:    v_and_b32_e32 v2, v9, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
-; GFX10-NEXT:    image_sample_c_d_o v[0:1], [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT:    image_sample_c_d_o v[0:1], [v0, v1, v2, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
index 47765caa4090..90050524088d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
@@ -96,13 +96,13 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32>
 ;
 ; GFX10-LABEL: image_sample_2d_f16_tfe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_mov_b32 s28, exec_lo
+; GFX10-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v5
-; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10-NEXT:    image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
index f75fe13e49d0..6e9daac16d83 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -79,7 +79,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr
 ;
 ; GFX10-LABEL: sample_1d_tfe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe]
+; GFX10-NEXT:    s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe]
 ; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
@@ -92,7 +92,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e]
-; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87]
+; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87]
 ; GFX10-NEXT:    image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0f,0x81,0xf0,0x05,0x00,0x40,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00]
@@ -499,7 +499,7 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr
 ;
 ; GFX10-LABEL: sample_1d_lwe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe]
+; GFX10-NEXT:    s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe]
 ; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
@@ -512,7 +512,7 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e]
-; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87]
+; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87]
 ; GFX10-NEXT:    image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; encoding: [0x00,0x0f,0x82,0xf0,0x05,0x00,0x40,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
index b4f89e5d9ba0..6a3248e50ee9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
@@ -15,12 +15,12 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_d_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36]
+; GFX10-NEXT:    v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36]
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
-; GFX10-NEXT:    v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
-; GFX10-NEXT:    image_sample_d_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
+; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
+; GFX10-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -58,9 +58,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_c_d_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v10, 0xffff ; encoding: [0xff,0x02,0x14,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v3, v10, v3 ; encoding: [0x0a,0x07,0x06,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v1, v10, v1 ; encoding: [0x0a,0x03,0x02,0x36]
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36]
+; GFX10-NEXT:    v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36]
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04]
 ; GFX10-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06]
@@ -89,8 +89,8 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
 ; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
-; GFX10-NEXT:    v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
-; GFX10-NEXT:    image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
+; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
+; GFX10-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -139,12 +139,12 @@ main_body:
 define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_cd_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36]
+; GFX10-NEXT:    v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36]
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
-; GFX10-NEXT:    v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
-; GFX10-NEXT:    image_sample_cd_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
+; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
+; GFX10-NEXT:    image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -166,9 +166,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_c_cd_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v10, 0xffff ; encoding: [0xff,0x02,0x14,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT:    v_and_b32_e32 v3, v10, v3 ; encoding: [0x0a,0x07,0x06,0x36]
-; GFX10-NEXT:    v_and_b32_e32 v1, v10, v1 ; encoding: [0x0a,0x03,0x02,0x36]
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT:    v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36]
+; GFX10-NEXT:    v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36]
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04]
 ; GFX10-NEXT:    image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06]
@@ -197,8 +197,8 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
 ; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
-; GFX10-NEXT:    v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
-; GFX10-NEXT:    image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
+; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
+; GFX10-NEXT:    image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
index 002d4e69ad26..7c20bc69189f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
@@ -15,12 +15,12 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_d_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v2, v6, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, v6, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT:    v_lshl_or_b32 v3, v1, 16, v0
-; GFX10-NEXT:    image_sample_d_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -58,9 +58,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_c_d_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v10, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v3, v10, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v10, v1
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v3, v7, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, v7, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@@ -89,8 +89,8 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT:    v_lshl_or_b32 v3, v1, 16, v0
-; GFX10-NEXT:    image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -139,12 +139,12 @@ main_body:
 define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_cd_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v2, v6, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, v6, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT:    v_lshl_or_b32 v3, v1, 16, v0
-; GFX10-NEXT:    image_sample_cd_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT:    image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -166,9 +166,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_c_cd_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v10, 0xffff
-; GFX10-NEXT:    v_and_b32_e32 v3, v10, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v10, v1
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT:    v_and_b32_e32 v3, v7, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, v7, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10-NEXT:    image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@@ -197,8 +197,8 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10-NEXT:    v_and_b32_e32 v2, v7, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, v7, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT:    v_lshl_or_b32 v3, v1, 16, v0
-; GFX10-NEXT:    image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT:    image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
index 9edd1a397b78..e88b70fb449a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -356,8 +356,8 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB3_4
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote
-; GFX10-32-NEXT:    s_wqm_b32 s28, s12
-; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10-32-NEXT:    s_wqm_b32 s14, s12
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10-32-NEXT:  BB3_3: ; %.continue
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
 ; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
@@ -379,7 +379,7 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-64-NEXT:    s_wqm_b64 exec, exec
 ; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
-; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
 ; GFX10-64-NEXT:    s_cbranch_execz BB3_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
@@ -388,7 +388,7 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
 ; GFX10-64-NEXT:  BB3_3: ; %.continue
-; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
@@ -492,8 +492,8 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB4_4
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote
-; GFX10-32-NEXT:    s_wqm_b32 s28, s12
-; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10-32-NEXT:    s_wqm_b32 s14, s12
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10-32-NEXT:  BB4_3: ; %.continue
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
 ; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
@@ -515,7 +515,7 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
-; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
 ; GFX10-64-NEXT:    s_cbranch_execz BB4_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
@@ -524,7 +524,7 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
 ; GFX10-64-NEXT:  BB4_3: ; %.continue
-; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
 ; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
@@ -637,8 +637,8 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32
 ; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB5_2
 ; GFX10-64-NEXT:  ; %bb.1: ; %.entry
-; GFX10-64-NEXT:    s_wqm_b64 s[28:29], s[12:13]
-; GFX10-64-NEXT:    s_and_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    s_wqm_b64 s[14:15], s[12:13]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[14:15]
 ; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
 ; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index bdeda3e4f04b..9b2f8aa23273 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -157,25 +157,25 @@ define { i64, i1 } @smulo_i64_s_s(i64 %x, i64 %y) {
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mul_lo_u32 v15, v0, v3
+; GFX10-NEXT:    v_mul_lo_u32 v4, v0, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v5, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v0, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v8, v1, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX10-NEXT:    v_mul_hi_i32 v9, v1, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v11, v1, v3
-; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v5, v15
+; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v5, v4
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v10, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v6, v7, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v9, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v6, v11
+; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v11
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v9, vcc_lo, v11, v2
+; GFX10-NEXT:    v_sub_co_u32 v9, vcc_lo, v6, v2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_add3_u32 v1, v5, v15, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v11, v9, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v1, v5, v4, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc_lo
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v6, v0
@@ -461,8 +461,8 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
 ; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
-; GFX10-NEXT:    v_ashrrev_i64 v[6:7], 2, v[4:5]
-; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
+; GFX10-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo

diff  --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 4f48c06fa1ce..d4fa0b3386b2 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -539,15 +539,15 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
 ; GFX10-LABEL: v_lshr_v4i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v7, s[2:3]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v7, s[2:3] offset:8
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
-; GFX10-NEXT:    global_store_dwordx2 v7, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64

diff  --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index d9962d2fbceb..d686af2f1db3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -330,12 +330,12 @@ define void @load_global_d16_hi(i16 addrspace(1)* %in, i16 %reg, <2 x i16> addrs
 ; GCN-SCRATCH:       ; %bb.0: ; %entry
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v6, v2
+; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v5, v2
 ; GCN-SCRATCH-NEXT:    s_clause 0x1
-; GCN-SCRATCH-NEXT:    global_load_short_d16_hi v6, v[0:1], off
+; GCN-SCRATCH-NEXT:    global_load_short_d16_hi v5, v[0:1], off
 ; GCN-SCRATCH-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:64
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
-; GCN-SCRATCH-NEXT:    global_store_dword v[3:4], v6, off
+; GCN-SCRATCH-NEXT:    global_store_dword v[3:4], v5, off
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-SCRATCH-NEXT:    global_store_dword v[3:4], v2, off offset:128
 ; GCN-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -373,12 +373,12 @@ define void @load_global_d16_lo(i16 addrspace(1)* %in, i32 %reg, <2 x i16> addrs
 ; GCN-SCRATCH:       ; %bb.0: ; %entry
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v6, v2
+; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v5, v2
 ; GCN-SCRATCH-NEXT:    s_clause 0x1
-; GCN-SCRATCH-NEXT:    global_load_short_d16 v6, v[0:1], off
+; GCN-SCRATCH-NEXT:    global_load_short_d16 v5, v[0:1], off
 ; GCN-SCRATCH-NEXT:    global_load_short_d16 v2, v[0:1], off offset:64
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
-; GCN-SCRATCH-NEXT:    global_store_dword v[3:4], v6, off
+; GCN-SCRATCH-NEXT:    global_store_dword v[3:4], v5, off
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-SCRATCH-NEXT:    global_store_dword v[3:4], v2, off offset:128
 ; GCN-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign-split.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign-split.mir
deleted file mode 100644
index 8862644d2264..000000000000
--- a/llvm/test/CodeGen/AMDGPU/regbank-reassign-split.mir
+++ /dev/null
@@ -1,38 +0,0 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s
-
---- |
-  define amdgpu_kernel void @do_not_reassign_spill() #0 { ret void }
-
-  attributes #0 = { "amdgpu-num-vgpr"="8" }
-...
-
-# GCN-LABEL: do_not_reassign_spill{{$}}
-# GCN: V_AND_B32_e32 killed $vgpr1, killed $vgpr5,
----
-name:            do_not_reassign_spill
-tracksRegLiveness: true
-machineFunctionInfo:
-  stackPtrOffsetReg:  $sgpr32
-stack:
-  - { id: 0, type: default, offset: 0, size: 4, alignment: 4 }
-registers:
-  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
-  - { id: 1, class: vgpr_32, preferred-register: '$vgpr1' }
-  - { id: 2, class: vgpr_32, preferred-register: '$vgpr2' }
-  - { id: 3, class: vgpr_32, preferred-register: '$vgpr3' }
-  - { id: 4, class: vgpr_32, preferred-register: '$vgpr4' }
-  - { id: 5, class: vgpr_32, preferred-register: '$vgpr5' }
-  - { id: 6, class: vgpr_32 }
-body: |
-  bb.0:
-    %0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5)
-    %1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5)
-    %2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5)
-    %3 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5)
-    %4 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5)
-    %5 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5)
-    S_NOP 0, implicit-def dead $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-    S_NOP 0, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5
-    %6 = V_AND_B32_e32 %1, %5, implicit $exec
-    S_ENDPGM 0, implicit %6
-...

diff  --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir
deleted file mode 100644
index 918e009b3bf6..000000000000
--- a/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir
+++ /dev/null
@@ -1,69 +0,0 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s
-
-
-# Test that subreg reassignments are correctly handled when whole register also
-# conflicts.  If this is mishandled stall counts will be incorrect and cause an
-# infinite loop.
-# GCN-LABEL: vgpr64_mixed_use{{$}}
-# GCN: $vgpr0_vgpr1 = IMPLICIT_DEF
-# GCN: $vgpr4_vgpr5 = IMPLICIT_DEF
-# GCN: $vcc = IMPLICIT_DEF
-# GCN: $vgpr2_vgpr3 = IMPLICIT_DEF
-# GCN: $vgpr6_vgpr7 = IMPLICIT_DEF
-# GCN: $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
-# GCN: $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
-# GCN: $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
-# GCN: $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
-# GCN: $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
-# GCN: $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
-# GCN: $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
-# GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
-# GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
-# GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
-# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr5, $vcc, implicit $exec
-# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr4, killed $vcc, implicit $exec
-# GCN: $sgpr0_sgpr1 = V_CMP_LT_U64_e64 killed $vgpr4_vgpr5, killed $vgpr0_vgpr1, implicit $exec
----
-name:            vgpr64_mixed_use
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vreg_64, preferred-register: '$vgpr0_vgpr1' }
-  - { id: 1, class: vreg_64, preferred-register: '$vgpr4_vgpr5' }
-  - { id: 2, class: sreg_64_xexec, preferred-register: '$vcc' }
-  - { id: 3, class: vgpr_32 }
-  - { id: 4, class: vgpr_32 }
-  - { id: 5, class: sreg_64_xexec }
-  - { id: 6, class: vreg_64, preferred-register: '$vgpr2_vgpr3' }
-  - { id: 7, class: vreg_64, preferred-register: '$vgpr6_vgpr7' }
-  - { id: 8, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' }
-  - { id: 9, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' }
-  - { id: 10, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' }
-  - { id: 11, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' }
-  - { id: 12, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' }
-  - { id: 13, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' }
-  - { id: 14, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' }
-  - { id: 15, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' }
-  - { id: 16, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' }
-  - { id: 17, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = IMPLICIT_DEF
-    %2 = IMPLICIT_DEF
-    %6 = IMPLICIT_DEF
-    %7 = IMPLICIT_DEF
-    %8 = IMPLICIT_DEF
-    %9 = IMPLICIT_DEF
-    %10 = IMPLICIT_DEF
-    %11 = IMPLICIT_DEF
-    %12 = IMPLICIT_DEF
-    %13 = IMPLICIT_DEF
-    %14 = IMPLICIT_DEF
-    %15 = IMPLICIT_DEF
-    %16 = IMPLICIT_DEF
-    %17 = IMPLICIT_DEF
-    %3 = V_CNDMASK_B32_e64 0, %0.sub1, 0, %1.sub1, %2, implicit $exec
-    %4 = V_CNDMASK_B32_e64 0, %0.sub0, 0, %1.sub0, %2, implicit $exec
-    %5 = V_CMP_LT_U64_e64 %1, %0, implicit $exec
-    S_ENDPGM 0
-...

diff  --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
deleted file mode 100644
index df057da98c2b..000000000000
--- a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
+++ /dev/null
@@ -1,611 +0,0 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s
-
-# GCN-LABEL: v1_vs_v5{{$}}
-# GCN: V_AND_B32_e32 killed $vgpr3, killed $vgpr1,
----
-name:            v1_vs_v5
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' }
-  - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' }
-  - { id: 2, class: vgpr_32 }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = IMPLICIT_DEF
-    %2 = V_AND_B32_e32 %1, %0, implicit $exec
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: v0_1_vs_v4{{$}}
-# GCN: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr3,
----
-name:            v0_1_vs_v4
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_32, preferred-register: '$vgpr4' }
-  - { id: 1, class: vreg_64, preferred-register: '$vgpr0_vgpr1' }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD %1, %0, 0, 0, implicit $exec
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: v1_2_vs_v4_5{{$}}
-# GCN: GLOBAL_STORE_DWORDX2 killed renamable $vgpr2_vgpr3, killed renamable $vgpr4_vgpr5,
----
-name:            v1_2_vs_v4_5
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vreg_64, preferred-register: '$vgpr4_vgpr5' }
-  - { id: 1, class: vreg_64, preferred-register: '$vgpr1_vgpr2' }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORDX2 %1, %0, 0, 0, implicit $exec
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: s11_vs_vcc{{$}}
-# GCN: $vgpr0, $vcc_lo = V_ADDC_U32_e64 killed $sgpr14, killed $vgpr0, killed $vcc_lo, 0
----
-name:            s11_vs_vcc
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: sgpr_32, preferred-register: '$sgpr11' }
-  - { id: 1, class: vgpr_32 }
-  - { id: 2, class: vgpr_32 }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = IMPLICIT_DEF
-    $vcc_lo = IMPLICIT_DEF
-    %2, $vcc_lo = V_ADDC_U32_e64 killed %0, killed %1, killed $vcc_lo, 0, implicit $exec
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: s0_vs_s16{{$}}
-# GCN: S_AND_B32 killed renamable $sgpr14, $sgpr0,
----
-name:            s0_vs_s16
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' }
-  - { id: 1, class: sgpr_32 }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    $sgpr0 = IMPLICIT_DEF
-    %1 = S_AND_B32 %0, $sgpr0, implicit-def $scc
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: s1_vs_s16{{$}}
-# GCN: S_AND_B32 killed renamable $sgpr14, $sgpr1,
----
-name:            s1_vs_s16
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' }
-  - { id: 1, class: sgpr_32 }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    $sgpr1 = IMPLICIT_DEF
-    %1 = S_AND_B32 %0, $sgpr1, implicit-def $scc
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: s12_vs_null{{$}}
-# GCN: S_AND_B32 $sgpr_null, killed renamable $sgpr14,
----
-name:            s12_vs_null
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: sgpr_32, preferred-register: '$sgpr12' }
-  - { id: 1, class: sgpr_32 }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = S_AND_B32 $sgpr_null, %0, implicit-def $scc
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: s13_vs_m0{{$}}
-# GCN: S_AND_B32 $m0, killed renamable $sgpr14,
----
-name:            s13_vs_m0
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: sgpr_32, preferred-register: '$sgpr13' }
-  - { id: 1, class: sgpr_32 }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = S_AND_B32 $m0, %0, implicit-def $scc
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: s12_13_vs_s28_s29{{$}}
-# GCN: S_AND_B64 $sgpr28_sgpr29, killed renamable $sgpr14_sgpr15,
----
-name:            s12_13_vs_s28_s29
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: sreg_64, preferred-register: '$sgpr12_sgpr13' }
-  - { id: 1, class: sreg_64 }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    $sgpr28_sgpr29 = IMPLICIT_DEF
-    %1 = S_AND_B64 $sgpr28_sgpr29, %0, implicit-def $scc
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: livein{{$}}
-# GCN: V_AND_B32_e32 killed $vgpr4, killed $vgpr0,
----
-name:            livein
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
-  - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' }
-  - { id: 2, class: vgpr_32 }
-liveins:
-  - { reg: '$vgpr0', virtual-reg: '' }
-  - { reg: '$vgpr4', virtual-reg: '' }
-body: |
-  bb.0:
-    liveins: $vgpr0, $vgpr4
-
-    %0 = COPY $vgpr0
-    %1 = COPY $vgpr4
-    %2 = V_AND_B32_e32 %1, %0, implicit $exec
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: liveout{{$}}
-# GCN: V_AND_B32_e32 $vgpr4, $vgpr0,
----
-name:            liveout
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
-  - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' }
-  - { id: 2, class: vgpr_32 }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = IMPLICIT_DEF
-    %2 = V_AND_B32_e32 %1, %0, implicit $exec
-    $vgpr0 = COPY %0
-    $vgpr4 = COPY %1
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: implicit{{$}}
-# GCN: V_MOV_B32_indirect undef $vgpr4, undef $vgpr0, implicit $exec, implicit-def dead renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr4_vgpr5_vgpr6_vgpr7, implicit $m0
----
-name:            implicit
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vreg_128 }
-  - { id: 1, class: vreg_128, preferred-register: '$vgpr4_vgpr5_vgpr6_vgpr7' }
-body: |
-  bb.0:
-    %1 = IMPLICIT_DEF
-    V_MOV_B32_indirect undef %1.sub0:vreg_128, undef $vgpr0, implicit $exec, implicit-def %0:vreg_128, implicit %1:vreg_128, implicit $m0
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: occupancy_limit{{$}}
-# GCN: V_AND_B32_e32 $vgpr4, $vgpr0,
----
-name:            occupancy_limit
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
-  - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' }
-  - { id: 2, class: vgpr_32, preferred-register: '$vgpr1' }
-  - { id: 3, class: vreg_64, preferred-register: '$vgpr2_vgpr3' }
-  - { id: 4, class: vgpr_32, preferred-register: '$vgpr5' }
-  - { id: 5, class: vreg_64, preferred-register: '$vgpr6_vgpr7' }
-  - { id: 6, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' }
-  - { id: 7, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' }
-  - { id: 8, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' }
-  - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' }
-  - { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' }
-  - { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' }
-  - { id: 12, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' }
-  - { id: 13, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' }
-  - { id: 14, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' }
-  - { id: 15, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = IMPLICIT_DEF
-    %3 = IMPLICIT_DEF
-    %4 = IMPLICIT_DEF
-    %5 = IMPLICIT_DEF
-    %6 = IMPLICIT_DEF
-    %7 = IMPLICIT_DEF
-    %8 = IMPLICIT_DEF
-    %9 = IMPLICIT_DEF
-    %10 = IMPLICIT_DEF
-    %11 = IMPLICIT_DEF
-    %12 = IMPLICIT_DEF
-    %13 = IMPLICIT_DEF
-    %14 = IMPLICIT_DEF
-    %15 = IMPLICIT_DEF
-    %2 = V_AND_B32_e32 %1, %0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %1, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %2, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %4, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, implicit $exec
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: csr{{$}}
-# GCN: V_AND_B32_e32 $vgpr37, $vgpr0,
----
-name:            csr
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
-  - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' }
-  - { id: 2, class: vgpr_32, preferred-register: '$vgpr1' }
-  - { id: 3, class: vreg_64, preferred-register: '$vgpr2_vgpr3' }
-  - { id: 4, class: vgpr_32, preferred-register: '$vgpr5' }
-  - { id: 5, class: vreg_64, preferred-register: '$vgpr6_vgpr7' }
-  - { id: 6, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' }
-  - { id: 7, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' }
-  - { id: 8, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' }
-  - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' }
-  - { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' }
-  - { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' }
-  - { id: 12, class: vgpr_32, preferred-register: '$vgpr33' }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = IMPLICIT_DEF
-    %3 = IMPLICIT_DEF
-    %4 = IMPLICIT_DEF
-    %5 = IMPLICIT_DEF
-    %6 = IMPLICIT_DEF
-    %7 = IMPLICIT_DEF
-    %8 = IMPLICIT_DEF
-    %9 = IMPLICIT_DEF
-    %10 = IMPLICIT_DEF
-    %11 = IMPLICIT_DEF
-    %12 = IMPLICIT_DEF
-    %2 = V_AND_B32_e32 %1, %0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %0, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %1, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %2, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %4, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %10, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORDX4 %3, %11, 0, 0, implicit $exec
-    GLOBAL_STORE_DWORD %3, %12, 0, 0, implicit $exec
-    S_ENDPGM 0
-...
-
-# Do not touch undefs
-# GCN-LABEL: s0_vs_s16_undef{{$}}
-# GCN: S_AND_B32 killed renamable $sgpr16, undef $sgpr0,
----
-name:            s0_vs_s16_undef
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' }
-  - { id: 1, class: sgpr_32 }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = S_AND_B32 %0, undef $sgpr0, implicit-def $scc
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: smem_bundle{{$}}
-# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr14, 0
-# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr15, 0
----
-name:          smem_bundle
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: sgpr_128, preferred-register: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-  - { id: 1, class: sreg_32_xm0_xexec, preferred-register: '$sgpr16' }
-  - { id: 2, class: sreg_32_xm0_xexec, preferred-register: '$sgpr17' }
-  - { id: 3, class: sreg_32_xm0_xexec, preferred-register: '$sgpr4' }
-  - { id: 4, class: sreg_32_xm0_xexec, preferred-register: '$sgpr5' }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = IMPLICIT_DEF
-    %2 = IMPLICIT_DEF
-    early-clobber %3, early-clobber %4 = BUNDLE %0, %1, %2 {
-      %3 = S_BUFFER_LOAD_DWORD_SGPR %0, %1, 0
-      %4 = S_BUFFER_LOAD_DWORD_SGPR %0, %2, 0
-    }
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: vreg_512_subs{{$}}
-# don't care about the assignment: this used to trigger an infinite loop
----
-name:            vreg_512_subs
-tracksRegLiveness: true
-registers:
-  - { id: 1, class: vreg_512, preferred-register: '$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15' }
-  - { id: 2, class: vgpr_32, preferred-register: '$vgpr28' }
-body:             |
-  bb.0:
-    %1 = IMPLICIT_DEF
-    %2 = IMPLICIT_DEF
-    DS_WRITE2_B32_gfx9 %2, %1.sub0, %1.sub1, 0, 1, 0, implicit $exec
-    DS_WRITE2_B32_gfx9 %2, %1.sub2, %1.sub3, 2, 3, 0, implicit $exec
-    DS_WRITE2_B32_gfx9 %2, %1.sub4, %1.sub5, 4, 5, 0, implicit $exec
-    DS_WRITE2_B32_gfx9 %2, %1.sub6, %1.sub7, 6, 7, 0, implicit $exec
-    DS_WRITE2_B32_gfx9 %2, %1.sub8, %1.sub9, 8, 9, 0, implicit $exec
-    DS_WRITE2_B32_gfx9 %2, %1.sub10, %1.sub11, 10, 11, 0, implicit $exec
-    DS_WRITE2_B32_gfx9 %2, %1.sub12, %1.sub13, 12, 13, 0, implicit $exec
-    DS_WRITE2_B32_gfx9 %2, %1.sub14, %1.sub15, 14, 15, 0, implicit $exec
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: vgpr_lo16_sub{{$}}
-# GCN: renamable $vgpr0 = V_AND_B32_e32 killed $vgpr3, killed $vgpr1, implicit $exec
-# GCN: renamable $vgpr1_lo16 = COPY killed renamable $vgpr0_lo16
----
-name:            vgpr_lo16_sub
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' }
-  - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' }
-  - { id: 2, class: vgpr_32 }
-  - { id: 3, class: vgpr_lo16 }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = IMPLICIT_DEF
-    %2 = V_AND_B32_e32 %1, %0, implicit $exec
-    %3 = COPY %2.lo16
-    $vgpr1_lo16 = COPY %3
-    SI_RETURN_TO_EPILOG $vgpr1_lo16
-...
-
-# GCN-LABEL: vgpr_lo16{{$}}
-# GCN: $vgpr1_lo16 = COPY killed renamable $vgpr0_lo16
----
-name:            vgpr_lo16
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_lo16, preferred-register: '$vgpr4_lo16' }
-body: |
-  bb.0:
-    liveins: $vgpr0_lo16
-
-    %0 = COPY $vgpr0_lo16
-    $vgpr1_lo16 = COPY %0
-    SI_RETURN_TO_EPILOG $vgpr1_lo16
-...
-
-# GCN-LABEL: vgpr_hi16_sub{{$}}
-# GCN: renamable $vgpr0 = V_AND_B32_e32 killed $vgpr3, killed $vgpr1, implicit $exec
-# GCN: renamable $vgpr1_hi16 = COPY killed renamable $vgpr0_hi16
----
-name:            vgpr_hi16_sub
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' }
-  - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' }
-  - { id: 2, class: vgpr_32 }
-  - { id: 3, class: vgpr_hi16 }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = IMPLICIT_DEF
-    %2 = V_AND_B32_e32 %1, %0, implicit $exec
-    %3 = COPY %2.hi16
-    $vgpr1_hi16 = COPY %3
-    SI_RETURN_TO_EPILOG $vgpr1_hi16
-...
-
-# GCN-LABEL: vgpr_hi16{{$}}
-# GCN: $vgpr1_hi16 = COPY killed renamable $vgpr0_hi16
----
-name:            vgpr_hi16
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_hi16, preferred-register: '$vgpr4_hi16' }
-body: |
-  bb.0:
-    liveins: $vgpr0_hi16
-
-    %0 = COPY $vgpr0_hi16
-    $vgpr1_hi16 = COPY %0
-    SI_RETURN_TO_EPILOG $vgpr1_hi16
-...
-
-# GCN-LABEL: sgpr_lo16_sub{{$}}
-# GCN: renamable $sgpr0 = S_AND_B32 killed renamable $sgpr14, $sgpr0, implicit-def $scc
-# GCN: renamable $sgpr1_lo16 = COPY killed renamable $sgpr0_lo16
----
-name:            sgpr_lo16_sub
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' }
-  - { id: 1, class: sgpr_32 }
-  - { id: 2, class: sgpr_lo16 }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    $sgpr0 = IMPLICIT_DEF
-    %1 = S_AND_B32 %0, $sgpr0, implicit-def $scc
-    %2 = COPY %1.lo16
-    $sgpr1_lo16 = COPY %2
-    SI_RETURN_TO_EPILOG $sgpr1_lo16
-...
-
-# GCN-LABEL: sgpr_lo16{{$}}
-# GCN: $sgpr1_lo16 = COPY killed renamable $sgpr0_lo16
----
-name:            sgpr_lo16
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: sgpr_lo16, preferred-register: '$sgpr4_lo16' }
-body: |
-  bb.0:
-    liveins: $sgpr0_lo16
-
-    %0 = COPY $sgpr0_lo16
-    $sgpr1_lo16 = COPY %0
-    SI_RETURN_TO_EPILOG $sgpr1_lo16
-...
-
-# Check that we do not use VGPR3 which we would use otherwise.
-# We cannot use it because of interference with VGPR3_LO16.
-# GCN-LABEL: v1_vs_v5_src_interence{{$}}
-# GCN: V_AND_B32_e32 killed $vgpr7, killed $vgpr1,
----
-name:            v1_vs_v5_src_interence
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' }
-  - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' }
-  - { id: 2, class: vgpr_32 }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = IMPLICIT_DEF
-    $vgpr3_lo16 = IMPLICIT_DEF
-    %2 = V_AND_B32_e32 %1, %0, implicit $exec
-    S_ENDPGM 0
-...
-
-# Test that bank of subreg is considered during scavenging.
-# If handled incorrectly an infinite loop occurs.
-# GCN-LABEL: s0_vs_s15_16_17_sub1{{$}}
-# GCN: S_AND_B32 killed renamable $sgpr13, $sgpr0,
----
-name:            s0_vs_s15_16_17_sub1
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: sgpr_96, preferred-register: '$sgpr15_sgpr16_sgpr17' }
-  - { id: 1, class: sgpr_32 }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    $sgpr0 = IMPLICIT_DEF
-    %1 = S_AND_B32 %0.sub1, $sgpr0, implicit-def $scc
-    S_ENDPGM 0
-...
-
-# Test that the size of subreg is correctly handled in bank calculation.
-# If handled incorrectly an infinite loop occurs.
-# GCN-LABEL: vgpr_sub_dependence{{$}}
-# GCN: $vgpr9_vgpr10_vgpr11_vgpr12 = IMPLICIT_DEF
-# GCN: $vgpr16_vgpr17 = IMPLICIT_DEF
-# GCN: $vgpr14_vgpr15 = IMPLICIT_DEF
-# GCN: $vgpr0_vgpr1 = IMPLICIT_DEF
-# GCN: $vgpr7_vgpr8 = IMPLICIT_DEF
-# GCN: $vgpr3_vgpr4_vgpr5_vgpr6 = IMPLICIT_DEF
-# GCN: $vgpr18_vgpr19 = IMPLICIT_DEF
-# GCN: $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
-# GCN: $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
-# GCN: $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
-# GCN: $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
-# GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
-# GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
-# GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
-# GCN: $vgpr0_vgpr1 = V_ADD_F64_e64 0, $vgpr11_vgpr12, 0, killed $vgpr16_vgpr17, 0, 0, implicit $mode, implicit $exec
-# GCN: $vgpr0_vgpr1 = V_ADD_F64_e64 0, killed $vgpr9_vgpr10, 0, killed $vgpr14_vgpr15, 0, 0, implicit $mode, implicit $exec
----
-name:            vgpr_sub_dependence
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vreg_128, preferred-register: '$vgpr10_vgpr11_vgpr12_vgpr13' }
-  - { id: 1, class: vreg_64, preferred-register: '$vgpr16_vgpr17' }
-  - { id: 2, class: vreg_64, preferred-register: '$vgpr14_vgpr15' }
-  - { id: 3, class: vreg_64 }
-  - { id: 4, class: vreg_64 }
-  - { id: 5, class: vreg_64, preferred-register: '$vgpr0_vgpr1' }
-  - { id: 6, class: vreg_64, preferred-register: '$vgpr7_vgpr8' }
-  - { id: 7, class: vreg_128, preferred-register: '$vgpr3_vgpr4_vgpr5_vgpr6' }
-  - { id: 8, class: vreg_64, preferred-register: '$vgpr18_vgpr19' }
-  - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' }
-  - { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' }
-  - { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' }
-  - { id: 12, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' }
-  - { id: 13, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' }
-  - { id: 14, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' }
-  - { id: 15, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = IMPLICIT_DEF
-    %2 = IMPLICIT_DEF
-    %5 = IMPLICIT_DEF
-    %6 = IMPLICIT_DEF
-    %7 = IMPLICIT_DEF
-    %8 = IMPLICIT_DEF
-    %9 = IMPLICIT_DEF
-    %10 = IMPLICIT_DEF
-    %11 = IMPLICIT_DEF
-    %12 = IMPLICIT_DEF
-    %13 = IMPLICIT_DEF
-    %14 = IMPLICIT_DEF
-    %15 = IMPLICIT_DEF
-    %3 = V_ADD_F64_e64 0, %0.sub2_sub3:vreg_128, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec
-    %4 = V_ADD_F64_e64 0, %0.sub0_sub1:vreg_128, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec
-    S_ENDPGM 0
-...
-
-# GCN-LABEL: dbg_value_v1_v5{{$}}
-# GCN: renamable $vgpr1 = IMPLICIT_DEF
-# GCN: renamable $vgpr5 = IMPLICIT_DEF
----
-name:            dbg_value_v1_v5
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' }
-  - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' }
-  - { id: 2, class: vgpr_32 }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = IMPLICIT_DEF
-    DBG_VALUE debug-use %1, debug-use %0
-    S_ENDPGM 0, implicit %0, implicit %1
-...
-
-# GCN-LABEL: kill_v1_v5{{$}}
-# GCN: renamable $vgpr1 = IMPLICIT_DEF
-# GCN: renamable $vgpr5 = IMPLICIT_DEF
-# GCN: KILL killed renamable $vgpr5, killed renamable $vgpr1
----
-name:            kill_v1_v5
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' }
-  - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' }
-  - { id: 2, class: vgpr_32 }
-body: |
-  bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = IMPLICIT_DEF
-    KILL %1, %0
-    S_ENDPGM 0
-...

diff  --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index e789388325de..33526c956a8e 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -458,16 +458,16 @@ define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[9:10], v6, s[8:9]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[8:9]
 ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[10:11]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v9, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v10, v3, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, v[7:8], v[9:10]
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
 ; GFX10-NEXT:    s_xor_b32 s0, vcc_lo, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT:    global_store_dwordx2 v6, v[7:8], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v6, v[4:5], s[4:5]
 ; GFX10-NEXT:    global_store_byte v6, v0, s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %a = load i64, i64 addrspace(1)* %aptr, align 4
@@ -575,14 +575,14 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
 ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_i32 v5, v1, v3 clamp
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, v1, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_add_nc_i32 v6, v0, v2 clamp
-; GFX10-NEXT:    v_add_nc_u32_e32 v9, v0, v2
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v10, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v9, v6
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v0, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v4, v[9:10], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3]
 ; GFX10-NEXT:    s_endpgm
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
index 812049942783..3c2b66c302c1 100644
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -486,17 +486,17 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_bfrev_b32_e32 v6, -2
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[2:3]
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s5, 0, v[10:11]
-; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[0:1]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v11
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s5, 0, v[4:5]
+; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v6, s5
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result

diff  --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 480b0269ea95..1c7c1db25923 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -544,15 +544,15 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
 ; GFX10-LABEL: v_shl_v4i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v7, s[2:3]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v7, s[2:3] offset:8
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
-; GFX10-NEXT:    global_store_dwordx2 v7, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64

diff  --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index 4764fad87b46..fde23b00aec5 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -1100,17 +1100,17 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_sub_co_u32 v10, vcc_lo, v0, v2
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_bfrev_b32_e32 v6, -2
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[2:3]
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s5, 0, v[10:11]
-; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[0:1]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v11
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s5, 0, v[4:5]
+; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v6, s5
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result

diff  --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
index 831bf871e7b6..cf2f5577df4b 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
@@ -230,12 +230,12 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX10-NEXT:    s_lshr_b32 s1, s7, 24
 ; GFX10-NEXT:    s_lshr_b32 s5, s5, 24
-; GFX10-NEXT:    v_mov_b32_e32 v15, s3
+; GFX10-NEXT:    v_mov_b32_e32 v8, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v9, s6
 ; GFX10-NEXT:    s_lshr_b32 s0, s4, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s1
-; GFX10-NEXT:    v_mov_b32_e32 v11, s4
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s2
 ; GFX10-NEXT:    ds_write_b8 v0, v1 offset:12
 ; GFX10-NEXT:    ds_write_b8_d16_hi v0, v1 offset:14
@@ -243,8 +243,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    ds_write_b8_d16_hi v0, v2 offset:10
 ; GFX10-NEXT:    ds_write_b8 v0, v3 offset:4
 ; GFX10-NEXT:    ds_write_b8_d16_hi v0, v3 offset:6
-; GFX10-NEXT:    ds_write_b8 v0, v11
-; GFX10-NEXT:    ds_write_b8_d16_hi v0, v11 offset:2
+; GFX10-NEXT:    ds_write_b8 v0, v4
+; GFX10-NEXT:    ds_write_b8_d16_hi v0, v4 offset:2
 ; GFX10-NEXT:    ds_write_b8 v0, v5 offset:13
 ; GFX10-NEXT:    ds_write_b8 v0, v6 offset:15
 ; GFX10-NEXT:    ds_write_b8 v0, v7 offset:9
@@ -252,7 +252,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    ds_write_b8 v0, v15 offset:11
+; GFX10-NEXT:    ds_write_b8 v0, v8 offset:11
 ; GFX10-NEXT:    ds_write_b8 v0, v9 offset:5
 ; GFX10-NEXT:    ds_write_b8 v0, v1 offset:7
 ; GFX10-NEXT:    ds_write_b8 v0, v2 offset:1
@@ -351,15 +351,15 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-NEXT:    v_mov_b32_e32 v7, s4
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    ds_write_b16 v0, v1 offset:12
 ; GFX10-NEXT:    ds_write_b16_d16_hi v0, v1 offset:14
 ; GFX10-NEXT:    ds_write_b16 v0, v2 offset:8
 ; GFX10-NEXT:    ds_write_b16_d16_hi v0, v2 offset:10
 ; GFX10-NEXT:    ds_write_b16 v0, v3 offset:4
 ; GFX10-NEXT:    ds_write_b16_d16_hi v0, v3 offset:6
-; GFX10-NEXT:    ds_write_b16 v0, v7
-; GFX10-NEXT:    ds_write_b16_d16_hi v0, v7 offset:2
+; GFX10-NEXT:    ds_write_b16 v0, v4
+; GFX10-NEXT:    ds_write_b16_d16_hi v0, v4 offset:2
 ; GFX10-NEXT:    s_endpgm
   store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
   ret void
@@ -420,9 +420,9 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s6
-; GFX10-NEXT:    v_mov_b32_e32 v6, s7
+; GFX10-NEXT:    v_mov_b32_e32 v4, s7
 ; GFX10-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
-; GFX10-NEXT:    ds_write2_b32 v0, v3, v6 offset0:2 offset1:3
+; GFX10-NEXT:    ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
 ; GFX10-NEXT:    s_endpgm
   store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll
index 6babc931aedb..d54d41824c7c 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll
@@ -196,11 +196,11 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    s_lshr_b32 s5, s4, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_lshr_b32 s4, s4, 24
-; GFX10-NEXT:    v_mov_b32_e32 v11, s0
+; GFX10-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s3
-; GFX10-NEXT:    v_mov_b32_e32 v15, s5
+; GFX10-NEXT:    v_mov_b32_e32 v8, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v9, s4
 ; GFX10-NEXT:    ds_write_b8 v0, v1 offset:8
 ; GFX10-NEXT:    ds_write_b8_d16_hi v0, v1 offset:10
@@ -208,11 +208,11 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    ds_write_b8_d16_hi v0, v2 offset:6
 ; GFX10-NEXT:    ds_write_b8 v0, v3
 ; GFX10-NEXT:    ds_write_b8_d16_hi v0, v3 offset:2
-; GFX10-NEXT:    ds_write_b8 v0, v11 offset:9
+; GFX10-NEXT:    ds_write_b8 v0, v4 offset:9
 ; GFX10-NEXT:    ds_write_b8 v0, v5 offset:11
 ; GFX10-NEXT:    ds_write_b8 v0, v6 offset:5
 ; GFX10-NEXT:    ds_write_b8 v0, v7 offset:7
-; GFX10-NEXT:    ds_write_b8 v0, v15 offset:1
+; GFX10-NEXT:    ds_write_b8 v0, v8 offset:1
 ; GFX10-NEXT:    ds_write_b8 v0, v9 offset:3
 ; GFX10-NEXT:    s_endpgm
   store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1

diff  --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll
index af94bd4a1f25..97412b1e4c26 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll
@@ -65,12 +65,8 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_strict(<2 x double> %x, <2
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_add_f64 v[2:3], v[8:9], v[6:7]
-; GFX10-NEXT:    v_add_f64 v[0:1], v[10:11], v[4:5]
+; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x double> %val
@@ -88,12 +84,8 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_ignore(<2 x double> %x, <2
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_add_f64 v[2:3], v[8:9], v[6:7]
-; GFX10-NEXT:    v_add_f64 v[0:1], v[10:11], v[4:5]
+; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret <2 x double> %val
@@ -111,12 +103,8 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_maytrap(<2 x double> %x, <
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_add_f64 v[2:3], v[8:9], v[6:7]
-; GFX10-NEXT:    v_add_f64 v[0:1], v[10:11], v[4:5]
+; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret <2 x double> %val

diff  --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
index 89493343c6fc..110e65144e0d 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
@@ -75,10 +75,9 @@ define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x ha
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v6, v5
+; GFX10-NEXT:    v_fmac_f16_e32 v5, v1, v3
 ; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
-; GFX10-NEXT:    v_fmac_f16_e32 v6, v1, v3
-; GFX10-NEXT:    v_mov_b32_e32 v1, v6
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x half> %val
@@ -128,23 +127,21 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v14, v5
-; GFX10-NEXT:    v_mov_b32_e32 v15, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v14
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v15
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
-; GFX10-NEXT:    v_fmac_f16_e32 v15, v0, v2
+; GFX10-NEXT:    v_fmac_f16_e32 v4, v0, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
-; GFX10-NEXT:    v_fmac_f16_e32 v14, v1, v3
-; GFX10-NEXT:    v_fmac_f16_e32 v5, v8, v7
-; GFX10-NEXT:    v_fmac_f16_e32 v4, v11, v10
-; GFX10-NEXT:    v_and_b32_e32 v1, v0, v15
-; GFX10-NEXT:    v_and_b32_e32 v2, v0, v14
-; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v1
-; GFX10-NEXT:    v_lshl_or_b32 v1, v5, 16, v2
+; GFX10-NEXT:    v_fmac_f16_e32 v5, v1, v3
+; GFX10-NEXT:    v_fmac_f16_e32 v6, v8, v7
+; GFX10-NEXT:    v_fmac_f16_e32 v9, v11, v10
+; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4
+; GFX10-NEXT:    v_and_b32_e32 v2, v0, v5
+; GFX10-NEXT:    v_lshl_or_b32 v0, v9, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v1, v6, 16, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <4 x half> %val

diff  --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll
index 067640c4cb1c..38077938fd7d 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll
@@ -31,12 +31,8 @@ define <2 x double> @v_constained_fma_v2f64_fpexcept_strict(<2 x double> %x, <2
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v13, v3
-; GFX10-NEXT:    v_mov_b32_e32 v12, v2
-; GFX10-NEXT:    v_mov_b32_e32 v15, v1
-; GFX10-NEXT:    v_mov_b32_e32 v14, v0
-; GFX10-NEXT:    v_fma_f64 v[2:3], v[12:13], v[6:7], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[0:1], v[14:15], v[4:5], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x double> %val
@@ -77,18 +73,10 @@ define <4 x double> @v_constained_fma_v4f64_fpexcept_strict(<4 x double> %x, <4
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v29, v7
-; GFX10-NEXT:    v_mov_b32_e32 v28, v6
-; GFX10-NEXT:    v_mov_b32_e32 v31, v5
-; GFX10-NEXT:    v_mov_b32_e32 v30, v4
-; GFX10-NEXT:    v_mov_b32_e32 v25, v3
-; GFX10-NEXT:    v_mov_b32_e32 v24, v2
-; GFX10-NEXT:    v_mov_b32_e32 v27, v1
-; GFX10-NEXT:    v_mov_b32_e32 v26, v0
-; GFX10-NEXT:    v_fma_f64 v[4:5], v[30:31], v[12:13], v[20:21]
-; GFX10-NEXT:    v_fma_f64 v[6:7], v[28:29], v[14:15], v[22:23]
-; GFX10-NEXT:    v_fma_f64 v[2:3], v[24:25], v[10:11], v[18:19]
-; GFX10-NEXT:    v_fma_f64 v[0:1], v[26:27], v[8:9], v[16:17]
+; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX10-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX10-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <4 x double> %val
@@ -162,12 +150,8 @@ define <2 x double> @v_constained_fma_v2f64_fpexcept_strict_fneg_fneg(<2 x doubl
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v13, v3
-; GFX10-NEXT:    v_mov_b32_e32 v12, v2
-; GFX10-NEXT:    v_mov_b32_e32 v15, v1
-; GFX10-NEXT:    v_mov_b32_e32 v14, v0
-; GFX10-NEXT:    v_fma_f64 v[2:3], -v[12:13], -v[6:7], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[0:1], -v[14:15], -v[4:5], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[4:5], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[2:3], -v[2:3], -v[6:7], v[10:11]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg <2 x double> %x
   %neg.y = fneg <2 x double> %y

diff  --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll
index daa7dcc8344d..9fc32fa3556c 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll
@@ -65,12 +65,8 @@ define <2 x double> @v_constained_fmul_v2f64_fpexcept_strict(<2 x double> %x, <2
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_mul_f64 v[2:3], v[8:9], v[6:7]
-; GFX10-NEXT:    v_mul_f64 v[0:1], v[10:11], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x double> %val
@@ -88,12 +84,8 @@ define <2 x double> @v_constained_fmul_v2f64_fpexcept_ignore(<2 x double> %x, <2
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_mul_f64 v[2:3], v[8:9], v[6:7]
-; GFX10-NEXT:    v_mul_f64 v[0:1], v[10:11], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret <2 x double> %val
@@ -111,12 +103,8 @@ define <2 x double> @v_constained_fmul_v2f64_fpexcept_maytrap(<2 x double> %x, <
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_mul_f64 v[2:3], v[8:9], v[6:7]
-; GFX10-NEXT:    v_mul_f64 v[0:1], v[10:11], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret <2 x double> %val

diff  --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll
index 8e4e406ccf50..115d52ef838c 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll
@@ -65,12 +65,8 @@ define <2 x double> @v_constained_fsub_v2f64_fpexcept_strict(<2 x double> %x, <2
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_add_f64 v[2:3], v[8:9], -v[6:7]
-; GFX10-NEXT:    v_add_f64 v[0:1], v[10:11], -v[4:5]
+; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
+; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], -v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x double> %val
@@ -88,12 +84,8 @@ define <2 x double> @v_constained_fsub_v2f64_fpexcept_ignore(<2 x double> %x, <2
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_add_f64 v[2:3], v[8:9], -v[6:7]
-; GFX10-NEXT:    v_add_f64 v[0:1], v[10:11], -v[4:5]
+; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
+; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], -v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret <2 x double> %val
@@ -111,12 +103,8 @@ define <2 x double> @v_constained_fsub_v2f64_fpexcept_maytrap(<2 x double> %x, <
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_add_f64 v[2:3], v[8:9], -v[6:7]
-; GFX10-NEXT:    v_add_f64 v[0:1], v[10:11], -v[4:5]
+; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
+; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], -v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret <2 x double> %val

diff  --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 30beac73efd1..3500090e8455 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -746,11 +746,11 @@ define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_sdwa v1, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v5, 16, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -778,15 +778,15 @@ define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_load_dwordx2 v[9:10], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_sdwa v1, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_sdwa v2, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_lshl_or_b32 v0, v10, 16, v1
-; GFX10-NEXT:    v_lshl_or_b32 v1, v9, 16, v2
+; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_lshl_or_b32 v0, v5, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -816,12 +816,12 @@ define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_b32_sdwa v3, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1319,14 +1319,14 @@ define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readon
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    global_load_dwordx2 v[7:8], v6, s[0:1]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
 ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v6, s[8:9]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_fma_f16 v4, v7, v2, v4 op_sel_hi:[0,1,1]
-; GFX10-NEXT:    v_pk_fma_f16 v2, v8, v2, v5 op_sel_hi:[0,1,1]
-; GFX10-NEXT:    v_pk_fma_f16 v0, v7, v3, v4 op_sel:[1,0,0]
-; GFX10-NEXT:    v_pk_fma_f16 v1, v8, v3, v2 op_sel:[1,0,0]
+; GFX10-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
+; GFX10-NEXT:    v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
+; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
+; GFX10-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
 ; GFX10-NEXT:    global_store_dwordx2 v6, v[0:1], s[8:9]
 ; GFX10-NEXT:    s_endpgm
 entry:
@@ -1380,14 +1380,16 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v1, v3, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_sdwa v2, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_lshl_or_b32 v0, v6, 16, v1
-; GFX10-NEXT:    v_lshl_or_b32 v1, v7, 16, v2
+; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
+; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_lshl_or_b32 v0, v5, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v1, v6, 16, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1


        


More information about the llvm-commits mailing list