[llvm] [AMDGPU] Adding multiple use analysis to SIPeepholeSDWA (PR #94800)

Brian Favela via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 7 14:23:35 PDT 2024


https://github.com/bfavela updated https://github.com/llvm/llvm-project/pull/94800

>From a6e46bf0f0432d01f184ac9a2860719e89f564e4 Mon Sep 17 00:00:00 2001
From: Brian Favela <brianfavela at microsoft.com>
Date: Tue, 10 Aug 2021 22:29:32 +0000
Subject: [PATCH 1/5] Enhance SDWA peephole phase to look at all uses of an
 inst to evaluate if it can be folded

For example, the sequence:
; v_and_b32 v2, lit(0x0000ffff), v2
; v_and_b32 v3, 6, v2
; v_and_b32 v2, 1, v2

Can be folded to:
; v_and_b32 v3, 6, sel_lo(v2)
; v_and_b32 v2, 1, sel_lo(v2)
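
As a rough illustration of the analysis (a minimal standalone sketch with
simplified stand-in types and a hypothetical collectAllUses() helper, not the
pass code itself; the real logic is in SDWASrcOperand::potentialToConvert in
the diff below): a fold is only recorded when every user of the defined
register can itself be converted to SDWA.

#include <cassert>
#include <unordered_map>
#include <vector>

struct Inst {
  bool Convertible; // stand-in for isConvertibleToSDWA(MI, ST, TII)
};

using OperandsMap = std::unordered_map<Inst *, std::vector<int>>;

// Record OperandId against every user, but only if *all* users are
// convertible; a single non-convertible user blocks the whole fold.
bool collectAllUses(const std::vector<Inst *> &Users, int OperandId,
                    OperandsMap &Matches) {
  for (Inst *U : Users)
    if (!U->Convertible || Matches.count(U))
      return false; // also bail if the instruction is already claimed
  for (Inst *U : Users)
    Matches[U].push_back(OperandId);
  return true;
}

int main() {
  Inst A{true}, B{true}, C{false};
  OperandsMap Matches;
  std::vector<Inst *> Good{&A, &B}, Bad{&C};
  assert(collectAllUses(Good, /*OperandId=*/0, Matches));  // both users fold
  assert(!collectAllUses(Bad, /*OperandId=*/1, Matches));  // C blocks the fold
}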
---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 81 ++++++++++++++++++-----
 1 file changed, 65 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 1fadd8ce45b1f..43348a0f68b13 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -37,20 +37,24 @@ STATISTIC(NumSDWAInstructionsPeepholed,
 
 namespace {
 
+bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
+                         const SIInstrInfo *TII);
 class SDWAOperand;
 class SDWADstOperand;
 
-class SIPeepholeSDWA : public MachineFunctionPass {
-public:
-  using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
+using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
+
+// Helper typedef to make the code cleaner.
+typedef std::unordered_map<MachineInstr *, SDWAOperandsVector> SDWAOperandsMap;
 
+class SIPeepholeSDWA : public MachineFunctionPass {
 private:
   MachineRegisterInfo *MRI;
   const SIRegisterInfo *TRI;
   const SIInstrInfo *TII;
 
-  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
-  MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches;
+  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
+  SDWAOperandsMap PotentialMatches;
   SmallVector<MachineInstr *, 8> ConvertedInstructions;
 
   std::optional<int64_t> foldToImm(const MachineOperand &Op) const;
@@ -65,7 +69,6 @@ class SIPeepholeSDWA : public MachineFunctionPass {
   bool runOnMachineFunction(MachineFunction &MF) override;
   void matchSDWAOperands(MachineBasicBlock &MBB);
   std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
-  bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
   void pseudoOpConvertToVOP2(MachineInstr &MI,
                              const GCNSubtarget &ST) const;
   bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
@@ -93,7 +96,9 @@ class SDWAOperand {
 
   virtual ~SDWAOperand() = default;
 
-  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
+  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+                                           const GCNSubtarget &ST,
+                                           SDWAOperandsMap *PotentialMatches = nullptr) = 0;
   virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
 
   MachineOperand *getTargetOperand() const { return Target; }
@@ -126,7 +131,9 @@ class SDWASrcOperand : public SDWAOperand {
       : SDWAOperand(TargetOp, ReplacedOp),
         SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
 
-  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+                                   const GCNSubtarget &ST,
+                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
   bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
 
   SdwaSel getSrcSel() const { return SrcSel; }
@@ -153,7 +160,9 @@ class SDWADstOperand : public SDWAOperand {
                  SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
     : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
 
-  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+                                   const GCNSubtarget &ST,
+                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
   bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
 
   SdwaSel getDstSel() const { return DstSel; }
@@ -327,7 +336,42 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
   return Mods;
 }
 
-MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
+MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
+                                                 const GCNSubtarget &ST,
+                                                 SDWAOperandsMap *PotentialMatches) {
+  // If PotentialMatches is not null, fill out the map for all uses of the
+  // replaced register, but only if every one of them can be converted.
+  if (PotentialMatches != nullptr) {
+    MachineOperand *Reg = getReplacedOperand();
+    if (!Reg->isReg() || !Reg->isDef()) {
+      return nullptr;
+    }
+
+    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
+      // If there exists a use of a subreg of Reg, return nullptr
+      if (!isSameReg(UseMO, *Reg))
+        return nullptr;
+
+      // Check that all instructions that use Reg can be converted
+      if (!isConvertibleToSDWA(*(UseMO.getParent()), ST, TII)) {
+        return nullptr;
+      }
+
+      // Not handling the obscure case where the same use is in multiple operands
+      if (PotentialMatches->find(UseMO.getParent()) != PotentialMatches->end()) {
+        return nullptr;
+      }
+    }
+    // Now that it's guaranteed all uses are legal, iterate over the uses again
+    // to add them for later conversion.
+    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
+      SDWAOperandsMap& potentialMatchesMap = *PotentialMatches;
+      MachineInstr* UseMI = UseMO.getParent();
+      potentialMatchesMap[UseMI].push_back(this);
+    }
+    return nullptr;
+  }
+
   // For SDWA src operand potential instruction is one that use register
   // defined by parent instruction
   MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
@@ -420,7 +464,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
   return true;
 }
 
-MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
+MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
+                                                 const GCNSubtarget &ST,
+                                                 SDWAOperandsMap *PotentialMatches) {
   // For SDWA dst operand potential instruction is one that defines register
   // that this operand uses
   MachineRegisterInfo *MRI = getMRI();
@@ -919,8 +965,10 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
   MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
 }
 
-bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
-                                         const GCNSubtarget &ST) const {
+namespace {
+bool isConvertibleToSDWA(MachineInstr &MI,
+                         const GCNSubtarget &ST,
+                         const SIInstrInfo* TII) {
   // Check if this is already an SDWA instruction
   unsigned Opc = MI.getOpcode();
   if (TII->isSDWA(Opc))
@@ -980,6 +1028,7 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
 
   return true;
 }
+} // namespace
 
 bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                    const SDWAOperandsVector &SDWAOperands) {
@@ -1215,7 +1264,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
       matchSDWAOperands(MBB);
       for (const auto &OperandPair : SDWAOperands) {
         const auto &Operand = OperandPair.second;
-        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
         if (PotentialMI &&
            (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
             PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
@@ -1228,8 +1277,8 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
 
       for (const auto &OperandPair : SDWAOperands) {
         const auto &Operand = OperandPair.second;
-        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
-        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
+        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST, &PotentialMatches);
+        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) {
           PotentialMatches[PotentialMI].push_back(Operand.get());
         }
       }

>From 4b1a12930e9bfc8556488510291e91f1163a0b64 Mon Sep 17 00:00:00 2001
From: Brian Favela <brianfavela at microsoft.com>
Date: Thu, 6 Jun 2024 14:23:49 -0400
Subject: [PATCH 2/5] First round of test fixes

---
 .../AMDGPU/GlobalISel/cvt_f32_ubyte.ll        |  27 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   | 308 +++++-----
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   | 359 ++++++------
 .../CodeGen/AMDGPU/GlobalISel/llvm.abs.ll     |  55 +-
 .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 325 ++++++-----
 .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 323 +++++------
 llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll     |  36 +-
 llvm/test/CodeGen/AMDGPU/fract-match.ll       |  25 +-
 llvm/test/CodeGen/AMDGPU/fshr.ll              |  92 ++-
 llvm/test/CodeGen/AMDGPU/idiv-licm.ll         |  86 ++-
 llvm/test/CodeGen/AMDGPU/idot4u.ll            |  31 +-
 llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll  |  24 +-
 llvm/test/CodeGen/AMDGPU/llvm.frexp.ll        |  42 +-
 ...ne-sink-temporal-divergence-swdev407790.ll |  25 +-
 llvm/test/CodeGen/AMDGPU/permute_i8.ll        | 545 ++++++++----------
 .../AMDGPU/reassoc-mul-add-1-to-mad.ll        |   5 +-
 .../CodeGen/AMDGPU/sdwa-peephole-instr.mir    |   6 +
 llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll     |  17 +-
 update.bat                                    |  21 +
 19 files changed, 1160 insertions(+), 1192 deletions(-)
 create mode 100644 update.bat

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 02781e763f44a..eb20178f9f4d8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -771,7 +771,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v6, 8
+; VI-NEXT:    v_mov_b32_e32 v6, 9
+; VI-NEXT:    v_mov_b32_e32 v7, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -779,28 +780,28 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v1, v[0:1]
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v2, 9
+; VI-NEXT:    v_mov_b32_e32 v2, 0xff
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; VI-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v1
 ; VI-NEXT:    v_add_u16_e32 v9, 9, v1
-; VI-NEXT:    v_add_u16_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v7, 9, v7
+; VI-NEXT:    v_add_u16_sdwa v10, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
 ; VI-NEXT:    v_add_u16_e32 v8, 9, v8
-; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; VI-NEXT:    v_lshlrev_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_and_b32_e32 v1, 0xff, v8
+; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; VI-NEXT:    v_lshlrev_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v10
 ; VI-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 24, v10
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 24, v6
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    v_or_b32_e32 v2, v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 06930388901b0..4df5fa18e2942 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -1271,46 +1271,45 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX8-LABEL: v_fshl_v4i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_not_b32_e32 v7, v2
+; GFX8-NEXT:    v_mov_b32_e32 v9, 1
+; GFX8-NEXT:    v_and_b32_e32 v6, 7, v2
+; GFX8-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
-; GFX8-NEXT:    v_and_b32_e32 v8, 7, v2
-; GFX8-NEXT:    v_not_b32_e32 v2, v2
-; GFX8-NEXT:    v_mov_b32_e32 v10, 1
-; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX8-NEXT:    v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b16_e32 v8, v8, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v2, v11
+; GFX8-NEXT:    v_lshlrev_b16_e32 v6, v6, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v7, v7, v10
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, v8, v2
-; GFX8-NEXT:    v_and_b32_e32 v8, 7, v5
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX8-NEXT:    v_and_b32_e32 v7, 7, v5
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX8-NEXT:    v_and_b32_e32 v5, 7, v5
-; GFX8-NEXT:    v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_mov_b32_e32 v9, 0xff
-; GFX8-NEXT:    v_lshlrev_b16_e32 v3, v8, v3
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v3, v7, v3
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0xff
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX8-NEXT:    v_and_b32_e32 v4, 7, v6
-; GFX8-NEXT:    v_not_b32_e32 v5, v6
-; GFX8-NEXT:    v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v5, 7, v5
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 1, v6
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v5, v5, v6
-; GFX8-NEXT:    v_not_b32_e32 v6, v7
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-NEXT:    v_and_b32_e32 v5, 7, v7
-; GFX8-NEXT:    v_and_b32_e32 v6, 7, v6
-; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v6, v1
+; GFX8-NEXT:    v_mov_b32_e32 v4, 7
+; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX8-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX8-NEXT:    v_lshrrev_b16_e32 v8, 1, v8
+; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v7, v7, v8
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v5
+; GFX8-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -1321,47 +1320,46 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-LABEL: v_fshl_v4i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_not_b32_e32 v7, v2
+; GFX9-NEXT:    v_mov_b32_e32 v9, 1
+; GFX9-NEXT:    v_and_b32_e32 v6, 7, v2
+; GFX9-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
-; GFX9-NEXT:    v_and_b32_e32 v8, 7, v2
-; GFX9-NEXT:    v_not_b32_e32 v2, v2
-; GFX9-NEXT:    v_mov_b32_e32 v10, 1
-; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b16_e32 v8, v8, v0
-; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v2, v11
+; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v0
+; GFX9-NEXT:    v_lshrrev_b16_e32 v7, v7, v10
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, v8, v2
-; GFX9-NEXT:    v_and_b32_e32 v8, 7, v5
+; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX9-NEXT:    v_and_b32_e32 v7, 7, v5
 ; GFX9-NEXT:    v_not_b32_e32 v5, v5
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX9-NEXT:    v_and_b32_e32 v5, 7, v5
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0xff
-; GFX9-NEXT:    v_lshlrev_b16_e32 v3, v8, v3
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v3, v7, v3
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v4, v5, v4
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0xff
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX9-NEXT:    v_and_b32_e32 v4, 7, v6
-; GFX9-NEXT:    v_not_b32_e32 v5, v6
-; GFX9-NEXT:    v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_e32 v5, 7, v5
-; GFX9-NEXT:    v_lshrrev_b16_e32 v6, 1, v6
-; GFX9-NEXT:    v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_lshrrev_b16_e32 v5, v5, v6
-; GFX9-NEXT:    v_not_b32_e32 v6, v7
-; GFX9-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX9-NEXT:    v_and_b32_e32 v5, 7, v7
-; GFX9-NEXT:    v_and_b32_e32 v6, 7, v6
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v6, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 7
+; GFX9-NEXT:    v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX9-NEXT:    v_lshrrev_b16_e32 v10, 1, v10
+; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshrrev_b16_e32 v7, v7, v10
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
+; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_or_b32 v1, v2, v9, v1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v5
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v6, v8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
@@ -1370,42 +1368,41 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10-LABEL: v_fshl_v4i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
-; GFX10-NEXT:    v_and_b32_e32 v10, 7, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
+; GFX10-NEXT:    v_and_b32_e32 v9, 7, v2
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xff, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT:    v_not_b32_e32 v12, v7
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX10-NEXT:    v_not_b32_e32 v9, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_lshlrev_b16 v0, v10, v0
-; GFX10-NEXT:    v_not_b32_e32 v10, v8
-; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
-; GFX10-NEXT:    v_mov_b32_e32 v13, 0xff
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xff, v1
+; GFX10-NEXT:    v_lshlrev_b16 v0, v9, v0
+; GFX10-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX10-NEXT:    v_lshrrev_b16 v9, 1, v11
+; GFX10-NEXT:    v_and_b32_e32 v11, 7, v12
+; GFX10-NEXT:    v_mov_b32_e32 v12, 0xff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-NEXT:    v_lshlrev_b16 v3, v8, v3
-; GFX10-NEXT:    v_not_b32_e32 v8, v11
-; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_not_b32_e32 v13, v2
-; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
+; GFX10-NEXT:    v_lshlrev_b16 v3, v7, v3
+; GFX10-NEXT:    v_mov_b32_e32 v7, 7
+; GFX10-NEXT:    v_not_b32_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_not_b32_sdwa v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT:    v_not_b32_e32 v8, v2
 ; GFX10-NEXT:    v_lshrrev_b16 v6, 1, v6
-; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
-; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
-; GFX10-NEXT:    v_lshrrev_b16 v1, 1, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT:    v_and_b32_sdwa v14, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_and_b32_e32 v13, 7, v13
-; GFX10-NEXT:    v_lshrrev_b16 v7, 1, v7
-; GFX10-NEXT:    v_and_b32_e32 v9, 7, v9
-; GFX10-NEXT:    v_lshrrev_b16 v12, 1, v12
-; GFX10-NEXT:    v_lshrrev_b16 v6, v10, v6
-; GFX10-NEXT:    v_lshlrev_b16 v4, v11, v4
-; GFX10-NEXT:    v_lshrrev_b16 v1, v8, v1
+; GFX10-NEXT:    v_lshrrev_b16 v1, 1, v1
+; GFX10-NEXT:    v_and_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_e32 v7, 7, v12
+; GFX10-NEXT:    v_lshrrev_b16 v10, 1, v10
+; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
+; GFX10-NEXT:    v_lshrrev_b16 v6, v11, v6
+; GFX10-NEXT:    v_lshlrev_b16 v4, v14, v4
+; GFX10-NEXT:    v_lshrrev_b16 v1, v13, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v2, v2, v5
-; GFX10-NEXT:    v_lshrrev_b16 v5, v13, v7
-; GFX10-NEXT:    v_lshrrev_b16 v7, v9, v12
+; GFX10-NEXT:    v_lshrrev_b16 v5, v7, v10
+; GFX10-NEXT:    v_lshrrev_b16 v7, v8, v9
 ; GFX10-NEXT:    v_or_b32_e32 v3, v3, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 8
 ; GFX10-NEXT:    v_or_b32_e32 v1, v4, v1
@@ -3932,25 +3929,26 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX8-LABEL: v_fshl_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 15, v2
-; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v2
+; GFX8-NEXT:    v_and_b32_e32 v3, 15, v2
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 1, v1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v4, v4, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v2, v5
-; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 15, v3
-; GFX8-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX8-NEXT:    v_lshlrev_b16_e32 v3, v3, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v4, v5
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_mov_b32_e32 v4, 15
+; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v5, -1
+; GFX8-NEXT:    v_xor_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 1
-; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v3, v1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_v2i16:
@@ -4083,27 +4081,28 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ;
 ; GFX8-LABEL: v_fshl_v2i16_ssv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX8-NEXT:    v_lshlrev_b16_e64 v2, v2, s0
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v0
+; GFX8-NEXT:    v_lshlrev_b16_e64 v1, v1, s0
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s1
-; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v2, v2, s0
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, 15
+; GFX8-NEXT:    v_mov_b32_e32 v3, -1
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, 15, v1
-; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_xor_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX8-NEXT:    s_lshr_b32 s0, s3, 1
 ; GFX8-NEXT:    v_lshlrev_b16_e64 v2, v2, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, v1, s0
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_fshl_v2i16_ssv:
@@ -4620,32 +4619,33 @@ define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX8-LABEL: v_fshl_v3i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v7, 15, v4
-; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT:    v_xor_b32_e32 v7, -1, v4
+; GFX8-NEXT:    v_and_b32_e32 v6, 15, v4
+; GFX8-NEXT:    v_and_b32_e32 v7, 15, v7
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v8, 1, v2
-; GFX8-NEXT:    v_lshlrev_b16_e32 v7, v7, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v4, v8
-; GFX8-NEXT:    v_or_b32_e32 v4, v7, v4
-; GFX8-NEXT:    v_and_b32_e32 v7, 15, v6
-; GFX8-NEXT:    v_xor_b32_e32 v6, -1, v6
+; GFX8-NEXT:    v_lshlrev_b16_e32 v6, v6, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v7, v7, v8
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX8-NEXT:    v_mov_b32_e32 v7, 15
+; GFX8-NEXT:    v_and_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v8, -1
+; GFX8-NEXT:    v_xor_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 1
-; GFX8-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v6, v2
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v5
-; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v5
-; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v5
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v3
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v5, v2
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4984,42 +4984,42 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX8-LABEL: v_fshl_v4i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v8, 15, v4
-; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX8-NEXT:    v_lshrrev_b16_e32 v9, 1, v2
-; GFX8-NEXT:    v_lshlrev_b16_e32 v8, v8, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v4, v9
-; GFX8-NEXT:    v_or_b32_e32 v4, v8, v4
-; GFX8-NEXT:    v_and_b32_e32 v8, 15, v6
-; GFX8-NEXT:    v_xor_b32_e32 v6, -1, v6
+; GFX8-NEXT:    v_xor_b32_e32 v7, -1, v4
+; GFX8-NEXT:    v_and_b32_e32 v6, 15, v4
+; GFX8-NEXT:    v_and_b32_e32 v7, 15, v7
+; GFX8-NEXT:    v_lshrrev_b16_e32 v8, 1, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v6, v6, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v7, v7, v8
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX8-NEXT:    v_mov_b32_e32 v7, 15
+; GFX8-NEXT:    v_and_b32_sdwa v8, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v9, -1
+; GFX8-NEXT:    v_xor_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 1
-; GFX8-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v6, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
+; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v5
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v5
-; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v5
-; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 1, v3
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT:    v_lshrrev_b16_e32 v10, 1, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, v2, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v5, v5, v6
-; GFX8-NEXT:    v_xor_b32_e32 v6, -1, v7
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX8-NEXT:    v_and_b32_e32 v5, 15, v7
-; GFX8-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v4, v10
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX8-NEXT:    v_and_b32_sdwa v4, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_xor_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, v6, v3
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index ff93cddafc872..61588e640be18 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -1272,46 +1272,45 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX8-LABEL: v_fshr_v4i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_not_b32_e32 v7, v2
+; GFX8-NEXT:    v_and_b32_e32 v6, 7, v2
+; GFX8-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
-; GFX8-NEXT:    v_and_b32_e32 v8, 7, v2
-; GFX8-NEXT:    v_not_b32_e32 v2, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 1, v0
-; GFX8-NEXT:    v_lshlrev_b16_e32 v2, v2, v9
-; GFX8-NEXT:    v_lshrrev_b16_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v8
-; GFX8-NEXT:    v_and_b32_e32 v8, 7, v5
+; GFX8-NEXT:    v_or_b32_e32 v6, v7, v6
+; GFX8-NEXT:    v_and_b32_e32 v7, 7, v5
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
 ; GFX8-NEXT:    v_and_b32_e32 v5, 7, v5
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, v5, v3
-; GFX8-NEXT:    v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX8-NEXT:    v_and_b32_e32 v4, 7, v6
-; GFX8-NEXT:    v_not_b32_e32 v5, v6
-; GFX8-NEXT:    v_mov_b32_e32 v6, 1
-; GFX8-NEXT:    v_mov_b32_e32 v9, 0xff
-; GFX8-NEXT:    v_and_b32_e32 v5, 7, v5
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v5, v5, v8
-; GFX8-NEXT:    v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v4, v8
-; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 7, v7
-; GFX8-NEXT:    v_not_b32_e32 v7, v7
+; GFX8-NEXT:    v_mov_b32_e32 v4, 7
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0xff
+; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v9, 1
+; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
 ; GFX8-NEXT:    v_and_b32_e32 v7, 7, v7
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v7, v0
-; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v10, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT:    v_lshlrev_b16_e32 v7, v7, v10
+; GFX8-NEXT:    v_lshrrev_b16_e32 v5, v5, v8
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v5
+; GFX8-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -1322,47 +1321,46 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-LABEL: v_fshr_v4i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_not_b32_e32 v7, v2
+; GFX9-NEXT:    v_and_b32_e32 v6, 7, v2
+; GFX9-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 1, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
-; GFX9-NEXT:    v_and_b32_e32 v8, 7, v2
-; GFX9-NEXT:    v_not_b32_e32 v2, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX9-NEXT:    v_lshlrev_b16_e32 v9, 1, v0
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, v2, v9
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
-; GFX9-NEXT:    v_or_b32_e32 v2, v2, v8
-; GFX9-NEXT:    v_and_b32_e32 v8, 7, v5
+; GFX9-NEXT:    v_or_b32_e32 v6, v7, v6
+; GFX9-NEXT:    v_and_b32_e32 v7, 7, v5
 ; GFX9-NEXT:    v_not_b32_e32 v5, v5
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
 ; GFX9-NEXT:    v_and_b32_e32 v5, 7, v5
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v3, v5, v3
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX9-NEXT:    v_and_b32_e32 v4, 7, v6
-; GFX9-NEXT:    v_not_b32_e32 v5, v6
-; GFX9-NEXT:    v_mov_b32_e32 v6, 1
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0xff
-; GFX9-NEXT:    v_and_b32_e32 v5, 7, v5
-; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_lshlrev_b16_e32 v5, v5, v8
-; GFX9-NEXT:    v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_lshrrev_b16_e32 v4, v4, v8
-; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT:    v_and_b32_e32 v5, 7, v7
-; GFX9-NEXT:    v_not_b32_e32 v7, v7
+; GFX9-NEXT:    v_mov_b32_e32 v4, 7
+; GFX9-NEXT:    v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v9, 1
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0xff
+; GFX9-NEXT:    v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v7, 7, v7
-; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v7, v0
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v10, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT:    v_lshlrev_b16_e32 v7, v7, v10
+; GFX9-NEXT:    v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_lshrrev_b16_e32 v5, v5, v10
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_or_b32_e32 v5, v7, v5
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_or_b32 v1, v2, v9, v1
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v5
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v6, v8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
@@ -1372,52 +1370,51 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
 ; GFX10-NEXT:    v_not_b32_e32 v8, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
 ; GFX10-NEXT:    v_not_b32_e32 v10, v5
-; GFX10-NEXT:    v_lshlrev_b16 v3, 1, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
-; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
-; GFX10-NEXT:    v_mov_b32_e32 v13, 0xff
-; GFX10-NEXT:    v_not_b32_e32 v14, v12
-; GFX10-NEXT:    v_lshlrev_b16 v3, v10, v3
-; GFX10-NEXT:    v_not_b32_e32 v10, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX10-NEXT:    v_lshlrev_b16 v4, 1, v4
+; GFX10-NEXT:    v_mov_b32_e32 v3, 7
+; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xff, v1
+; GFX10-NEXT:    v_not_b32_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_not_b32_sdwa v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT:    v_lshlrev_b16 v4, v10, v4
+; GFX10-NEXT:    v_mov_b32_e32 v10, 0xff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 24, v1
+; GFX10-NEXT:    v_and_b32_e32 v12, 7, v2
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xff, v1
 ; GFX10-NEXT:    v_and_b32_e32 v5, 7, v5
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
-; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
-; GFX10-NEXT:    v_lshlrev_b16 v4, 1, v4
-; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_b32_e32 v13, 7, v14
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX10-NEXT:    v_and_b32_sdwa v15, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
 ; GFX10-NEXT:    v_lshlrev_b16 v6, 1, v6
-; GFX10-NEXT:    v_and_b32_e32 v12, 7, v12
-; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX10-NEXT:    v_lshrrev_b16 v5, v5, v7
-; GFX10-NEXT:    v_lshlrev_b16 v4, v10, v4
-; GFX10-NEXT:    v_lshrrev_b16 v1, v11, v1
-; GFX10-NEXT:    v_lshlrev_b16 v6, v13, v6
-; GFX10-NEXT:    v_lshrrev_b16 v7, v12, v9
-; GFX10-NEXT:    v_lshrrev_b16 v2, v2, v8
-; GFX10-NEXT:    v_or_b32_e32 v3, v3, v5
-; GFX10-NEXT:    v_mov_b32_e32 v5, 8
-; GFX10-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX10-NEXT:    v_or_b32_e32 v4, v6, v7
-; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_e32 v10, 7, v14
+; GFX10-NEXT:    v_lshlrev_b16 v7, 1, v7
+; GFX10-NEXT:    v_and_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b16 v3, v5, v9
+; GFX10-NEXT:    v_lshlrev_b16 v5, v8, v6
+; GFX10-NEXT:    v_lshrrev_b16 v1, v15, v1
+; GFX10-NEXT:    v_lshlrev_b16 v6, v10, v7
+; GFX10-NEXT:    v_lshrrev_b16 v2, v2, v11
+; GFX10-NEXT:    v_lshrrev_b16 v7, v12, v13
+; GFX10-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX10-NEXT:    v_mov_b32_e32 v4, 8
+; GFX10-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX10-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3718,29 +3715,29 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 15
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshrrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
-; GFX8-NEXT:    v_lshlrev_b16_e32 v5, 1, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 1, v1
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v6, 15, v2
-; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v2
+; GFX8-NEXT:    v_xor_b32_e32 v7, -1, v2
+; GFX8-NEXT:    v_and_b32_e32 v7, 15, v7
+; GFX8-NEXT:    v_lshlrev_b16_e32 v3, v4, v3
+; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 1, v6
+; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v7, v4
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v5, -1
+; GFX8-NEXT:    v_xor_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 1, v5
-; GFX8-NEXT:    v_lshlrev_b16_e32 v3, v6, v3
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v2, v5
-; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 15, v4
-; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v4, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v2i16:
@@ -3896,30 +3893,31 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 1
-; GFX8-NEXT:    v_and_b32_e32 v2, 15, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX8-NEXT:    v_lshlrev_b16_e64 v2, v2, s0
+; GFX8-NEXT:    v_and_b32_e32 v1, 15, v0
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v0
+; GFX8-NEXT:    v_lshlrev_b16_e64 v1, v1, s0
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s1
-; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX8-NEXT:    v_lshrrev_b16_e64 v2, v2, s0
 ; GFX8-NEXT:    s_lshr_b32 s4, s3, 15
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, 15
+; GFX8-NEXT:    v_mov_b32_e32 v3, -1
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, 15, v1
-; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_xor_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s3
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
-; GFX8-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
 ; GFX8-NEXT:    v_lshlrev_b16_e64 v2, v2, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, v1, s0
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_fshr_v2i16_ssv:
@@ -4536,47 +4534,47 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX8-LABEL: v_fshr_v3i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v8, 15, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX8-NEXT:    v_mov_b32_e32 v8, 1
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v8, 15, v6
-; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 1, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v9, 15, v4
+; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 1, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 15, v2
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX8-NEXT:    v_mov_b32_e32 v7, 1
+; GFX8-NEXT:    v_mov_b32_e32 v8, 15
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v9, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 1, v2
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_and_b32_e32 v7, 15, v4
+; GFX8-NEXT:    v_xor_b32_e32 v10, -1, v4
+; GFX8-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX8-NEXT:    v_lshlrev_b16_e32 v6, v7, v6
+; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 1, v9
+; GFX8-NEXT:    v_lshrrev_b16_e32 v7, v10, v7
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX8-NEXT:    v_and_b32_sdwa v7, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v8, -1
+; GFX8-NEXT:    v_xor_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v2
-; GFX8-NEXT:    v_lshlrev_b16_e32 v7, v9, v7
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v7, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
-; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 1, v6
-; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 15, v8
-; GFX8-NEXT:    v_xor_b32_e32 v7, -1, v8
-; GFX8-NEXT:    v_and_b32_e32 v7, 15, v7
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 1, v6
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v7, v4
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 15, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v5
-; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 15, v4
-; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 1, v3
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, v5, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, v4, v3
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 15, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 1, v3
+; GFX8-NEXT:    v_xor_b32_e32 v3, -1, v5
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v3
+; GFX8-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, v4, v1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5001,44 +4999,43 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 1, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v11, 15, v4
-; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT:    v_xor_b32_e32 v11, -1, v4
+; GFX8-NEXT:    v_and_b32_e32 v10, 15, v4
+; GFX8-NEXT:    v_and_b32_e32 v11, 15, v11
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v9, 1, v9
+; GFX8-NEXT:    v_lshlrev_b16_e32 v6, v10, v6
+; GFX8-NEXT:    v_lshrrev_b16_e32 v9, v11, v9
+; GFX8-NEXT:    v_mov_b32_e32 v10, -1
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v6, v11, v6
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v4, v9
-; GFX8-NEXT:    v_xor_b32_e32 v9, -1, v10
-; GFX8-NEXT:    v_or_b32_e32 v4, v6, v4
-; GFX8-NEXT:    v_and_b32_e32 v6, 15, v10
-; GFX8-NEXT:    v_and_b32_e32 v9, 15, v9
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX8-NEXT:    v_and_b32_sdwa v9, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_xor_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v2
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v6, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v9, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v9, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 1, v1
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 15, v3
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v4, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 1, v3
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; GFX8-NEXT:    v_and_b32_e32 v7, 15, v5
-; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v5
-; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX8-NEXT:    v_xor_b32_e32 v7, -1, v5
+; GFX8-NEXT:    v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v6, 15, v5
+; GFX8-NEXT:    v_and_b32_e32 v7, 15, v7
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 1, v4
-; GFX8-NEXT:    v_lshlrev_b16_e32 v2, v7, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v5, v4
-; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v6
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, v6, v2
+; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v7, v4
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT:    v_and_b32_e32 v4, 15, v6
+; GFX8-NEXT:    v_and_b32_sdwa v4, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_xor_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 1, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, v4, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 3ef059057ac8e..41e915a4c1011 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -248,13 +248,12 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
 ; GFX8-LABEL: abs_vgpr_v2i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_sub_u16_e32 v2, 0, v0
-; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 8, v1
-; GFX8-NEXT:    v_max_i16_e32 v0, v0, v2
-; GFX8-NEXT:    v_sub_u16_e32 v2, 0, v1
-; GFX8-NEXT:    v_max_i16_e32 v1, v1, v2
+; GFX8-NEXT:    v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -340,17 +339,15 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8>  %arg) {
 ; GFX8-LABEL: abs_vgpr_v3i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_sub_u16_e32 v3, 0, v0
-; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX8-NEXT:    v_max_i16_e32 v0, v0, v3
-; GFX8-NEXT:    v_sub_u16_e32 v3, 0, v1
-; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 8, v2
-; GFX8-NEXT:    v_max_i16_e32 v1, v1, v3
-; GFX8-NEXT:    v_sub_u16_e32 v3, 0, v2
-; GFX8-NEXT:    v_max_i16_e32 v2, v2, v3
+; GFX8-NEXT:    v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_sdwa v4, v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
@@ -424,12 +421,12 @@ define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
 ;
 ; GFX8-LABEL: abs_vgpr_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_sub_u16_e32 v2, 0, v0
-; GFX8-NEXT:    v_sub_u16_e32 v3, 0, v1
-; GFX8-NEXT:    v_max_i16_e32 v0, v0, v2
-; GFX8-NEXT:    v_max_i16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_sub_u16_e32 v1, 0, v0
+; GFX8-NEXT:    v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_max_i16_e32 v1, v0, v1
+; GFX8-NEXT:    v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -503,14 +500,14 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
 ;
 ; GFX8-LABEL: abs_vgpr_v3i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    v_sub_u16_e32 v3, 0, v0
-; GFX8-NEXT:    v_sub_u16_e32 v4, 0, v2
-; GFX8-NEXT:    v_sub_u16_e32 v5, 0, v1
-; GFX8-NEXT:    v_max_i16_e32 v0, v0, v3
-; GFX8-NEXT:    v_max_i16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT:    v_max_i16_e32 v1, v1, v5
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_sub_u16_e32 v2, 0, v0
+; GFX8-NEXT:    v_sub_u16_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_sub_u16_e32 v4, 0, v1
+; GFX8-NEXT:    v_max_i16_e32 v2, v0, v2
+; GFX8-NEXT:    v_max_i16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_max_i16_e32 v1, v1, v4
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX8-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index a6f9bb7ee055d..04da7b24156dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -298,8 +298,9 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -606,10 +607,12 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_add_u16_e32 v3, v3, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 8, v2
 ; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -2774,22 +2777,22 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-LABEL: v_saddsat_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_min_i16_e32 v4, 0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    v_max_i16_e32 v3, 0, v0
-; GFX8-NEXT:    v_sub_u16_e32 v4, 0x8000, v4
-; GFX8-NEXT:    v_sub_u16_e32 v3, 0x7fff, v3
-; GFX8-NEXT:    v_max_i16_e32 v4, v4, v1
-; GFX8-NEXT:    v_min_i16_e32 v5, 0, v2
-; GFX8-NEXT:    v_min_i16_e32 v3, v4, v3
-; GFX8-NEXT:    v_max_i16_e32 v4, 0, v2
-; GFX8-NEXT:    v_sub_u16_e32 v5, 0x8000, v5
+; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
+; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
+; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
+; GFX8-NEXT:    v_sub_u16_e32 v2, 0x7fff, v2
+; GFX8-NEXT:    v_max_i16_e32 v3, v3, v1
+; GFX8-NEXT:    v_min_i16_e32 v2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_max_i16_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_min_i16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v4, 0x7fff, v4
-; GFX8-NEXT:    v_max_i16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_max_i16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
-; GFX8-NEXT:    v_add_u16_e32 v0, v0, v3
-; GFX8-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_add_u16_e32 v2, v0, v2
+; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v2i16:
@@ -2987,23 +2990,23 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ;
 ; GFX8-LABEL: saddsat_v2i16_vs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
-; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
-; GFX8-NEXT:    v_sub_u16_e32 v2, 0x7fff, v2
-; GFX8-NEXT:    v_max_i16_e32 v3, s0, v3
-; GFX8-NEXT:    v_min_i16_e32 v4, 0, v1
+; GFX8-NEXT:    v_min_i16_e32 v2, 0, v0
+; GFX8-NEXT:    v_max_i16_e32 v1, 0, v0
+; GFX8-NEXT:    v_sub_u16_e32 v2, 0x8000, v2
+; GFX8-NEXT:    v_sub_u16_e32 v1, 0x7fff, v1
+; GFX8-NEXT:    v_max_i16_e32 v2, s0, v2
+; GFX8-NEXT:    v_min_i16_e32 v1, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_max_i16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_min_i16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    v_min_i16_e32 v2, v3, v2
-; GFX8-NEXT:    v_max_i16_e32 v3, 0, v1
-; GFX8-NEXT:    v_sub_u16_e32 v4, 0x8000, v4
+; GFX8-NEXT:    v_sub_u16_e32 v2, 0x8000, v2
 ; GFX8-NEXT:    v_sub_u16_e32 v3, 0x7fff, v3
-; GFX8-NEXT:    v_max_i16_e32 v4, s1, v4
-; GFX8-NEXT:    v_min_i16_e32 v3, v4, v3
-; GFX8-NEXT:    v_add_u16_e32 v0, v0, v2
-; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_max_i16_e32 v2, s1, v2
+; GFX8-NEXT:    v_min_i16_e32 v2, v2, v3
+; GFX8-NEXT:    v_add_u16_e32 v1, v0, v1
+; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: saddsat_v2i16_vs:
@@ -3090,38 +3093,37 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-LABEL: v_saddsat_v4i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_min_i16_e32 v7, 0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_max_i16_e32 v6, 0, v0
+; GFX8-NEXT:    v_min_i16_e32 v5, 0, v0
+; GFX8-NEXT:    v_max_i16_e32 v4, 0, v0
+; GFX8-NEXT:    v_sub_u16_e32 v5, 0x8000, v5
+; GFX8-NEXT:    v_sub_u16_e32 v4, 0x7fff, v4
+; GFX8-NEXT:    v_max_i16_e32 v5, v5, v2
+; GFX8-NEXT:    v_min_i16_e32 v4, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    v_min_i16_sdwa v7, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_max_i16_sdwa v6, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_e32 v7, 0x8000, v7
+; GFX8-NEXT:    v_sub_u16_e32 v6, 0x7fff, v6
+; GFX8-NEXT:    v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_min_i16_e32 v7, 0, v1
+; GFX8-NEXT:    v_min_i16_e32 v2, v2, v6
+; GFX8-NEXT:    v_max_i16_e32 v6, 0, v1
 ; GFX8-NEXT:    v_sub_u16_e32 v7, 0x8000, v7
 ; GFX8-NEXT:    v_sub_u16_e32 v6, 0x7fff, v6
-; GFX8-NEXT:    v_max_i16_e32 v7, v7, v2
-; GFX8-NEXT:    v_min_i16_e32 v8, 0, v4
+; GFX8-NEXT:    v_max_i16_e32 v7, v7, v3
 ; GFX8-NEXT:    v_min_i16_e32 v6, v7, v6
-; GFX8-NEXT:    v_max_i16_e32 v7, 0, v4
-; GFX8-NEXT:    v_sub_u16_e32 v8, 0x8000, v8
-; GFX8-NEXT:    v_sub_u16_e32 v7, 0x7fff, v7
-; GFX8-NEXT:    v_max_i16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_min_i16_e32 v8, 0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_min_i16_e32 v2, v2, v7
-; GFX8-NEXT:    v_max_i16_e32 v7, 0, v1
-; GFX8-NEXT:    v_sub_u16_e32 v8, 0x8000, v8
+; GFX8-NEXT:    v_max_i16_sdwa v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_min_i16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_e32 v5, 0x8000, v5
 ; GFX8-NEXT:    v_sub_u16_e32 v7, 0x7fff, v7
-; GFX8-NEXT:    v_max_i16_e32 v8, v8, v3
-; GFX8-NEXT:    v_min_i16_e32 v9, 0, v5
-; GFX8-NEXT:    v_min_i16_e32 v7, v8, v7
-; GFX8-NEXT:    v_max_i16_e32 v8, 0, v5
-; GFX8-NEXT:    v_sub_u16_e32 v9, 0x8000, v9
-; GFX8-NEXT:    v_sub_u16_e32 v8, 0x7fff, v8
-; GFX8-NEXT:    v_max_i16_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_min_i16_e32 v3, v3, v8
-; GFX8-NEXT:    v_add_u16_e32 v0, v0, v6
-; GFX8-NEXT:    v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT:    v_add_u16_e32 v1, v1, v7
-; GFX8-NEXT:    v_add_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_max_i16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_min_i16_e32 v3, v3, v7
+; GFX8-NEXT:    v_add_u16_e32 v4, v0, v4
+; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_add_u16_e32 v2, v1, v6
+; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v4i16:
@@ -3376,54 +3378,52 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8-LABEL: v_saddsat_v6i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_min_i16_e32 v10, 0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    v_max_i16_e32 v9, 0, v0
+; GFX8-NEXT:    v_min_i16_e32 v7, 0, v0
+; GFX8-NEXT:    v_max_i16_e32 v6, 0, v0
+; GFX8-NEXT:    v_sub_u16_e32 v7, 0x8000, v7
+; GFX8-NEXT:    v_sub_u16_e32 v6, 0x7fff, v6
+; GFX8-NEXT:    v_max_i16_e32 v7, v7, v3
+; GFX8-NEXT:    v_min_i16_e32 v6, v7, v6
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0
+; GFX8-NEXT:    v_min_i16_sdwa v9, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_max_i16_sdwa v8, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_e32 v9, 0x8000, v9
+; GFX8-NEXT:    v_sub_u16_e32 v8, 0x7fff, v8
+; GFX8-NEXT:    v_max_i16_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_min_i16_e32 v9, 0, v1
+; GFX8-NEXT:    v_min_i16_e32 v3, v3, v8
+; GFX8-NEXT:    v_max_i16_e32 v8, 0, v1
+; GFX8-NEXT:    v_sub_u16_e32 v9, 0x8000, v9
+; GFX8-NEXT:    v_sub_u16_e32 v8, 0x7fff, v8
+; GFX8-NEXT:    v_max_i16_e32 v9, v9, v4
+; GFX8-NEXT:    v_min_i16_sdwa v10, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_min_i16_e32 v8, v9, v8
+; GFX8-NEXT:    v_max_i16_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_e32 v10, 0x8000, v10
+; GFX8-NEXT:    v_sub_u16_e32 v9, 0x7fff, v9
+; GFX8-NEXT:    v_max_i16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_min_i16_e32 v10, 0, v2
+; GFX8-NEXT:    v_min_i16_e32 v4, v4, v9
+; GFX8-NEXT:    v_max_i16_e32 v9, 0, v2
 ; GFX8-NEXT:    v_sub_u16_e32 v10, 0x8000, v10
 ; GFX8-NEXT:    v_sub_u16_e32 v9, 0x7fff, v9
-; GFX8-NEXT:    v_max_i16_e32 v10, v10, v3
-; GFX8-NEXT:    v_min_i16_e32 v11, 0, v6
+; GFX8-NEXT:    v_max_i16_e32 v10, v10, v5
 ; GFX8-NEXT:    v_min_i16_e32 v9, v10, v9
-; GFX8-NEXT:    v_max_i16_e32 v10, 0, v6
-; GFX8-NEXT:    v_sub_u16_e32 v11, 0x8000, v11
-; GFX8-NEXT:    v_sub_u16_e32 v10, 0x7fff, v10
-; GFX8-NEXT:    v_max_i16_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_min_i16_e32 v11, 0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX8-NEXT:    v_min_i16_e32 v3, v3, v10
-; GFX8-NEXT:    v_max_i16_e32 v10, 0, v1
-; GFX8-NEXT:    v_sub_u16_e32 v11, 0x8000, v11
+; GFX8-NEXT:    v_max_i16_sdwa v10, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_min_i16_sdwa v7, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_e32 v7, 0x8000, v7
 ; GFX8-NEXT:    v_sub_u16_e32 v10, 0x7fff, v10
-; GFX8-NEXT:    v_max_i16_e32 v11, v11, v4
-; GFX8-NEXT:    v_min_i16_e32 v12, 0, v7
-; GFX8-NEXT:    v_min_i16_e32 v10, v11, v10
-; GFX8-NEXT:    v_max_i16_e32 v11, 0, v7
-; GFX8-NEXT:    v_sub_u16_e32 v12, 0x8000, v12
-; GFX8-NEXT:    v_sub_u16_e32 v11, 0x7fff, v11
-; GFX8-NEXT:    v_max_i16_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_min_i16_e32 v12, 0, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX8-NEXT:    v_min_i16_e32 v4, v4, v11
-; GFX8-NEXT:    v_max_i16_e32 v11, 0, v2
-; GFX8-NEXT:    v_sub_u16_e32 v12, 0x8000, v12
-; GFX8-NEXT:    v_sub_u16_e32 v11, 0x7fff, v11
-; GFX8-NEXT:    v_max_i16_e32 v12, v12, v5
-; GFX8-NEXT:    v_min_i16_e32 v13, 0, v8
-; GFX8-NEXT:    v_min_i16_e32 v11, v12, v11
-; GFX8-NEXT:    v_max_i16_e32 v12, 0, v8
-; GFX8-NEXT:    v_sub_u16_e32 v13, 0x8000, v13
-; GFX8-NEXT:    v_sub_u16_e32 v12, 0x7fff, v12
-; GFX8-NEXT:    v_max_i16_sdwa v5, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_add_u16_e32 v0, v0, v9
-; GFX8-NEXT:    v_add_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_i16_e32 v5, v5, v12
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT:    v_add_u16_e32 v1, v1, v10
-; GFX8-NEXT:    v_add_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_add_u16_e32 v2, v2, v11
-; GFX8-NEXT:    v_add_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT:    v_max_i16_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_min_i16_e32 v5, v5, v10
+; GFX8-NEXT:    v_add_u16_e32 v6, v0, v6
+; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_add_u16_e32 v3, v1, v8
+; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT:    v_add_u16_e32 v3, v2, v9
+; GFX8-NEXT:    v_add_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v6i16:
@@ -3752,70 +3752,67 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX8-LABEL: v_saddsat_v8i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_min_i16_e32 v13, 0, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX8-NEXT:    v_max_i16_e32 v12, 0, v0
+; GFX8-NEXT:    v_min_i16_e32 v9, 0, v0
+; GFX8-NEXT:    v_max_i16_e32 v8, 0, v0
+; GFX8-NEXT:    v_sub_u16_e32 v9, 0x8000, v9
+; GFX8-NEXT:    v_sub_u16_e32 v8, 0x7fff, v8
+; GFX8-NEXT:    v_max_i16_e32 v9, v9, v4
+; GFX8-NEXT:    v_min_i16_e32 v8, v9, v8
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_min_i16_sdwa v11, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_max_i16_sdwa v10, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_e32 v11, 0x8000, v11
+; GFX8-NEXT:    v_sub_u16_e32 v10, 0x7fff, v10
+; GFX8-NEXT:    v_max_i16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_min_i16_e32 v11, 0, v1
+; GFX8-NEXT:    v_min_i16_e32 v4, v4, v10
+; GFX8-NEXT:    v_max_i16_e32 v10, 0, v1
+; GFX8-NEXT:    v_sub_u16_e32 v11, 0x8000, v11
+; GFX8-NEXT:    v_sub_u16_e32 v10, 0x7fff, v10
+; GFX8-NEXT:    v_max_i16_e32 v11, v11, v5
+; GFX8-NEXT:    v_min_i16_sdwa v12, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_min_i16_e32 v10, v11, v10
+; GFX8-NEXT:    v_max_i16_sdwa v11, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_e32 v12, 0x8000, v12
+; GFX8-NEXT:    v_sub_u16_e32 v11, 0x7fff, v11
+; GFX8-NEXT:    v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_min_i16_e32 v12, 0, v2
+; GFX8-NEXT:    v_min_i16_e32 v5, v5, v11
+; GFX8-NEXT:    v_max_i16_e32 v11, 0, v2
+; GFX8-NEXT:    v_sub_u16_e32 v12, 0x8000, v12
+; GFX8-NEXT:    v_sub_u16_e32 v11, 0x7fff, v11
+; GFX8-NEXT:    v_max_i16_e32 v12, v12, v6
+; GFX8-NEXT:    v_min_i16_sdwa v13, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_min_i16_e32 v11, v12, v11
+; GFX8-NEXT:    v_max_i16_sdwa v12, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_e32 v13, 0x8000, v13
+; GFX8-NEXT:    v_sub_u16_e32 v12, 0x7fff, v12
+; GFX8-NEXT:    v_max_i16_sdwa v6, v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_min_i16_e32 v13, 0, v3
+; GFX8-NEXT:    v_min_i16_e32 v6, v6, v12
+; GFX8-NEXT:    v_max_i16_e32 v12, 0, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v13, 0x8000, v13
 ; GFX8-NEXT:    v_sub_u16_e32 v12, 0x7fff, v12
-; GFX8-NEXT:    v_max_i16_e32 v13, v13, v4
-; GFX8-NEXT:    v_min_i16_e32 v14, 0, v8
+; GFX8-NEXT:    v_max_i16_e32 v13, v13, v7
 ; GFX8-NEXT:    v_min_i16_e32 v12, v13, v12
-; GFX8-NEXT:    v_max_i16_e32 v13, 0, v8
-; GFX8-NEXT:    v_sub_u16_e32 v14, 0x8000, v14
-; GFX8-NEXT:    v_sub_u16_e32 v13, 0x7fff, v13
-; GFX8-NEXT:    v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_min_i16_e32 v14, 0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX8-NEXT:    v_min_i16_e32 v4, v4, v13
-; GFX8-NEXT:    v_max_i16_e32 v13, 0, v1
-; GFX8-NEXT:    v_sub_u16_e32 v14, 0x8000, v14
+; GFX8-NEXT:    v_max_i16_sdwa v13, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_min_i16_sdwa v9, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_e32 v9, 0x8000, v9
 ; GFX8-NEXT:    v_sub_u16_e32 v13, 0x7fff, v13
-; GFX8-NEXT:    v_max_i16_e32 v14, v14, v5
-; GFX8-NEXT:    v_min_i16_e32 v15, 0, v9
-; GFX8-NEXT:    v_min_i16_e32 v13, v14, v13
-; GFX8-NEXT:    v_max_i16_e32 v14, 0, v9
-; GFX8-NEXT:    v_sub_u16_e32 v15, 0x8000, v15
-; GFX8-NEXT:    v_sub_u16_e32 v14, 0x7fff, v14
-; GFX8-NEXT:    v_max_i16_sdwa v5, v15, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_min_i16_e32 v15, 0, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX8-NEXT:    v_min_i16_e32 v5, v5, v14
-; GFX8-NEXT:    v_max_i16_e32 v14, 0, v2
-; GFX8-NEXT:    v_sub_u16_e32 v15, 0x8000, v15
-; GFX8-NEXT:    v_sub_u16_e32 v14, 0x7fff, v14
-; GFX8-NEXT:    v_max_i16_e32 v15, v15, v6
-; GFX8-NEXT:    v_min_i16_e32 v16, 0, v10
-; GFX8-NEXT:    v_min_i16_e32 v14, v15, v14
-; GFX8-NEXT:    v_max_i16_e32 v15, 0, v10
-; GFX8-NEXT:    v_sub_u16_e32 v16, 0x8000, v16
-; GFX8-NEXT:    v_sub_u16_e32 v15, 0x7fff, v15
-; GFX8-NEXT:    v_max_i16_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_min_i16_e32 v16, 0, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX8-NEXT:    v_min_i16_e32 v6, v6, v15
-; GFX8-NEXT:    v_max_i16_e32 v15, 0, v3
-; GFX8-NEXT:    v_sub_u16_e32 v16, 0x8000, v16
-; GFX8-NEXT:    v_sub_u16_e32 v15, 0x7fff, v15
-; GFX8-NEXT:    v_max_i16_e32 v16, v16, v7
-; GFX8-NEXT:    v_min_i16_e32 v17, 0, v11
-; GFX8-NEXT:    v_min_i16_e32 v15, v16, v15
-; GFX8-NEXT:    v_max_i16_e32 v16, 0, v11
-; GFX8-NEXT:    v_sub_u16_e32 v17, 0x8000, v17
-; GFX8-NEXT:    v_add_u16_e32 v0, v0, v12
-; GFX8-NEXT:    v_add_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_sub_u16_e32 v16, 0x7fff, v16
-; GFX8-NEXT:    v_max_i16_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_add_u16_e32 v1, v1, v13
-; GFX8-NEXT:    v_add_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_i16_e32 v7, v7, v16
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_add_u16_e32 v2, v2, v14
-; GFX8-NEXT:    v_add_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT:    v_add_u16_e32 v3, v3, v15
-; GFX8-NEXT:    v_add_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_max_i16_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_add_u16_e32 v8, v0, v8
+; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_add_u16_e32 v4, v1, v10
+; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_min_i16_e32 v7, v7, v13
+; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT:    v_add_u16_e32 v4, v2, v11
+; GFX8-NEXT:    v_add_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT:    v_add_u16_e32 v4, v3, v12
+; GFX8-NEXT:    v_add_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v8i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 320dfbb4980e4..d7b1e37a81a1b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -279,9 +279,11 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_max_i16_e32 v1, v1, v2
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v3, v1
+; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -298,8 +300,9 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2774,22 +2777,22 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-LABEL: v_ssubsat_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_i16_e32 v3, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x7fff, v3
-; GFX8-NEXT:    v_min_i16_e32 v4, -1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v4, 0x8000, v4
-; GFX8-NEXT:    v_max_i16_e32 v3, v3, v1
-; GFX8-NEXT:    v_min_i16_e32 v3, v3, v4
-; GFX8-NEXT:    v_max_i16_e32 v4, -1, v2
+; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
+; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x7fff, v2
+; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
+; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x8000, v3
+; GFX8-NEXT:    v_max_i16_e32 v2, v2, v1
+; GFX8-NEXT:    v_min_i16_e32 v2, v2, v3
+; GFX8-NEXT:    v_mov_b32_e32 v3, -1
+; GFX8-NEXT:    v_max_i16_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_subrev_u16_e32 v4, 0x7fff, v4
-; GFX8-NEXT:    v_min_i16_e32 v5, -1, v2
-; GFX8-NEXT:    v_subrev_u16_e32 v5, 0x8000, v5
+; GFX8-NEXT:    v_min_i16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x8000, v3
 ; GFX8-NEXT:    v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_min_i16_e32 v1, v1, v5
-; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v3
-; GFX8-NEXT:    v_sub_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
+; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v2
+; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v2i16:
@@ -2987,23 +2990,23 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ;
 ; GFX8-LABEL: ssubsat_v2i16_vs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x7fff, v2
-; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x8000, v3
-; GFX8-NEXT:    v_max_i16_e32 v2, s0, v2
-; GFX8-NEXT:    v_min_i16_e32 v2, v2, v3
-; GFX8-NEXT:    v_max_i16_e32 v3, -1, v1
+; GFX8-NEXT:    v_max_i16_e32 v1, -1, v0
+; GFX8-NEXT:    v_subrev_u16_e32 v1, 0x7fff, v1
+; GFX8-NEXT:    v_min_i16_e32 v2, -1, v0
+; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x8000, v2
+; GFX8-NEXT:    v_max_i16_e32 v1, s0, v1
+; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, -1
+; GFX8-NEXT:    v_max_i16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x7fff, v3
-; GFX8-NEXT:    v_min_i16_e32 v4, -1, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v4, 0x8000, v4
+; GFX8-NEXT:    v_min_i16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x8000, v2
 ; GFX8-NEXT:    v_max_i16_e32 v3, s1, v3
-; GFX8-NEXT:    v_min_i16_e32 v3, v3, v4
-; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v2
-; GFX8-NEXT:    v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_min_i16_e32 v2, v3, v2
+; GFX8-NEXT:    v_sub_u16_e32 v1, v0, v1
+; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: ssubsat_v2i16_vs:
@@ -3090,38 +3093,37 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-LABEL: v_ssubsat_v4i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_i16_e32 v6, -1, v0
+; GFX8-NEXT:    v_max_i16_e32 v4, -1, v0
+; GFX8-NEXT:    v_subrev_u16_e32 v4, 0x7fff, v4
+; GFX8-NEXT:    v_min_i16_e32 v5, -1, v0
+; GFX8-NEXT:    v_subrev_u16_e32 v5, 0x8000, v5
+; GFX8-NEXT:    v_max_i16_e32 v4, v4, v2
+; GFX8-NEXT:    v_min_i16_e32 v4, v4, v5
+; GFX8-NEXT:    v_mov_b32_e32 v5, -1
+; GFX8-NEXT:    v_max_i16_sdwa v6, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_subrev_u16_e32 v6, 0x7fff, v6
-; GFX8-NEXT:    v_min_i16_e32 v7, -1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_min_i16_sdwa v7, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_subrev_u16_e32 v7, 0x8000, v7
+; GFX8-NEXT:    v_max_i16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_max_i16_e32 v6, -1, v1
+; GFX8-NEXT:    v_min_i16_e32 v2, v2, v7
+; GFX8-NEXT:    v_subrev_u16_e32 v6, 0x7fff, v6
+; GFX8-NEXT:    v_min_i16_e32 v7, -1, v1
 ; GFX8-NEXT:    v_subrev_u16_e32 v7, 0x8000, v7
-; GFX8-NEXT:    v_max_i16_e32 v6, v6, v2
+; GFX8-NEXT:    v_max_i16_e32 v6, v6, v3
 ; GFX8-NEXT:    v_min_i16_e32 v6, v6, v7
-; GFX8-NEXT:    v_max_i16_e32 v7, -1, v4
+; GFX8-NEXT:    v_max_i16_sdwa v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_subrev_u16_e32 v7, 0x7fff, v7
-; GFX8-NEXT:    v_min_i16_e32 v8, -1, v4
-; GFX8-NEXT:    v_subrev_u16_e32 v8, 0x8000, v8
-; GFX8-NEXT:    v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_max_i16_e32 v7, -1, v1
-; GFX8-NEXT:    v_min_i16_e32 v2, v2, v8
-; GFX8-NEXT:    v_subrev_u16_e32 v7, 0x7fff, v7
-; GFX8-NEXT:    v_min_i16_e32 v8, -1, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v8, 0x8000, v8
-; GFX8-NEXT:    v_max_i16_e32 v7, v7, v3
-; GFX8-NEXT:    v_min_i16_e32 v7, v7, v8
-; GFX8-NEXT:    v_max_i16_e32 v8, -1, v5
-; GFX8-NEXT:    v_subrev_u16_e32 v8, 0x7fff, v8
-; GFX8-NEXT:    v_min_i16_e32 v9, -1, v5
-; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x8000, v9
-; GFX8-NEXT:    v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_min_i16_e32 v3, v3, v9
-; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v6
-; GFX8-NEXT:    v_sub_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT:    v_sub_u16_e32 v1, v1, v7
-; GFX8-NEXT:    v_sub_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_min_i16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_subrev_u16_e32 v5, 0x8000, v5
+; GFX8-NEXT:    v_max_i16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_min_i16_e32 v3, v3, v5
+; GFX8-NEXT:    v_sub_u16_e32 v4, v0, v4
+; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_e32 v2, v1, v6
+; GFX8-NEXT:    v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v4i16:
@@ -3376,54 +3378,52 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8-LABEL: v_ssubsat_v6i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_i16_e32 v9, -1, v0
+; GFX8-NEXT:    v_max_i16_e32 v6, -1, v0
+; GFX8-NEXT:    v_subrev_u16_e32 v6, 0x7fff, v6
+; GFX8-NEXT:    v_min_i16_e32 v7, -1, v0
+; GFX8-NEXT:    v_subrev_u16_e32 v7, 0x8000, v7
+; GFX8-NEXT:    v_max_i16_e32 v6, v6, v3
+; GFX8-NEXT:    v_min_i16_e32 v6, v6, v7
+; GFX8-NEXT:    v_mov_b32_e32 v7, -1
+; GFX8-NEXT:    v_max_i16_sdwa v8, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_subrev_u16_e32 v8, 0x7fff, v8
+; GFX8-NEXT:    v_min_i16_sdwa v9, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x8000, v9
+; GFX8-NEXT:    v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_max_i16_e32 v8, -1, v1
+; GFX8-NEXT:    v_min_i16_e32 v3, v3, v9
+; GFX8-NEXT:    v_subrev_u16_e32 v8, 0x7fff, v8
+; GFX8-NEXT:    v_min_i16_e32 v9, -1, v1
+; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x8000, v9
+; GFX8-NEXT:    v_max_i16_e32 v8, v8, v4
+; GFX8-NEXT:    v_min_i16_e32 v8, v8, v9
+; GFX8-NEXT:    v_max_i16_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x7fff, v9
-; GFX8-NEXT:    v_min_i16_e32 v10, -1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT:    v_min_i16_sdwa v10, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_subrev_u16_e32 v10, 0x8000, v10
-; GFX8-NEXT:    v_max_i16_e32 v9, v9, v3
+; GFX8-NEXT:    v_max_i16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_max_i16_e32 v9, -1, v2
+; GFX8-NEXT:    v_min_i16_e32 v4, v4, v10
+; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x7fff, v9
+; GFX8-NEXT:    v_min_i16_e32 v10, -1, v2
+; GFX8-NEXT:    v_subrev_u16_e32 v10, 0x8000, v10
+; GFX8-NEXT:    v_max_i16_e32 v9, v9, v5
 ; GFX8-NEXT:    v_min_i16_e32 v9, v9, v10
-; GFX8-NEXT:    v_max_i16_e32 v10, -1, v6
+; GFX8-NEXT:    v_max_i16_sdwa v10, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_subrev_u16_e32 v10, 0x7fff, v10
-; GFX8-NEXT:    v_min_i16_e32 v11, -1, v6
-; GFX8-NEXT:    v_subrev_u16_e32 v11, 0x8000, v11
-; GFX8-NEXT:    v_max_i16_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_max_i16_e32 v10, -1, v1
-; GFX8-NEXT:    v_min_i16_e32 v3, v3, v11
-; GFX8-NEXT:    v_subrev_u16_e32 v10, 0x7fff, v10
-; GFX8-NEXT:    v_min_i16_e32 v11, -1, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v11, 0x8000, v11
-; GFX8-NEXT:    v_max_i16_e32 v10, v10, v4
-; GFX8-NEXT:    v_min_i16_e32 v10, v10, v11
-; GFX8-NEXT:    v_max_i16_e32 v11, -1, v7
-; GFX8-NEXT:    v_subrev_u16_e32 v11, 0x7fff, v11
-; GFX8-NEXT:    v_min_i16_e32 v12, -1, v7
-; GFX8-NEXT:    v_subrev_u16_e32 v12, 0x8000, v12
-; GFX8-NEXT:    v_max_i16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_max_i16_e32 v11, -1, v2
-; GFX8-NEXT:    v_min_i16_e32 v4, v4, v12
-; GFX8-NEXT:    v_subrev_u16_e32 v11, 0x7fff, v11
-; GFX8-NEXT:    v_min_i16_e32 v12, -1, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX8-NEXT:    v_subrev_u16_e32 v12, 0x8000, v12
-; GFX8-NEXT:    v_max_i16_e32 v11, v11, v5
-; GFX8-NEXT:    v_min_i16_e32 v11, v11, v12
-; GFX8-NEXT:    v_max_i16_e32 v12, -1, v8
-; GFX8-NEXT:    v_subrev_u16_e32 v12, 0x7fff, v12
-; GFX8-NEXT:    v_min_i16_e32 v13, -1, v8
-; GFX8-NEXT:    v_subrev_u16_e32 v13, 0x8000, v13
-; GFX8-NEXT:    v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v9
-; GFX8-NEXT:    v_sub_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_i16_e32 v5, v5, v13
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT:    v_sub_u16_e32 v1, v1, v10
-; GFX8-NEXT:    v_sub_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_sub_u16_e32 v2, v2, v11
-; GFX8-NEXT:    v_sub_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT:    v_min_i16_sdwa v7, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_subrev_u16_e32 v7, 0x8000, v7
+; GFX8-NEXT:    v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_min_i16_e32 v5, v5, v7
+; GFX8-NEXT:    v_sub_u16_e32 v6, v0, v6
+; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_e32 v3, v1, v8
+; GFX8-NEXT:    v_sub_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT:    v_sub_u16_e32 v3, v2, v9
+; GFX8-NEXT:    v_sub_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v6i16:
@@ -3752,70 +3752,67 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX8-LABEL: v_ssubsat_v8i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_i16_e32 v12, -1, v0
+; GFX8-NEXT:    v_max_i16_e32 v8, -1, v0
+; GFX8-NEXT:    v_subrev_u16_e32 v8, 0x7fff, v8
+; GFX8-NEXT:    v_min_i16_e32 v9, -1, v0
+; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x8000, v9
+; GFX8-NEXT:    v_max_i16_e32 v8, v8, v4
+; GFX8-NEXT:    v_min_i16_e32 v8, v8, v9
+; GFX8-NEXT:    v_mov_b32_e32 v9, -1
+; GFX8-NEXT:    v_max_i16_sdwa v10, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_subrev_u16_e32 v10, 0x7fff, v10
+; GFX8-NEXT:    v_min_i16_sdwa v11, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_subrev_u16_e32 v11, 0x8000, v11
+; GFX8-NEXT:    v_max_i16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_max_i16_e32 v10, -1, v1
+; GFX8-NEXT:    v_min_i16_e32 v4, v4, v11
+; GFX8-NEXT:    v_subrev_u16_e32 v10, 0x7fff, v10
+; GFX8-NEXT:    v_min_i16_e32 v11, -1, v1
+; GFX8-NEXT:    v_subrev_u16_e32 v11, 0x8000, v11
+; GFX8-NEXT:    v_max_i16_e32 v10, v10, v5
+; GFX8-NEXT:    v_min_i16_e32 v10, v10, v11
+; GFX8-NEXT:    v_max_i16_sdwa v11, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_subrev_u16_e32 v11, 0x7fff, v11
+; GFX8-NEXT:    v_min_i16_sdwa v12, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_subrev_u16_e32 v12, 0x8000, v12
+; GFX8-NEXT:    v_max_i16_sdwa v5, v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_max_i16_e32 v11, -1, v2
+; GFX8-NEXT:    v_min_i16_e32 v5, v5, v12
+; GFX8-NEXT:    v_subrev_u16_e32 v11, 0x7fff, v11
+; GFX8-NEXT:    v_min_i16_e32 v12, -1, v2
+; GFX8-NEXT:    v_subrev_u16_e32 v12, 0x8000, v12
+; GFX8-NEXT:    v_max_i16_e32 v11, v11, v6
+; GFX8-NEXT:    v_min_i16_e32 v11, v11, v12
+; GFX8-NEXT:    v_max_i16_sdwa v12, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_subrev_u16_e32 v12, 0x7fff, v12
+; GFX8-NEXT:    v_min_i16_sdwa v13, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_subrev_u16_e32 v13, 0x8000, v13
+; GFX8-NEXT:    v_max_i16_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_max_i16_e32 v12, -1, v3
+; GFX8-NEXT:    v_min_i16_e32 v6, v6, v13
 ; GFX8-NEXT:    v_subrev_u16_e32 v12, 0x7fff, v12
-; GFX8-NEXT:    v_min_i16_e32 v13, -1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX8-NEXT:    v_min_i16_e32 v13, -1, v3
 ; GFX8-NEXT:    v_subrev_u16_e32 v13, 0x8000, v13
-; GFX8-NEXT:    v_max_i16_e32 v12, v12, v4
+; GFX8-NEXT:    v_max_i16_e32 v12, v12, v7
 ; GFX8-NEXT:    v_min_i16_e32 v12, v12, v13
-; GFX8-NEXT:    v_max_i16_e32 v13, -1, v8
-; GFX8-NEXT:    v_subrev_u16_e32 v13, 0x7fff, v13
-; GFX8-NEXT:    v_min_i16_e32 v14, -1, v8
-; GFX8-NEXT:    v_subrev_u16_e32 v14, 0x8000, v14
-; GFX8-NEXT:    v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_max_i16_e32 v13, -1, v1
-; GFX8-NEXT:    v_min_i16_e32 v4, v4, v14
+; GFX8-NEXT:    v_max_i16_sdwa v13, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_subrev_u16_e32 v13, 0x7fff, v13
-; GFX8-NEXT:    v_min_i16_e32 v14, -1, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v14, 0x8000, v14
-; GFX8-NEXT:    v_max_i16_e32 v13, v13, v5
-; GFX8-NEXT:    v_min_i16_e32 v13, v13, v14
-; GFX8-NEXT:    v_max_i16_e32 v14, -1, v9
-; GFX8-NEXT:    v_subrev_u16_e32 v14, 0x7fff, v14
-; GFX8-NEXT:    v_min_i16_e32 v15, -1, v9
-; GFX8-NEXT:    v_subrev_u16_e32 v15, 0x8000, v15
-; GFX8-NEXT:    v_max_i16_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_max_i16_e32 v14, -1, v2
-; GFX8-NEXT:    v_min_i16_e32 v5, v5, v15
-; GFX8-NEXT:    v_subrev_u16_e32 v14, 0x7fff, v14
-; GFX8-NEXT:    v_min_i16_e32 v15, -1, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX8-NEXT:    v_subrev_u16_e32 v15, 0x8000, v15
-; GFX8-NEXT:    v_max_i16_e32 v14, v14, v6
-; GFX8-NEXT:    v_min_i16_e32 v14, v14, v15
-; GFX8-NEXT:    v_max_i16_e32 v15, -1, v10
-; GFX8-NEXT:    v_subrev_u16_e32 v15, 0x7fff, v15
-; GFX8-NEXT:    v_min_i16_e32 v16, -1, v10
-; GFX8-NEXT:    v_subrev_u16_e32 v16, 0x8000, v16
-; GFX8-NEXT:    v_max_i16_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_max_i16_e32 v15, -1, v3
-; GFX8-NEXT:    v_min_i16_e32 v6, v6, v16
-; GFX8-NEXT:    v_subrev_u16_e32 v15, 0x7fff, v15
-; GFX8-NEXT:    v_min_i16_e32 v16, -1, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX8-NEXT:    v_subrev_u16_e32 v16, 0x8000, v16
-; GFX8-NEXT:    v_max_i16_e32 v15, v15, v7
-; GFX8-NEXT:    v_min_i16_e32 v15, v15, v16
-; GFX8-NEXT:    v_max_i16_e32 v16, -1, v11
-; GFX8-NEXT:    v_subrev_u16_e32 v16, 0x7fff, v16
-; GFX8-NEXT:    v_min_i16_e32 v17, -1, v11
-; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v12
-; GFX8-NEXT:    v_sub_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_subrev_u16_e32 v17, 0x8000, v17
-; GFX8-NEXT:    v_max_i16_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_sub_u16_e32 v1, v1, v13
-; GFX8-NEXT:    v_sub_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_i16_e32 v7, v7, v17
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_sub_u16_e32 v2, v2, v14
-; GFX8-NEXT:    v_sub_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT:    v_sub_u16_e32 v3, v3, v15
-; GFX8-NEXT:    v_sub_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_min_i16_sdwa v9, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x8000, v9
+; GFX8-NEXT:    v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_sub_u16_e32 v8, v0, v8
+; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_e32 v4, v1, v10
+; GFX8-NEXT:    v_sub_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_min_i16_e32 v7, v7, v9
+; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT:    v_sub_u16_e32 v4, v2, v11
+; GFX8-NEXT:    v_sub_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT:    v_sub_u16_e32 v4, v3, v12
+; GFX8-NEXT:    v_sub_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v8i16:
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 028a28ed9a23b..3f513e120e141 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1608,34 +1608,35 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    v_mov_b32_e32 v5, 0xffffff00
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v6, 9
+; VI-NEXT:    v_mov_b32_e32 v7, 0x900
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s2
 ; VI-NEXT:    s_mov_b32 s5, s3
 ; VI-NEXT:    s_mov_b32 s2, s6
 ; VI-NEXT:    s_mov_b32 s3, s7
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
-; VI-NEXT:    v_and_b32_e32 v6, 0xffffff00, v4
-; VI-NEXT:    v_add_u16_e32 v4, 9, v4
+; VI-NEXT:    v_and_b32_e32 v8, 0xffffff00, v4
+; VI-NEXT:    v_add_u16_e32 v9, 9, v4
+; VI-NEXT:    v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_nop 0
-; VI-NEXT:    v_and_b32_e32 v1, 0xffffff00, v5
-; VI-NEXT:    v_add_u16_e32 v2, 9, v5
-; VI-NEXT:    v_or_b32_sdwa v0, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_mov_b32_e32 v2, 0x900
+; VI-NEXT:    v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v1, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u16_e32 v0, 0x900, v0
-; VI-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
@@ -1674,28 +1675,29 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_movk_i32 s4, 0xff00
+; GFX9-NEXT:    v_mov_b32_e32 v6, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    s_movk_i32 s4, 0x900
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v0, s[0:1]
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_movk_i32 s5, 0x900
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffffff00, v4
-; GFX9-NEXT:    v_add_u16_e32 v4, 9, v4
+; GFX9-NEXT:    v_add_u16_e32 v8, 9, v4
+; GFX9-NEXT:    v_and_b32_sdwa v9, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffffff00, v6
-; GFX9-NEXT:    v_add_u16_e32 v2, 9, v6
-; GFX9-NEXT:    v_or_b32_sdwa v0, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u16_e32 v0, 0x900, v0
-; GFX9-NEXT:    v_add_u16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_add_u16_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    global_store_dword v5, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index e361aa4db2aa9..1b28ddb2c5862 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -2135,19 +2135,18 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX8-LABEL: safe_math_fract_v2f16:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0x204
-; GFX8-NEXT:    v_floor_f16_e32 v4, v3
-; GFX8-NEXT:    v_floor_f16_e32 v5, v0
-; GFX8-NEXT:    v_fract_f16_e32 v6, v3
-; GFX8-NEXT:    v_cmp_class_f16_e32 vcc, v3, v7
-; GFX8-NEXT:    v_pack_b32_f16 v4, v5, v4
-; GFX8-NEXT:    v_fract_f16_e32 v5, v0
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, 0, vcc
-; GFX8-NEXT:    v_cmp_class_f16_e32 vcc, v0, v7
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
-; GFX8-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX8-NEXT:    global_store_dword v[1:2], v4, off
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0x204
+; GFX8-NEXT:    v_floor_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_floor_f16_e32 v4, v0
+; GFX8-NEXT:    v_cmp_class_f16_sdwa s[4:5], v0, v6 src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_pack_b32_f16 v3, v4, v3
+; GFX8-NEXT:    v_fract_f16_e32 v4, v0
+; GFX8-NEXT:    v_fract_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cmp_class_f16_e32 vcc, v0, v6
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX8-NEXT:    v_pack_b32_f16 v0, v0, v5
+; GFX8-NEXT:    global_store_dword v[1:2], v3, off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 3118d63788042..e8310e73f9a47 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -803,13 +803,13 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
 ; VI-LABEL: v_fshr_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; VI-NEXT:    v_mov_b32_e32 v5, 1
-; VI-NEXT:    v_lshrrev_b16_sdwa v4, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT:    v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT:    v_xor_b32_e32 v3, -1, v3
-; VI-NEXT:    v_lshlrev_b16_e32 v3, v3, v5
-; VI-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v4, 1
+; VI-NEXT:    v_mov_b32_e32 v5, -1
+; VI-NEXT:    v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_xor_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshrrev_b16_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_lshlrev_b16_e32 v4, v5, v4
+; VI-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
 ; VI-NEXT:    v_xor_b32_e32 v4, -1, v2
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
@@ -887,13 +887,13 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
 ; VI-LABEL: v_fshr_v3i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; VI-NEXT:    v_mov_b32_e32 v8, 1
-; VI-NEXT:    v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
-; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
-; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v7, 1
+; VI-NEXT:    v_mov_b32_e32 v8, -1
+; VI-NEXT:    v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_xor_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshrrev_b16_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_lshlrev_b16_e32 v7, v8, v7
+; VI-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
 ; VI-NEXT:    v_xor_b32_e32 v7, -1, v5
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
@@ -910,13 +910,13 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
 ; GFX9-LABEL: v_fshr_v3i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX9-NEXT:    v_mov_b32_e32 v8, 1
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
-; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
-; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX9-NEXT:    v_mov_b32_e32 v7, 1
+; GFX9-NEXT:    v_mov_b32_e32 v8, -1
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_xor_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v7, v8, v7
+; GFX9-NEXT:    v_or_b32_e32 v6, v7, v6
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v5
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
@@ -1019,18 +1019,18 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; VI-LABEL: v_fshr_v4i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; VI-NEXT:    v_mov_b32_e32 v8, 1
-; VI-NEXT:    v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
-; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v9
-; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; VI-NEXT:    v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT:    v_xor_b32_e32 v7, -1, v7
-; VI-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
+; VI-NEXT:    v_mov_b32_e32 v7, 1
+; VI-NEXT:    v_mov_b32_e32 v9, -1
+; VI-NEXT:    v_lshlrev_b16_sdwa v8, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_xor_b32_sdwa v10, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshrrev_b16_sdwa v6, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_lshlrev_b16_e32 v8, v10, v8
+; VI-NEXT:    v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_xor_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_lshrrev_b16_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_lshlrev_b16_e32 v7, v9, v7
+; VI-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
 ; VI-NEXT:    v_xor_b32_e32 v8, -1, v5
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
@@ -1040,7 +1040,6 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
 ; VI-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
-; VI-NEXT:    v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v2
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -1049,18 +1048,18 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; GFX9-LABEL: v_fshr_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; GFX9-NEXT:    v_mov_b32_e32 v8, 1
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
-; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v9
-; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v7
-; GFX9-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
+; GFX9-NEXT:    v_mov_b32_e32 v7, 1
+; GFX9-NEXT:    v_mov_b32_e32 v9, -1
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_xor_b32_sdwa v10, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v6, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v8, v10, v8
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_xor_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v6, v8, v6
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v7, v9, v7
+; GFX9-NEXT:    v_or_b32_e32 v7, v7, v8
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v5
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
@@ -1070,7 +1069,6 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v4
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
-; GFX9-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v7, v0, s4
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index d4ff845e1edf3..7ee31bf4dce7c 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -637,6 +637,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_movk_i32 s4, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
@@ -644,19 +645,18 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX9-NEXT:  .LBB4_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    v_add_u16_e32 v2, 1, v2
 ; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
-; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v5
-; GFX9-NEXT:    v_mad_f32 v4, -v5, v0, v4
+; GFX9-NEXT:    v_mul_f32_e32 v6, v4, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
+; GFX9-NEXT:    v_mad_f32 v4, -v6, v0, v4
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, v0
-; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[0:1], 0, v6, s[0:1]
+; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_short v3, v4, s[2:3]
+; GFX9-NEXT:    global_store_short v5, v4, s[2:3]
 ; GFX9-NEXT:    s_cbranch_vccz .LBB4_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
@@ -667,25 +667,25 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_and_b32 s0, s4, 0xffff
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX10-NEXT:  .LBB4_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v2
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    v_add_nc_u16 v2, v2, 1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; GFX10-NEXT:    v_mul_f32_e32 v6, v4, v1
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
-; GFX10-NEXT:    v_mul_f32_e32 v5, v4, v1
+; GFX10-NEXT:    v_trunc_f32_e32 v6, v6
 ; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
-; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX10-NEXT:    v_mad_f32 v4, -v5, v0, v4
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX10-NEXT:    v_mad_f32 v4, -v6, v0, v4
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX10-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, 0, v5, s0
-; GFX10-NEXT:    global_store_short v3, v4, s[2:3]
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, 0, v6, s0
+; GFX10-NEXT:    global_store_short v5, v4, s[2:3]
 ; GFX10-NEXT:    s_cbranch_vccz .LBB4_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
@@ -748,30 +748,28 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_movk_i32 s5, 0x400
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_movk_i32 s3, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX9-NEXT:  .LBB5_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v5
+; GFX9-NEXT:    v_mad_f32 v4, -v5, v0, v4
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s2
+; GFX9-NEXT:    v_sub_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_add_u16_e32 v2, 1, v2
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s5, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 1, v3
-; GFX9-NEXT:    v_mul_f32_e32 v6, v4, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
-; GFX9-NEXT:    v_mad_f32 v4, -v6, v0, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, v0
-; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s4
-; GFX9-NEXT:    v_sub_u32_e32 v3, v3, v4
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_short v5, v3, s[2:3]
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s3, v2
+; GFX9-NEXT:    global_store_short v5, v4, s[0:1]
 ; GFX9-NEXT:    s_cbranch_vccz .LBB5_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
@@ -782,26 +780,26 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_and_b32 s0, s4, 0xffff
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX10-NEXT:  .LBB5_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v2
-; GFX10-NEXT:    v_add_nc_u16 v2, v2, 1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_mul_f32_e32 v5, v4, v1
 ; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX10-NEXT:    v_mad_f32 v4, -v5, v0, v4
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v4|, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 1, v3
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    v_mul_lo_u32 v4, v4, s0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, v3, v4
-; GFX10-NEXT:    global_store_short v5, v3, s[2:3]
+; GFX10-NEXT:    v_sub_nc_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_add_nc_u16 v2, v2, 1
+; GFX10-NEXT:    global_store_short v5, v4, s[2:3]
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
 ; GFX10-NEXT:    s_cbranch_vccz .LBB5_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 9a1de74034cd8..0b131ea74f1ab 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -4713,29 +4713,24 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    global_load_dword v3, v2, s[6:7]
 ; GFX9-NODL-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v3, v2, s[6:7]
 ; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v3
-; GFX9-NODL-NEXT:    v_bfe_u32 v6, v3, 16, 8
-; GFX9-NODL-NEXT:    v_bfe_u32 v5, v3, 8, 8
-; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v7, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v9, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v8, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v7, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v8, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v9, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_add3_u32 v3, v7, s0, v9
-; GFX9-NODL-NEXT:    v_add3_u32 v3, v3, v4, v6
-; GFX9-NODL-NEXT:    v_add3_u32 v0, v8, v3, v0
-; GFX9-NODL-NEXT:    v_add3_u32 v0, v0, v5, v1
+; GFX9-NODL-NEXT:    v_add3_u32 v3, v4, s0, v6
+; GFX9-NODL-NEXT:    v_add3_u32 v3, v3, v7, v9
+; GFX9-NODL-NEXT:    v_add3_u32 v0, v5, v3, v0
+; GFX9-NODL-NEXT:    v_add3_u32 v0, v0, v8, v1
 ; GFX9-NODL-NEXT:    global_store_dword v2, v0, s[2:3]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index a2e30603b6afc..663b009116286 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -659,28 +659,30 @@ define amdgpu_kernel void @fmuladd_v2f16(
 ; VI-FLUSH-NEXT:    s_mov_b32 s14, s10
 ; VI-FLUSH-NEXT:    s_mov_b32 s15, s11
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT:    s_mov_b32 s12, s2
-; VI-FLUSH-NEXT:    s_mov_b32 s13, s3
 ; VI-FLUSH-NEXT:    s_mov_b32 s16, s4
 ; VI-FLUSH-NEXT:    s_mov_b32 s17, s5
-; VI-FLUSH-NEXT:    s_mov_b32 s18, s10
-; VI-FLUSH-NEXT:    s_mov_b32 s19, s11
 ; VI-FLUSH-NEXT:    s_mov_b32 s4, s6
 ; VI-FLUSH-NEXT:    s_mov_b32 s5, s7
 ; VI-FLUSH-NEXT:    s_mov_b32 s6, s10
 ; VI-FLUSH-NEXT:    s_mov_b32 s7, s11
-; VI-FLUSH-NEXT:    buffer_load_dword v0, off, s[12:15], 0
-; VI-FLUSH-NEXT:    buffer_load_dword v1, off, s[4:7], 0
-; VI-FLUSH-NEXT:    buffer_load_dword v2, off, s[16:19], 0
+; VI-FLUSH-NEXT:    s_mov_b32 s12, s2
+; VI-FLUSH-NEXT:    s_mov_b32 s13, s3
+; VI-FLUSH-NEXT:    s_mov_b32 s18, s10
+; VI-FLUSH-NEXT:    s_mov_b32 s19, s11
+; VI-FLUSH-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; VI-FLUSH-NEXT:    buffer_load_dword v1, off, s[16:19], 0
+; VI-FLUSH-NEXT:    buffer_load_dword v2, off, s[12:15], 0
 ; VI-FLUSH-NEXT:    s_mov_b32 s8, s0
 ; VI-FLUSH-NEXT:    s_mov_b32 s9, s1
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(2)
+; VI-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(1)
-; VI-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; VI-FLUSH-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; VI-FLUSH-NEXT:    v_mac_f16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-FLUSH-NEXT:    v_mac_f16_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; VI-FLUSH-NEXT:    v_mac_f16_e32 v1, v0, v2
-; VI-FLUSH-NEXT:    v_or_b32_e32 v0, v1, v3
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v0, v2, v1
+; VI-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v3
 ; VI-FLUSH-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-FLUSH-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index 14742c5827c1e..b9fef0834cb24 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -183,11 +183,10 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX8-SDAG-LABEL: test_frexp_v2f16_v2i32:
 ; GFX8-SDAG:       ; %bb.0:
 ; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8-SDAG-NEXT:    v_frexp_mant_f16_e32 v1, v0
-; GFX8-SDAG-NEXT:    v_frexp_mant_f16_sdwa v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-SDAG-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX8-SDAG-NEXT:    v_frexp_exp_i16_f16_e32 v1, v2
+; GFX8-SDAG-NEXT:    v_frexp_mant_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-SDAG-NEXT:    v_or_b32_e32 v3, v1, v2
+; GFX8-SDAG-NEXT:    v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
 ; GFX8-SDAG-NEXT:    v_bfe_i32 v2, v1, 0, 16
 ; GFX8-SDAG-NEXT:    v_bfe_i32 v1, v0, 0, 16
@@ -197,11 +196,10 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-SDAG-NEXT:    v_frexp_mant_f16_e32 v2, v1
-; GFX9-SDAG-NEXT:    v_frexp_mant_f16_e32 v3, v0
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v3, v3, v2
-; GFX9-SDAG-NEXT:    v_frexp_exp_i16_f16_e32 v1, v1
+; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-SDAG-NEXT:    v_frexp_mant_f16_e32 v2, v0
+; GFX9-SDAG-NEXT:    v_pack_b32_f16 v3, v2, v1
+; GFX9-SDAG-NEXT:    v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
 ; GFX9-SDAG-NEXT:    v_bfe_i32 v2, v1, 0, 16
 ; GFX9-SDAG-NEXT:    v_bfe_i32 v1, v0, 0, 16
@@ -246,27 +244,25 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX8-GISEL-LABEL: test_frexp_v2f16_v2i32:
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8-GISEL-NEXT:    v_frexp_mant_f16_e32 v3, v0
-; GFX8-GISEL-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
-; GFX8-GISEL-NEXT:    v_bfe_i32 v1, v0, 0, 16
-; GFX8-GISEL-NEXT:    v_frexp_mant_f16_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-GISEL-NEXT:    v_frexp_exp_i16_f16_e32 v2, v2
-; GFX8-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX8-GISEL-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX8-GISEL-NEXT:    v_frexp_mant_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX8-GISEL-NEXT:    v_bfe_i32 v2, v0, 0, 16
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v3, v4
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-GISEL-NEXT:    v_frexp_mant_f16_e32 v3, v0
-; GFX9-GISEL-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
-; GFX9-GISEL-NEXT:    v_bfe_i32 v1, v0, 0, 16
-; GFX9-GISEL-NEXT:    v_frexp_mant_f16_e32 v0, v2
-; GFX9-GISEL-NEXT:    v_frexp_exp_i16_f16_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v3, v0
+; GFX9-GISEL-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX9-GISEL-NEXT:    v_bfe_i32 v2, v0, 0, 16
+; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v3, v4
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> %a)
   ret { <2 x half>, <2 x i32> } %result
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 6672568b98a20..8861ee380be03 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -147,11 +147,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_add_i32 s48, s49, 1
 ; CHECK-NEXT:    s_add_i32 s5, s49, 5
 ; CHECK-NEXT:    v_or3_b32 v57, s4, v43, s48
-; CHECK-NEXT:    ds_read_u8 v0, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    ds_read_u8 v56, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v58, s48
 ; CHECK-NEXT:    s_mov_b32 s52, exec_lo
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v56, 0xff, v0
 ; CHECK-NEXT:    v_cmpx_lt_u32_e64 s5, v42
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_17
 ; CHECK-NEXT:  ; %bb.6: ; %.preheader2
@@ -175,10 +174,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v59, s54, v46
 ; CHECK-NEXT:    v_add_nc_u32_e32 v58, s54, v57
-; CHECK-NEXT:    s_mov_b32 s55, exec_lo
 ; CHECK-NEXT:    ds_read_u8 v0, v59
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_cmpx_eq_u16_e64 v56, v0
+; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT:    s_and_saveexec_b32 s55, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_10
 ; CHECK-NEXT:  ; %bb.9: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
@@ -200,9 +199,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:  .LBB0_10: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s55
 ; CHECK-NEXT:    ds_read_u8 v0, v59 offset:1
-; CHECK-NEXT:    s_mov_b32 s55, exec_lo
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_cmpx_eq_u16_e64 v56, v0
+; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT:    s_and_saveexec_b32 s55, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_12
 ; CHECK-NEXT:  ; %bb.11: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
@@ -225,9 +224,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:  .LBB0_12: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s55
 ; CHECK-NEXT:    ds_read_u8 v0, v59 offset:2
-; CHECK-NEXT:    s_mov_b32 s55, exec_lo
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_cmpx_eq_u16_e64 v56, v0
+; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT:    s_and_saveexec_b32 s55, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_14
 ; CHECK-NEXT:  ; %bb.13: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
@@ -250,9 +249,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:  .LBB0_14: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s55
 ; CHECK-NEXT:    ds_read_u8 v0, v59 offset:3
-; CHECK-NEXT:    s_mov_b32 s55, exec_lo
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_cmpx_eq_u16_e64 v56, v0
+; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT:    s_and_saveexec_b32 s55, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_7
 ; CHECK-NEXT:  ; %bb.15: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
@@ -300,10 +299,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:  .LBB0_20: ; Parent Loop BB0_5 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v44, v58
-; CHECK-NEXT:    s_mov_b32 s53, exec_lo
 ; CHECK-NEXT:    ds_read_u8 v0, v0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_cmpx_eq_u16_e64 v56, v0
+; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT:    s_and_saveexec_b32 s53, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_19
 ; CHECK-NEXT:  ; %bb.21: ; in Loop: Header=BB0_20 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v40
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 7ca9ae359a499..352c1ecf8ece4 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -697,18 +697,16 @@ define hidden void @add(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, p
 ; GFX9-LABEL: add:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v4, v[2:3], off
-; GFX9-NEXT:    global_load_dword v7, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v4
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u16_sdwa v1, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX9-NEXT:    v_add_u16_sdwa v2, v4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT:    v_add_u16_sdwa v3, v7, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_add_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT:    v_add_u16_sdwa v2, v7, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_add_u16_sdwa v3, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -808,17 +806,16 @@ define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %
 ; GFX9-NEXT:    global_load_dword v9, v[2:3], off
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff00
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v4
-; GFX9-NEXT:    v_and_b32_sdwa v1, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u16_sdwa v2, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
-; GFX9-NEXT:    v_or_b32_e32 v1, v0, v1
-; GFX9-NEXT:    v_add_u16_e32 v0, v0, v9
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX9-NEXT:    v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT:    global_store_dword v[5:6], v0, off
-; GFX9-NEXT:    global_store_dword v[7:8], v1, off
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    global_store_dword v[5:6], v1, off
+; GFX9-NEXT:    global_store_dword v[7:8], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
@@ -868,23 +865,22 @@ define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v9, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    global_load_dword v9, v[2:3], off
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff00
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 8, v9
-; GFX9-NEXT:    v_and_b32_sdwa v2, v9, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v2, v1, v2
+; GFX9-NEXT:    v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u16_sdwa v3, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
-; GFX9-NEXT:    v_add_u16_e32 v0, v1, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX9-NEXT:    v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX9-NEXT:    global_store_dword v[5:6], v0, off
-; GFX9-NEXT:    global_store_dword v[7:8], v1, off
+; GFX9-NEXT:    global_store_dword v[5:6], v1, off
+; GFX9-NEXT:    global_store_dword v[7:8], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -942,22 +938,20 @@ define hidden void @add_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_dword v4, v[2:3], off
-; GFX9-NEXT:    global_load_dword v9, v[0:1], off
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    global_load_dword v9, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x10705
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v1, v9, v4, s4
-; GFX9-NEXT:    v_add_u16_sdwa v2, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX9-NEXT:    v_add_u16_sdwa v3, v4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT:    v_add_u16_sdwa v9, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_add_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    global_store_dword v[5:6], v0, off
-; GFX9-NEXT:    global_store_dword v[7:8], v1, off
+; GFX9-NEXT:    v_perm_b32 v0, v4, v9, s4
+; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX9-NEXT:    v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT:    v_add_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_add_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    global_store_dword v[5:6], v1, off
+; GFX9-NEXT:    global_store_dword v[7:8], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1391,22 +1385,20 @@ define hidden void @mul_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_dword v4, v[2:3], off
-; GFX9-NEXT:    global_load_dword v9, v[0:1], off
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    global_load_dword v9, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x2000504
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u16_e32 v2, v9, v4
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v3, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v3, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v4, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_perm_b32 v1, v9, v4, s4
-; GFX9-NEXT:    global_store_dword v[5:6], v0, off
-; GFX9-NEXT:    global_store_dword v[7:8], v1, off
+; GFX9-NEXT:    v_perm_b32 v0, v4, v9, s4
+; GFX9-NEXT:    v_mul_lo_u16_e32 v1, v4, v9
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v2, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_3
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    global_store_dword v[5:6], v1, off
+; GFX9-NEXT:    global_store_dword v[7:8], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1503,67 +1495,61 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    global_load_dword v4, v[2:3], off
 ; GFX10-NEXT:    global_load_dword v9, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_bfe_i32 v0, v4, 0, 8
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 24, v9
-; GFX10-NEXT:    v_bfe_i32 v3, v4, 8, 8
-; GFX10-NEXT:    v_bfe_i32 v1, v9, 16, 8
-; GFX10-NEXT:    v_bfe_i32 v10, v4, 16, 8
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v13, v0
-; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 24, v4
-; GFX10-NEXT:    v_xor_b32_e32 v15, v2, v3
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v3, v3
-; GFX10-NEXT:    v_xor_b32_e32 v12, v1, v0
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v16, v13
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v14, v1
-; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v10
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v10, v10
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v17, v3
-; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v11
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v11, v11
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v18, v10
-; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 30, v12
-; GFX10-NEXT:    v_mul_f32_e32 v16, v14, v16
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v19, v11
-; GFX10-NEXT:    v_ashrrev_i32_e32 v15, 30, v15
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v15, v1
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v16, v10
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v17, v12
+; GFX10-NEXT:    v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v18, v14
+; GFX10-NEXT:    v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX10-NEXT:    v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX10-NEXT:    v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
+; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
+; GFX10-NEXT:    v_mul_f32_e32 v15, v2, v15
+; GFX10-NEXT:    v_mul_f32_e32 v16, v19, v16
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v17, v2, v17
+; GFX10-NEXT:    v_or_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v15, v15
 ; GFX10-NEXT:    v_trunc_f32_e32 v16, v16
-; GFX10-NEXT:    v_or_b32_e32 v12, 1, v12
-; GFX10-NEXT:    v_or_b32_e32 v15, 1, v15
-; GFX10-NEXT:    v_mul_f32_e32 v18, v14, v18
+; GFX10-NEXT:    v_mul_f32_e32 v18, v1, v18
 ; GFX10-NEXT:    v_trunc_f32_e32 v17, v17
-; GFX10-NEXT:    v_mad_f32 v20, -v16, v13, v14
-; GFX10-NEXT:    v_mul_f32_e32 v19, v13, v19
-; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
+; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 30, v11
+; GFX10-NEXT:    v_mad_f32 v20, -v15, v1, v2
+; GFX10-NEXT:    v_mad_f32 v19, -v16, v10, v19
+; GFX10-NEXT:    v_or_b32_e32 v3, 1, v3
 ; GFX10-NEXT:    v_trunc_f32_e32 v18, v18
-; GFX10-NEXT:    v_mad_f32 v2, -v17, v3, v2
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v20|, |v13|
-; GFX10-NEXT:    v_trunc_f32_e32 v19, v19
-; GFX10-NEXT:    v_or_b32_e32 v1, 1, v1
-; GFX10-NEXT:    v_mad_f32 v14, -v18, v10, v14
-; GFX10-NEXT:    v_or_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, 0, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, |v3|
-; GFX10-NEXT:    v_mad_f32 v21, -v19, v11, v13
+; GFX10-NEXT:    v_mad_f32 v2, -v17, v12, v2
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1|
+; GFX10-NEXT:    v_ashrrev_i32_e32 v13, 30, v13
+; GFX10-NEXT:    v_or_b32_e32 v11, 1, v11
+; GFX10-NEXT:    v_mad_f32 v21, -v18, v14, v1
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v15, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10|
+; GFX10-NEXT:    v_or_b32_e32 v13, 1, v13
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v16, v16
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v17, v17
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v18, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v15, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v14|, |v10|
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v19, v19
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v16, v12
-; GFX10-NEXT:    v_add_nc_u32_sdwa v2, v17, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v21|, |v11|
-; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v18, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc_lo
-; GFX10-NEXT:    v_add_nc_u32_sdwa v0, v19, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, |v12|
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v15, v0
+; GFX10-NEXT:    v_add_nc_u32_sdwa v1, v16, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14|
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, v17, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0, v13, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x60706
-; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v1, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1581,67 +1567,61 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    global_load_dword v9, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x60706
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_bfe_i32 v1, v4, 0, 8
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4
-; GFX9-NEXT:    v_bfe_i32 v2, v9, 16, 8
-; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 24, v9
-; GFX9-NEXT:    v_bfe_i32 v9, v4, 8, 8
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v12, v1
-; GFX9-NEXT:    v_bfe_i32 v10, v4, 16, 8
-; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 24, v4
-; GFX9-NEXT:    v_xor_b32_e32 v14, v3, v9
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v9, v9
-; GFX9-NEXT:    v_xor_b32_e32 v11, v2, v1
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v13, v2
-; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v10
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v10, v10
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v4
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v15, v12
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v16, v9
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v17, v10
+; GFX9-NEXT:    v_xor_b32_sdwa v1, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX9-NEXT:    v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v11, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT:    v_xor_b32_sdwa v9, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX9-NEXT:    v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v4, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v15, v2
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v16, v12
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v17, v13
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v18, v4
-; GFX9-NEXT:    v_mul_f32_e32 v15, v13, v15
-; GFX9-NEXT:    v_mul_f32_e32 v16, v3, v16
+; GFX9-NEXT:    v_mul_f32_e32 v15, v3, v15
+; GFX9-NEXT:    v_mul_f32_e32 v16, v11, v16
 ; GFX9-NEXT:    v_trunc_f32_e32 v15, v15
-; GFX9-NEXT:    v_ashrrev_i32_e32 v11, 30, v11
-; GFX9-NEXT:    v_mul_f32_e32 v17, v13, v17
-; GFX9-NEXT:    v_mul_f32_e32 v18, v12, v18
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
+; GFX9-NEXT:    v_mul_f32_e32 v17, v3, v17
+; GFX9-NEXT:    v_mul_f32_e32 v18, v2, v18
 ; GFX9-NEXT:    v_trunc_f32_e32 v16, v16
-; GFX9-NEXT:    v_mad_f32 v19, -v15, v12, v13
-; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 30, v14
-; GFX9-NEXT:    v_or_b32_e32 v11, 1, v11
+; GFX9-NEXT:    v_mad_f32 v19, -v15, v2, v3
+; GFX9-NEXT:    v_ashrrev_i32_e32 v10, 30, v10
+; GFX9-NEXT:    v_or_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_trunc_f32_e32 v17, v17
 ; GFX9-NEXT:    v_trunc_f32_e32 v18, v18
-; GFX9-NEXT:    v_mad_f32 v3, -v16, v9, v3
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v19|, |v12|
-; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
-; GFX9-NEXT:    v_or_b32_e32 v14, 1, v14
+; GFX9-NEXT:    v_mad_f32 v11, -v16, v12, v11
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v19|, |v2|
+; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 30, v9
+; GFX9-NEXT:    v_or_b32_e32 v10, 1, v10
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v15, v15
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v16, v16
-; GFX9-NEXT:    v_mad_f32 v13, -v17, v10, v13
+; GFX9-NEXT:    v_mad_f32 v3, -v17, v13, v3
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v17, v17
-; GFX9-NEXT:    v_mad_f32 v20, -v18, v4, v12
+; GFX9-NEXT:    v_mad_f32 v20, -v18, v4, v2
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v18, v18
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, 0, v11, vcc
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v9|
-; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, 1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v14, vcc
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v13|, |v10|
-; GFX9-NEXT:    v_or_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v20|, |v4|
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, v15, v11
-; GFX9-NEXT:    v_add_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_add_u32_e32 v2, v17, v2
-; GFX9-NEXT:    v_add_u32_sdwa v1, v18, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v11|, |v12|
+; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 30, v14
+; GFX9-NEXT:    v_or_b32_e32 v9, 1, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v13|
+; GFX9-NEXT:    v_or_b32_e32 v14, 1, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v20|, |v4|
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v14, vcc
+; GFX9-NEXT:    v_add_u32_e32 v1, v15, v1
+; GFX9-NEXT:    v_add_u32_sdwa v2, v16, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_add_u32_e32 v3, v17, v3
+; GFX9-NEXT:    v_add_u32_sdwa v4, v18, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_dword v[5:6], v1, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1876,73 +1856,67 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    global_load_dword v4, v[2:3], off
 ; GFX10-NEXT:    global_load_dword v9, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_bfe_i32 v1, v4, 0, 8
-; GFX10-NEXT:    v_bfe_i32 v2, v4, 16, 8
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ashrrev_i32_e32 v10, 24, v9
-; GFX10-NEXT:    v_bfe_i32 v11, v4, 8, 8
-; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 24, v4
-; GFX10-NEXT:    v_bfe_i32 v13, v9, 16, 8
-; GFX10-NEXT:    v_xor_b32_e32 v14, v2, v1
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX10-NEXT:    v_xor_b32_e32 v16, v10, v11
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v11, v11
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v15, v2
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v10, v10
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v18, v1
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v17, v12
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v19, v11
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v17, v2
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v18, v13
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v19, v3
+; GFX10-NEXT:    v_xor_b32_sdwa v1, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v20, v15
-; GFX10-NEXT:    v_xor_b32_e32 v2, v12, v2
-; GFX10-NEXT:    v_xor_b32_e32 v12, v13, v12
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v21, v17
-; GFX10-NEXT:    v_ashrrev_i32_e32 v14, 30, v14
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v13, v13
-; GFX10-NEXT:    v_ashrrev_i32_e32 v16, 30, v16
-; GFX10-NEXT:    v_mul_f32_e32 v18, v15, v18
-; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
-; GFX10-NEXT:    v_mul_f32_e32 v19, v10, v19
-; GFX10-NEXT:    v_mul_f32_e32 v20, v17, v20
-; GFX10-NEXT:    v_or_b32_e32 v14, 1, v14
+; GFX10-NEXT:    v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v21, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX10-NEXT:    v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2
+; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
+; GFX10-NEXT:    v_xor_b32_sdwa v16, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3
+; GFX10-NEXT:    v_mul_f32_e32 v17, v3, v17
+; GFX10-NEXT:    v_mul_f32_e32 v18, v12, v18
+; GFX10-NEXT:    v_mul_f32_e32 v19, v15, v19
+; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 30, v11
+; GFX10-NEXT:    v_or_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_trunc_f32_e32 v17, v17
 ; GFX10-NEXT:    v_trunc_f32_e32 v18, v18
-; GFX10-NEXT:    v_mul_f32_e32 v21, v13, v21
+; GFX10-NEXT:    v_mul_f32_e32 v20, v21, v20
 ; GFX10-NEXT:    v_trunc_f32_e32 v19, v19
+; GFX10-NEXT:    v_ashrrev_i32_e32 v14, 30, v14
+; GFX10-NEXT:    v_mad_f32 v22, -v17, v2, v3
+; GFX10-NEXT:    v_mad_f32 v12, -v18, v13, v12
+; GFX10-NEXT:    v_or_b32_e32 v11, 1, v11
 ; GFX10-NEXT:    v_trunc_f32_e32 v20, v20
+; GFX10-NEXT:    v_mad_f32 v23, -v19, v3, v15
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v22|, |v2|
+; GFX10-NEXT:    v_ashrrev_i32_e32 v16, 30, v16
+; GFX10-NEXT:    v_or_b32_e32 v14, 1, v14
+; GFX10-NEXT:    v_mad_f32 v21, -v20, v15, v21
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v17, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v12|, |v13|
 ; GFX10-NEXT:    v_or_b32_e32 v16, 1, v16
-; GFX10-NEXT:    v_mad_f32 v22, -v18, v1, v15
-; GFX10-NEXT:    v_trunc_f32_e32 v21, v21
-; GFX10-NEXT:    v_mad_f32 v10, -v19, v11, v10
-; GFX10-NEXT:    v_mad_f32 v23, -v20, v15, v17
-; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 30, v12
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v22|, |v1|
-; GFX10-NEXT:    v_or_b32_e32 v2, 1, v2
-; GFX10-NEXT:    v_mad_f32 v13, -v21, v17, v13
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v18, v18
-; GFX10-NEXT:    v_or_b32_e32 v12, 1, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v14, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v10|, |v11|
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v19, v19
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v20, v20
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v21, v21
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v23|, |v3|
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v23|, |v15|
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v4
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v18, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, v19, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v13|, |v17|
-; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v4
-; GFX10-NEXT:    v_mul_lo_u32 v3, v10, v3
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, v20, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v12, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v4
-; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v11, v21, v11
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, v17, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v21|, |v15|
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, v18, v2
+; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v19, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v16, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v10
+; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, v20, v11
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
-; GFX10-NEXT:    v_sub_nc_u32_sdwa v1, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-NEXT:    v_sub_nc_u32_sdwa v1, v9, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX10-NEXT:    v_mul_lo_u32 v10, v11, v12
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v12, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v12, v3
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_sub_nc_u32_sdwa v3, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -1965,74 +1939,68 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    global_load_dword v9, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x2070306
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_bfe_i32 v2, v4, 0, 8
-; GFX9-NEXT:    v_bfe_i32 v3, v4, 16, 8
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_ashrrev_i32_e32 v11, 24, v9
-; GFX9-NEXT:    v_bfe_i32 v12, v4, 8, 8
-; GFX9-NEXT:    v_xor_b32_e32 v16, v3, v2
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX9-NEXT:    v_ashrrev_i32_e32 v13, 24, v4
-; GFX9-NEXT:    v_xor_b32_e32 v18, v11, v12
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v12, v12
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v17, v3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v19, v13
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v20, v2
-; GFX9-NEXT:    v_bfe_i32 v15, v9, 16, 8
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v11, v11
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v21, v12
-; GFX9-NEXT:    v_xor_b32_e32 v3, v13, v3
-; GFX9-NEXT:    v_xor_b32_e32 v13, v15, v13
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v15, v15
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v22, v17
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v23, v19
-; GFX9-NEXT:    v_mul_f32_e32 v20, v17, v20
-; GFX9-NEXT:    v_mul_f32_e32 v21, v11, v21
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v20, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v21, v14
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v22, v10
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v23, v16
+; GFX9-NEXT:    v_mul_f32_e32 v20, v10, v20
+; GFX9-NEXT:    v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX9-NEXT:    v_mul_f32_e32 v21, v13, v21
 ; GFX9-NEXT:    v_trunc_f32_e32 v20, v20
-; GFX9-NEXT:    v_ashrrev_i32_e32 v16, 30, v16
-; GFX9-NEXT:    v_mul_f32_e32 v22, v19, v22
-; GFX9-NEXT:    v_mul_f32_e32 v23, v15, v23
+; GFX9-NEXT:    v_xor_b32_sdwa v12, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
+; GFX9-NEXT:    v_mul_f32_e32 v22, v16, v22
+; GFX9-NEXT:    v_mul_f32_e32 v23, v19, v23
 ; GFX9-NEXT:    v_trunc_f32_e32 v21, v21
-; GFX9-NEXT:    v_mad_f32 v24, -v20, v2, v17
-; GFX9-NEXT:    v_ashrrev_i32_e32 v18, 30, v18
-; GFX9-NEXT:    v_or_b32_e32 v16, 1, v16
+; GFX9-NEXT:    v_mad_f32 v24, -v20, v3, v10
+; GFX9-NEXT:    v_xor_b32_sdwa v15, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v12, 30, v12
+; GFX9-NEXT:    v_or_b32_e32 v2, 1, v2
 ; GFX9-NEXT:    v_trunc_f32_e32 v22, v22
 ; GFX9-NEXT:    v_trunc_f32_e32 v23, v23
-; GFX9-NEXT:    v_mad_f32 v11, -v21, v12, v11
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v24|, |v2|
-; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
-; GFX9-NEXT:    v_or_b32_e32 v18, 1, v18
+; GFX9-NEXT:    v_mad_f32 v13, -v21, v14, v13
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v24|, |v3|
+; GFX9-NEXT:    v_xor_b32_sdwa v18, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3
+; GFX9-NEXT:    v_ashrrev_i32_e32 v15, 30, v15
+; GFX9-NEXT:    v_or_b32_e32 v12, 1, v12
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v20, v20
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v21, v21
-; GFX9-NEXT:    v_mad_f32 v25, -v22, v17, v19
+; GFX9-NEXT:    v_mad_f32 v25, -v22, v10, v16
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v22, v22
-; GFX9-NEXT:    v_mad_f32 v15, -v23, v19, v15
+; GFX9-NEXT:    v_mad_f32 v19, -v23, v16, v19
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v23, v23
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v11|, |v12|
-; GFX9-NEXT:    v_ashrrev_i32_e32 v13, 30, v13
-; GFX9-NEXT:    v_or_b32_e32 v3, 1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, 0, v18, vcc
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v25|, |v17|
-; GFX9-NEXT:    v_or_b32_e32 v13, 1, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v15|, |v19|
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, 0, v13, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v13|, |v14|
+; GFX9-NEXT:    v_ashrrev_i32_e32 v18, 30, v18
+; GFX9-NEXT:    v_or_b32_e32 v15, 1, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v12, vcc
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v25|, |v10|
+; GFX9-NEXT:    v_or_b32_e32 v18, 1, v18
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v15, vcc
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v19|, |v16|
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, 0, v18, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 24, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 24, v4
 ; GFX9-NEXT:    v_add_u32_e32 v2, v20, v2
-; GFX9-NEXT:    v_add_u32_e32 v11, v21, v11
-; GFX9-NEXT:    v_add_u32_e32 v3, v22, v3
+; GFX9-NEXT:    v_add_u32_e32 v3, v21, v3
+; GFX9-NEXT:    v_add_u32_e32 v10, v22, v10
 ; GFX9-NEXT:    v_add_u32_e32 v12, v23, v12
 ; GFX9-NEXT:    v_perm_b32 v1, v4, v9, s4
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX9-NEXT:    v_mul_lo_u32 v4, v11, v10
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v0
-; GFX9-NEXT:    v_mul_lo_u32 v10, v12, v14
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v11
+; GFX9-NEXT:    v_mul_lo_u32 v4, v10, v0
+; GFX9-NEXT:    v_mul_lo_u32 v10, v12, v17
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_sub_u32_sdwa v2, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_sub_u32_e32 v3, v14, v3
+; GFX9-NEXT:    v_sub_u32_sdwa v2, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_u32_e32 v3, v17, v4
 ; GFX9-NEXT:    v_sub_u32_sdwa v4, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -2090,27 +2058,24 @@ define hidden void @sub_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
-; GFX9-NEXT:    global_load_dword v2, v[2:3], off
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    global_load_dword v9, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x6070007
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX9-NEXT:    v_sub_u16_sdwa v9, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v4, v2, v0, s4
-; GFX9-NEXT:    v_sub_u16_sdwa v0, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_sub_u16_e32 v2, v3, v2
-; GFX9-NEXT:    v_sub_u16_e32 v1, v3, v1
-; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    global_store_dword v[5:6], v0, off
-; GFX9-NEXT:    global_store_dword v[7:8], v4, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4
+; GFX9-NEXT:    v_sub_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_sub_u16_sdwa v2, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_3
+; GFX9-NEXT:    v_sub_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:WORD_1
+; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    global_store_dword v[5:6], v1, off
+; GFX9-NEXT:    global_store_dword v[7:8], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
index 77e1694dbe7e1..470b17f5cc560 100644
--- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
+++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
@@ -1840,10 +1840,9 @@ define <2 x i16> @v_mul_sub_x_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX8-LABEL: v_mul_sub_x_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    v_mul_lo_u16_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_mul_lo_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v1, v0, v1
-; GFX8-NEXT:    v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
index e2854df2468b3..8d374e19bafad 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=VI -check-prefix=GFX89 -check-prefix=GCN %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=GCN %s
 
@@ -468,3 +469,8 @@ body:             |
     S_ENDPGM 0, implicit %7
 
 ...
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# GCN: {{.*}}
+# GFX89: {{.*}}
+# GFX9: {{.*}}
+# VI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index c9dbadcbd2315..c2930313271a9 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -1557,7 +1557,8 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX89-NEXT:    s_waitcnt vmcnt(1)
 ; GFX89-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_mac_f16_sdwa v4, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX89-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX89-NEXT:    v_mac_f16_sdwa v4, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX89-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX89-NEXT:    v_mac_f16_e32 v2, v3, v2
 ; GFX89-NEXT:    v_or_b32_e32 v2, v2, v4
@@ -1718,18 +1719,16 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX89-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX89-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; GFX89-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX89-NEXT:    flat_load_dword v4, v[0:1]
 ; GFX89-NEXT:    flat_load_dword v2, v[2:3]
-; GFX89-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX89-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX89-NEXT:    v_mov_b32_e32 v1, s5
-; GFX89-NEXT:    s_waitcnt vmcnt(1)
-; GFX89-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_mul_lo_u16_e32 v5, v3, v2
-; GFX89-NEXT:    v_mul_lo_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX89-NEXT:    v_mul_lo_u16_e32 v2, v5, v2
-; GFX89-NEXT:    v_mul_lo_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX89-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX89-NEXT:    v_mul_lo_u16_sdwa v3, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX89-NEXT:    v_mul_lo_u16_e32 v4, v4, v2
+; GFX89-NEXT:    v_mul_lo_u16_e32 v4, v4, v2
+; GFX89-NEXT:    v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX89-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX89-NEXT:    flat_store_dword v[0:1], v2
 ; GFX89-NEXT:    s_endpgm
 ;
diff --git a/update.bat b/update.bat
new file mode 100644
index 0000000000000..b7cd2c4ba9d97
--- /dev/null
+++ b/update.bat
@@ -0,0 +1,21 @@
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/fract-match.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/idot4u.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/permute_i8.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/v_mac_f16.ll

>From 31c940f41b83a70bf04515d06e0ef70254dc51b5 Mon Sep 17 00:00:00 2001
From: Brian Favela <brianfavela at microsoft.com>
Date: Fri, 7 Jun 2024 14:56:59 -0400
Subject: [PATCH 3/5] Updating tests and removing superfluous check

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp     |  9 +-
 .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll |  9 +-
 .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll |  9 +-
 llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll  |  3 +-
 .../CodeGen/AMDGPU/sdwa-peephole-instr.mir    |  6 --
 llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll     | 88 +++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll     |  6 +-
 update.bat                                    | 21 -----
 8 files changed, 101 insertions(+), 50 deletions(-)
 delete mode 100644 update.bat

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 43348a0f68b13..082aeeea2c7cc 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -45,7 +45,7 @@ class SDWADstOperand;
 using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
 
 // helper typedef to make code cleaner
-typedef std::unordered_map<MachineInstr *, SDWAOperandsVector> SDWAOperandsMap;
+typedef MapVector<MachineInstr *, SDWAOperandsVector> SDWAOperandsMap;
 
 class SIPeepholeSDWA : public MachineFunctionPass {
 private:
@@ -53,7 +53,7 @@ class SIPeepholeSDWA : public MachineFunctionPass {
   const SIRegisterInfo *TRI;
   const SIInstrInfo *TII;
 
-  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
+  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
   SDWAOperandsMap PotentialMatches;
   SmallVector<MachineInstr *, 8> ConvertedInstructions;
 
@@ -356,11 +356,6 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
       if (!isConvertibleToSDWA(*(UseMO.getParent()), ST, TII)) {
         return nullptr;
       }
-
-      // Not handling the obscure case where the same use is in multiple operands
-      if (PotentialMatches->find(UseMO.getParent()) != PotentialMatches->end()) {
-        return nullptr;
-      }
     }
     // Now that it's guaranteed all uses are legal, iterate over the uses again
     // to add them for later conversion.
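With the early-out above gone, a use instruction that reads the matched register in more than
one operand simply collects several entries under the same key in SDWAOperandsMap, and all of
them can be folded later. A minimal, self-contained sketch of that accumulation, using std::map
and stand-in structs instead of the real MapVector/MachineInstr/SDWAOperand classes
(assumptions for illustration only, not the upstream code):

    #include <cassert>
    #include <map>
    #include <vector>

    struct MachineInstr { int Opcode; };   // stand-in for llvm::MachineInstr
    struct SDWAOperand  { int SrcSel;  };  // stand-in for the pass's SDWAOperand

    // One use instruction may now own several matched operands instead of
    // being rejected as soon as a second match for it shows up.
    using OperandsMap = std::map<MachineInstr *, std::vector<SDWAOperand *>>;

    int main() {
      MachineInstr Use{0};       // e.g. a v_mac_f16 reading the vreg twice
      SDWAOperand Lo{0}, Hi{1};  // two selections of the same source reg

      OperandsMap PotentialMatches;
      PotentialMatches[&Use].push_back(&Lo);  // first operand of the use
      PotentialMatches[&Use].push_back(&Hi);  // second operand, same instruction

      assert(PotentialMatches[&Use].size() == 2);
      return 0;
    }
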
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 04da7b24156dc..168e6dfa5f147 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -298,9 +298,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -607,12 +606,10 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_add_u16_e32 v3, v3, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 8, v2
 ; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index d7b1e37a81a1b..2572f8581f0ed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -279,11 +279,9 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_max_i16_e32 v1, v1, v2
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v3, v1
-; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -300,9 +298,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index 663b009116286..cfdb4f7a07d02 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -679,7 +679,8 @@ define amdgpu_kernel void @fmuladd_v2f16(
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(1)
 ; VI-FLUSH-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; VI-FLUSH-NEXT:    v_mac_f16_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-FLUSH-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v3, v5, v4
 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v0, v2, v1
 ; VI-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
index 8d374e19bafad..e2854df2468b3 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
@@ -1,4 +1,3 @@
-# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=VI -check-prefix=GFX89 -check-prefix=GCN %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=GCN %s
 
@@ -469,8 +468,3 @@ body:             |
     S_ENDPGM 0, implicit %7
 
 ...
-## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-# GCN: {{.*}}
-# GFX89: {{.*}}
-# GFX9: {{.*}}
-# VI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index c2930313271a9..bd0808e7e359d 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -2204,6 +2204,94 @@ bb2:
   br label %bb0
 }
 
+define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
+; NOSDWA-LABEL: mac_v2half_same_srcop:
+; NOSDWA:       ; %bb.0: ; %entry
+; NOSDWA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; NOSDWA-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT:    s_waitcnt lgkmcnt(0)
+; NOSDWA-NEXT:    v_mov_b32_e32 v0, s6
+; NOSDWA-NEXT:    v_mov_b32_e32 v2, s0
+; NOSDWA-NEXT:    v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT:    v_mov_b32_e32 v1, s7
+; NOSDWA-NEXT:    flat_load_dword v2, v[2:3]
+; NOSDWA-NEXT:    flat_load_dword v3, v[0:1]
+; NOSDWA-NEXT:    v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT:    v_mov_b32_e32 v1, s5
+; NOSDWA-NEXT:    s_waitcnt vmcnt(1)
+; NOSDWA-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; NOSDWA-NEXT:    s_waitcnt vmcnt(0)
+; NOSDWA-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; NOSDWA-NEXT:    v_mac_f16_e32 v5, v4, v4
+; NOSDWA-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; NOSDWA-NEXT:    v_mac_f16_e32 v3, v2, v2
+; NOSDWA-NEXT:    v_or_b32_e32 v2, v3, v4
+; NOSDWA-NEXT:    flat_store_dword v[0:1], v2
+; NOSDWA-NEXT:    s_endpgm
+;
+; GFX89-LABEL: mac_v2half_same_srcop:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    v_mov_b32_e32 v0, s6
+; GFX89-NEXT:    v_mov_b32_e32 v1, s7
+; GFX89-NEXT:    v_mov_b32_e32 v2, s0
+; GFX89-NEXT:    v_mov_b32_e32 v3, s1
+; GFX89-NEXT:    flat_load_dword v4, v[0:1]
+; GFX89-NEXT:    flat_load_dword v2, v[2:3]
+; GFX89-NEXT:    v_mov_b32_e32 v0, s4
+; GFX89-NEXT:    v_mov_b32_e32 v1, s5
+; GFX89-NEXT:    s_waitcnt vmcnt(1)
+; GFX89-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_mac_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX89-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX89-NEXT:    v_mac_f16_e32 v4, v2, v2
+; GFX89-NEXT:    v_or_b32_e32 v2, v4, v3
+; GFX89-NEXT:    flat_store_dword v[0:1], v2
+; GFX89-NEXT:    s_endpgm
+;
+; GFX9-LABEL: mac_v2half_same_srcop:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_pk_add_f16 v1, v1, v2
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: mac_v2half_same_srcop:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_pk_add_f16 v1, v1, v2
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_endpgm
+entry:
+  %a = load <2 x half>, ptr addrspace(1) %ina, align 4
+  %b = load <2 x half>, ptr addrspace(1) %inb, align 4
+  %mul = fmul <2 x half> %b, %b
+  %mac = fadd <2 x half> %mul, %a
+  store <2 x half> %mac, ptr addrspace(1) %out, align 4
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
 
 attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index d7a6be5110691..f8c9827ecf7a9 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -32,12 +32,12 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0
 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0]
 
+; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
 ; VI-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; VI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; VI-DAG: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, [[ZERO]], v{{[0-9]+}}
 ; VI-DAG: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
 ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[TWO]]  dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NOT: v_and_b32
diff --git a/update.bat b/update.bat
deleted file mode 100644
index b7cd2c4ba9d97..0000000000000
--- a/update.bat
+++ /dev/null
@@ -1,21 +0,0 @@
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/fract-match.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/idiv-licm.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/idot4u.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/permute_i8.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
-llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/v_mac_f16.ll

>From b5e2d06ee9b85b697e640a3bfa198db51eed1132 Mon Sep 17 00:00:00 2001
From: Brian Favela <brianfavela at microsoft.com>
Date: Fri, 7 Jun 2024 16:59:50 -0400
Subject: [PATCH 4/5] Addressing review concerns

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 082aeeea2c7cc..64b7e2d3a4147 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -43,9 +43,7 @@ class SDWAOperand;
 class SDWADstOperand;
 
 using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
-
-// helper typedef to make code cleaner
-typedef MapVector<MachineInstr *, SDWAOperandsVector> SDWAOperandsMap;
+using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;
 
 class SIPeepholeSDWA : public MachineFunctionPass {
 private:
@@ -347,13 +345,14 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
       return nullptr;
     }
 
-    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
+
+    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg())) {
       // If there exist use of subreg of Reg then return nullptr
       if (!isSameReg(UseMO, *Reg))
         return nullptr;
 
       // Check that all instructions the use Reg can be converted
-      if (!isConvertibleToSDWA(*(UseMO.getParent()), ST, TII)) {
+      if (!isConvertibleToSDWA(UseMI, ST, TII)) {
         return nullptr;
       }
     }

>From 0ef2512497d8c132e964a20dfff5dfa3d22646a1 Mon Sep 17 00:00:00 2001
From: Brian Favela <brianfavela at microsoft.com>
Date: Fri, 7 Jun 2024 17:23:07 -0400
Subject: [PATCH 5/5] Address review and update tests

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp    | 11 ++++-----
 llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll | 25 +++++++++-----------
 llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll    |  3 +--
 3 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 64b7e2d3a4147..8f56793d7bcc6 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -345,20 +345,19 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
       return nullptr;
     }
 
-
     for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg())) {
-      // If there exist use of subreg of Reg then return nullptr
-      if (!isSameReg(UseMO, *Reg))
-        return nullptr;
-
-      // Check that all instructions the use Reg can be converted
+      // Check that all instructions that use Reg can be converted
       if (!isConvertibleToSDWA(UseMI, ST, TII)) {
         return nullptr;
       }
     }
+
     // Now that it's guaranteed all uses are legal, iterate over the uses again
     // to add them for later conversion.
     for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
+      // Should not get a subregister here
+      assert(isSameReg(UseMO, *Reg));
+
       SDWAOperandsMap& potentialMatchesMap = *PotentialMatches;
       MachineInstr* UseMI = UseMO.getParent();
       potentialMatchesMap[UseMI].push_back(this);
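After the review changes, the analysis is a clean check-then-record pass over the uses: the
first loop walks the use instructions and bails out if any of them cannot take an SDWA operand,
and the second loop walks the use operands and records each one, asserting that no subregister
use reached this point. A short, self-contained sketch of that shape with simplified stand-in
types (the real code walks MachineRegisterInfo use lists; this is an illustration only):

    #include <cassert>
    #include <map>
    #include <vector>

    struct UseInstr { bool ConvertibleToSDWA; bool ReadsFullReg; };

    // Pass 1: reject the whole fold if any user of the register cannot be
    // rewritten into an SDWA instruction.
    bool allUsesConvertible(const std::vector<UseInstr> &Uses) {
      for (const UseInstr &U : Uses)
        if (!U.ConvertibleToSDWA)
          return false;
      return true;
    }

    // Pass 2: every use is now known to be legal, so record it for the later
    // conversion step; a subregister use here would be a matcher bug.
    void recordUses(std::vector<UseInstr> &Uses,
                    std::map<UseInstr *, std::vector<int>> &Matches,
                    int OperandId) {
      for (UseInstr &U : Uses) {
        assert(U.ReadsFullReg && "subregister uses should have been rejected");
        Matches[&U].push_back(OperandId);
      }
    }

    int main() {
      std::vector<UseInstr> Uses = {{true, true}, {true, true}};
      std::map<UseInstr *, std::vector<int>> Matches;
      if (allUsesConvertible(Uses))
        recordUses(Uses, Matches, /*OperandId=*/0);
      assert(Matches.size() == 2);
      return 0;
    }
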
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index cfdb4f7a07d02..a2e30603b6afc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -659,31 +659,28 @@ define amdgpu_kernel void @fmuladd_v2f16(
 ; VI-FLUSH-NEXT:    s_mov_b32 s14, s10
 ; VI-FLUSH-NEXT:    s_mov_b32 s15, s11
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    s_mov_b32 s12, s2
+; VI-FLUSH-NEXT:    s_mov_b32 s13, s3
 ; VI-FLUSH-NEXT:    s_mov_b32 s16, s4
 ; VI-FLUSH-NEXT:    s_mov_b32 s17, s5
+; VI-FLUSH-NEXT:    s_mov_b32 s18, s10
+; VI-FLUSH-NEXT:    s_mov_b32 s19, s11
 ; VI-FLUSH-NEXT:    s_mov_b32 s4, s6
 ; VI-FLUSH-NEXT:    s_mov_b32 s5, s7
 ; VI-FLUSH-NEXT:    s_mov_b32 s6, s10
 ; VI-FLUSH-NEXT:    s_mov_b32 s7, s11
-; VI-FLUSH-NEXT:    s_mov_b32 s12, s2
-; VI-FLUSH-NEXT:    s_mov_b32 s13, s3
-; VI-FLUSH-NEXT:    s_mov_b32 s18, s10
-; VI-FLUSH-NEXT:    s_mov_b32 s19, s11
-; VI-FLUSH-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; VI-FLUSH-NEXT:    buffer_load_dword v1, off, s[16:19], 0
-; VI-FLUSH-NEXT:    buffer_load_dword v2, off, s[12:15], 0
+; VI-FLUSH-NEXT:    buffer_load_dword v0, off, s[12:15], 0
+; VI-FLUSH-NEXT:    buffer_load_dword v1, off, s[4:7], 0
+; VI-FLUSH-NEXT:    buffer_load_dword v2, off, s[16:19], 0
 ; VI-FLUSH-NEXT:    s_mov_b32 s8, s0
 ; VI-FLUSH-NEXT:    s_mov_b32 s9, s1
-; VI-FLUSH-NEXT:    s_waitcnt vmcnt(2)
-; VI-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(1)
-; VI-FLUSH-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; VI-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; VI-FLUSH-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; VI-FLUSH-NEXT:    v_mac_f16_e32 v3, v5, v4
+; VI-FLUSH-NEXT:    v_mac_f16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; VI-FLUSH-NEXT:    v_mac_f16_e32 v0, v2, v1
-; VI-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v3
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v1, v0, v2
+; VI-FLUSH-NEXT:    v_or_b32_e32 v0, v1, v3
 ; VI-FLUSH-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-FLUSH-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index bd0808e7e359d..0f2eedb1923d6 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -1557,8 +1557,7 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX89-NEXT:    s_waitcnt vmcnt(1)
 ; GFX89-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX89-NEXT:    v_mac_f16_sdwa v4, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX89-NEXT:    v_mac_f16_sdwa v4, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX89-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX89-NEXT:    v_mac_f16_e32 v2, v3, v2
 ; GFX89-NEXT:    v_or_b32_e32 v2, v2, v4


