[llvm] 3ce1b96 - [AMDGPU] Switch PostRA sched to MachineSched

Joe Nash via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 14 12:28:40 PDT 2021


Author: Joe Nash
Date: 2021-09-14T15:11:27-04:00
New Revision: 3ce1b9631a50d4853ab6d5750eaf50951b49e89d

URL: https://github.com/llvm/llvm-project/commit/3ce1b9631a50d4853ab6d5750eaf50951b49e89d
DIFF: https://github.com/llvm/llvm-project/commit/3ce1b9631a50d4853ab6d5750eaf50951b49e89d.diff

LOG: [AMDGPU] Switch PostRA sched to MachineSched

Use GCNHazardRecognizer in the post-RA scheduler.
Update the tests for the new schedules.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D109536

Change-Id: Ia86ba2ae168f12fb34b4d8efdab491f84d936cde

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/GCNSubtarget.h
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
    llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
    llvm/test/CodeGen/AMDGPU/add3.ll
    llvm/test/CodeGen/AMDGPU/agpr-remat.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
    llvm/test/CodeGen/AMDGPU/anyext.ll
    llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
    llvm/test/CodeGen/AMDGPU/bitreverse.ll
    llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll
    llvm/test/CodeGen/AMDGPU/bypass-div.ll
    llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
    llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
    llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
    llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
    llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
    llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
    llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
    llvm/test/CodeGen/AMDGPU/cc-update.ll
    llvm/test/CodeGen/AMDGPU/cluster_stores.ll
    llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
    llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
    llvm/test/CodeGen/AMDGPU/ctpop16.ll
    llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
    llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
    llvm/test/CodeGen/AMDGPU/ds-alignment.ll
    llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
    llvm/test/CodeGen/AMDGPU/ds_read2.ll
    llvm/test/CodeGen/AMDGPU/ds_write2.ll
    llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
    llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
    llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
    llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
    llvm/test/CodeGen/AMDGPU/fexp.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
    llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
    llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
    llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll
    llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
    llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
    llvm/test/CodeGen/AMDGPU/fpext.f16.ll
    llvm/test/CodeGen/AMDGPU/fpow.ll
    llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
    llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
    llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
    llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
    llvm/test/CodeGen/AMDGPU/frem.ll
    llvm/test/CodeGen/AMDGPU/fshl.ll
    llvm/test/CodeGen/AMDGPU/fshr.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
    llvm/test/CodeGen/AMDGPU/half.ll
    llvm/test/CodeGen/AMDGPU/idiv-licm.ll
    llvm/test/CodeGen/AMDGPU/idot2.ll
    llvm/test/CodeGen/AMDGPU/idot4s.ll
    llvm/test/CodeGen/AMDGPU/idot4u.ll
    llvm/test/CodeGen/AMDGPU/idot8s.ll
    llvm/test/CodeGen/AMDGPU/idot8u.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
    llvm/test/CodeGen/AMDGPU/kernel-args.ll
    llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
    llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
    llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
    llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
    llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
    llvm/test/CodeGen/AMDGPU/load-global-i16.ll
    llvm/test/CodeGen/AMDGPU/load-local.128.ll
    llvm/test/CodeGen/AMDGPU/load-local.96.ll
    llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
    llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
    llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
    llvm/test/CodeGen/AMDGPU/max.i16.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
    llvm/test/CodeGen/AMDGPU/memory_clause.ll
    llvm/test/CodeGen/AMDGPU/min.ll
    llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
    llvm/test/CodeGen/AMDGPU/mul.i16.ll
    llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
    llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
    llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
    llvm/test/CodeGen/AMDGPU/saddo.ll
    llvm/test/CodeGen/AMDGPU/saddsat.ll
    llvm/test/CodeGen/AMDGPU/sdiv.ll
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/select.f16.ll
    llvm/test/CodeGen/AMDGPU/select64.ll
    llvm/test/CodeGen/AMDGPU/shift-i128.ll
    llvm/test/CodeGen/AMDGPU/shl.ll
    llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
    llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
    llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
    llvm/test/CodeGen/AMDGPU/sign_extend.ll
    llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
    llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
    llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
    llvm/test/CodeGen/AMDGPU/sra.ll
    llvm/test/CodeGen/AMDGPU/srem64.ll
    llvm/test/CodeGen/AMDGPU/srl.ll
    llvm/test/CodeGen/AMDGPU/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/stack-realign.ll
    llvm/test/CodeGen/AMDGPU/store-local.128.ll
    llvm/test/CodeGen/AMDGPU/store-local.96.ll
    llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
    llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
    llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
    llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
    llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
    llvm/test/CodeGen/AMDGPU/trunc.ll
    llvm/test/CodeGen/AMDGPU/uaddsat.ll
    llvm/test/CodeGen/AMDGPU/udiv64.ll
    llvm/test/CodeGen/AMDGPU/udivrem.ll
    llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll
    llvm/test/CodeGen/AMDGPU/usubsat.ll
    llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
    llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
    llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
    llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
    llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
    llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
    llvm/test/CodeGen/AMDGPU/xor3.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 9b8a9c0b40590..631c6b9188662 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1103,6 +1103,11 @@ void GCNSubtarget::getPostRAMutations(
   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
 }
 
+std::unique_ptr<ScheduleDAGMutation>
+GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
+  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
+}
+
 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index abe9783565b9e..97e0af073ad55 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -809,6 +809,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
     // allow calls without EnableAMDGPUFunctionCalls if they are marked
     // noinline, so this is always required.
     setRequiresCodeGenSCCOrder(true);
+    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
   }
 
   GCNTargetMachine &getGCNTargetMachine() const {
@@ -818,6 +819,14 @@ class GCNPassConfig final : public AMDGPUPassConfig {
   ScheduleDAGInstrs *
   createMachineScheduler(MachineSchedContext *C) const override;
 
+  ScheduleDAGInstrs *
+  createPostMachineScheduler(MachineSchedContext *C) const override {
+    ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
+    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
+    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
+    return DAG;
+  }
+
   bool addPreISel() override;
   void addMachineSSAOptimization() override;
   bool addILPOpts() override;

diff  --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 61cd30967e5d1..0245190a1c731 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1130,6 +1130,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
       const override;
 
+  std::unique_ptr<ScheduleDAGMutation>
+  createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
+
   bool isWave32() const {
     return getWavefrontSize() == 32;
   }

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2461ba1a811a3..53ae471971f08 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/LiveVariables.h"
 #include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/IR/DiagnosticInfo.h"
@@ -7462,6 +7463,20 @@ SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const
   return new GCNHazardRecognizer(MF);
 }
 
+// Called during both pre-RA and post-RA scheduling; only the post-RA case
+// returns a GCNHazardRecognizer.
+ScheduleHazardRecognizer *
+SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
+                                            const ScheduleDAGMI *DAG) const {
+  // Borrowed from the ARM target.
+  // We would like to restrict this hazard recognizer to post-RA scheduling
+  // only; we can tell that we are post-RA because the DAG does not track
+  // VRegLiveness.
+  if (!DAG->hasVRegLiveness())
+    return new GCNHazardRecognizer(DAG->MF);
+  return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
+}
+
 std::pair<unsigned, unsigned>
 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
   return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index b55f04a4fc59b..248b46139f519 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1048,6 +1048,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   ScheduleHazardRecognizer *
   CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override;
 
+  ScheduleHazardRecognizer *
+  CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
+                                 const ScheduleDAGMI *DAG) const override;
+
   bool isBasicBlockPrologue(const MachineInstr &MI) const override;
 
   MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
index 1a8672c8d1343..1578076411dc0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -210,8 +210,8 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffff
-; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s1, 0xffc0
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s3
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s1
@@ -312,8 +312,8 @@ define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s3
+; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s3
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s4
@@ -351,8 +351,8 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg
 ; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s3
+; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s3
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s4
@@ -393,8 +393,8 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg
 ; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s3
+; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s3
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s4
@@ -423,8 +423,8 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha
 ; GFX9-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b32 s2, 0x80008000
-; GFX9-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX9-NEXT:    s_xor_b32 s0, s0, s2
+; GFX9-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX9-NEXT:    s_add_i32 s0, s0, s1
@@ -435,12 +435,12 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha
 ; GFX8-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_mov_b32 s2, 0x80008000
-; GFX8-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX8-NEXT:    s_xor_b32 s0, s0, s2
+; GFX8-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s3
+; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s3
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s4
@@ -452,10 +452,10 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha
 ; GFX10-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_mov_b32 s2, 0x80008000
-; GFX10-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX10-NEXT:    s_xor_b32 s0, s0, s2
-; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX10-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX10-NEXT:    s_add_i32 s0, s0, s1
 ; GFX10-NEXT:    s_add_i32 s2, s2, s3
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index d105c0062e73e..2e117b2e0f37c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -430,8 +430,8 @@ define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1
 ; GFX6-LABEL: s_andn2_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
+; GFX6-NEXT:    s_and_b32 s2, s2, s1
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
 ; GFX6-NEXT:    s_and_b32 s1, s4, s1
@@ -459,8 +459,8 @@ define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inr
 ; GFX6-LABEL: s_andn2_v2i16_commute:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
+; GFX6-NEXT:    s_and_b32 s2, s2, s1
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
 ; GFX6-NEXT:    s_and_b32 s1, s4, s1
@@ -488,8 +488,8 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2
 ; GFX6-LABEL: s_andn2_v2i16_multi_use:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
+; GFX6-NEXT:    s_and_b32 s2, s2, s1
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
 ; GFX6-NEXT:    s_and_b32 s1, s4, s1
@@ -526,8 +526,8 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
 ; GFX6-NEXT:    s_and_b32 s2, s2, s1
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_and_b32 s3, s4, s1
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
+; GFX6-NEXT:    s_and_b32 s3, s4, s1
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
 ; GFX6-NEXT:    s_and_b32 s1, s6, s1
@@ -633,11 +633,11 @@ define amdgpu_ps i64 @s_andn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1
 ; GFX6-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX6-NEXT:    s_and_b32 s1, s2, s3
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s2, s4, s3
 ; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX6-NEXT:    s_and_b32 s2, s4, s3
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s4, s6, s3
 ; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
+; GFX6-NEXT:    s_and_b32 s4, s6, s3
 ; GFX6-NEXT:    s_or_b32 s2, s2, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
 ; GFX6-NEXT:    s_and_b32 s3, s8, s3
@@ -676,11 +676,11 @@ define amdgpu_ps i64 @s_andn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inr
 ; GFX6-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX6-NEXT:    s_and_b32 s1, s2, s3
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s2, s4, s3
 ; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX6-NEXT:    s_and_b32 s2, s4, s3
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s4, s6, s3
 ; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
+; GFX6-NEXT:    s_and_b32 s4, s6, s3
 ; GFX6-NEXT:    s_or_b32 s2, s2, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
 ; GFX6-NEXT:    s_and_b32 s3, s8, s3
@@ -719,11 +719,11 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_use(<4 x i16> inreg %src0, <4
 ; GFX6-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX6-NEXT:    s_and_b32 s1, s2, s3
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s2, s4, s3
 ; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX6-NEXT:    s_and_b32 s2, s4, s3
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s4, s6, s3
 ; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
+; GFX6-NEXT:    s_and_b32 s4, s6, s3
 ; GFX6-NEXT:    s_or_b32 s2, s2, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
 ; GFX6-NEXT:    s_and_b32 s3, s8, s3
@@ -773,8 +773,8 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_foldable_use(<4 x i16> inreg
 ; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
 ; GFX6-NEXT:    s_and_b32 s2, s4, s14
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s3, s6, s14
 ; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
+; GFX6-NEXT:    s_and_b32 s3, s6, s14
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
 ; GFX6-NEXT:    s_and_b32 s4, s8, s14
@@ -831,8 +831,8 @@ define <4 x i16> @v_andn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX6-NEXT:    v_and_b32_e32 v2, v2, v8
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, v4, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX6-NEXT:    v_and_b32_e32 v3, v4, v8
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX6-NEXT:    v_and_b32_e32 v4, v6, v8

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index ece08668dcd65..f739d915a3b0c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -810,11 +810,11 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_mov_b32 s3, 0x100010
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s0
-; GFX8-NEXT:    s_sext_i32_i16 s4, s1
 ; GFX8-NEXT:    s_bfe_i32 s0, s0, s3
+; GFX8-NEXT:    s_sext_i32_i16 s4, s1
 ; GFX8-NEXT:    s_bfe_i32 s1, s1, s3
-; GFX8-NEXT:    s_ashr_i32 s0, s0, s1
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, s4
+; GFX8-NEXT:    s_ashr_i32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
@@ -823,8 +823,8 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 ; GFX9-LABEL: s_ashr_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sext_i32_i16 s2, s0
-; GFX9-NEXT:    s_sext_i32_i16 s3, s1
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s3, s1
 ; GFX9-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, s3
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, s1
@@ -834,8 +834,8 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 ; GFX10-LABEL: s_ashr_v2i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_sext_i32_i16 s2, s0
-; GFX10-NEXT:    s_sext_i32_i16 s3, s1
 ; GFX10-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX10-NEXT:    s_sext_i32_i16 s3, s1
 ; GFX10-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX10-NEXT:    s_ashr_i32 s2, s2, s3
 ; GFX10-NEXT:    s_ashr_i32 s0, s0, s1
@@ -948,10 +948,10 @@ define <2 x float> @v_ashr_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_and_b32_e32 v4, s4, v6
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_and_b32_e32 v4, s4, v7
 ; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -1004,10 +1004,10 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, s4
 ; GFX6-NEXT:    s_and_b32 s4, s6, s8
 ; GFX6-NEXT:    s_sext_i32_i16 s2, s2
-; GFX6-NEXT:    s_and_b32 s1, s1, s8
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, s4
 ; GFX6-NEXT:    s_and_b32 s4, s7, s8
 ; GFX6-NEXT:    s_sext_i32_i16 s3, s3
+; GFX6-NEXT:    s_and_b32 s1, s1, s8
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, s4
 ; GFX6-NEXT:    s_and_b32 s0, s0, s8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
@@ -1022,18 +1022,18 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_mov_b32 s5, 0x100010
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s0
-; GFX8-NEXT:    s_sext_i32_i16 s7, s2
-; GFX8-NEXT:    s_sext_i32_i16 s6, s1
-; GFX8-NEXT:    s_sext_i32_i16 s8, s3
 ; GFX8-NEXT:    s_bfe_i32 s0, s0, s5
-; GFX8-NEXT:    s_bfe_i32 s2, s2, s5
+; GFX8-NEXT:    s_sext_i32_i16 s6, s1
 ; GFX8-NEXT:    s_bfe_i32 s1, s1, s5
+; GFX8-NEXT:    s_sext_i32_i16 s7, s2
+; GFX8-NEXT:    s_bfe_i32 s2, s2, s5
+; GFX8-NEXT:    s_sext_i32_i16 s8, s3
 ; GFX8-NEXT:    s_bfe_i32 s3, s3, s5
+; GFX8-NEXT:    s_ashr_i32 s4, s4, s7
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s2
+; GFX8-NEXT:    s_ashr_i32 s2, s6, s8
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, s3
-; GFX8-NEXT:    s_ashr_i32 s4, s4, s7
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffff
-; GFX8-NEXT:    s_ashr_i32 s2, s6, s8
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_and_b32 s4, s4, s3
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
@@ -1045,15 +1045,15 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ; GFX9-LABEL: s_ashr_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s0
-; GFX9-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NEXT:    s_ashr_i32 s0, s0, s2
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, s5
+; GFX9-NEXT:    s_ashr_i32 s0, s0, s2
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s0
 ; GFX9-NEXT:    s_sext_i32_i16 s2, s1
-; GFX9-NEXT:    s_sext_i32_i16 s4, s3
 ; GFX9-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX9-NEXT:    s_sext_i32_i16 s4, s3
 ; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, s4
 ; GFX9-NEXT:    s_ashr_i32 s1, s1, s3
@@ -1063,14 +1063,14 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ; GFX10-LABEL: s_ashr_v4i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_sext_i32_i16 s4, s0
-; GFX10-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX10-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX10-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, s5
 ; GFX10-NEXT:    s_ashr_i32 s0, s0, s2
 ; GFX10-NEXT:    s_sext_i32_i16 s2, s1
-; GFX10-NEXT:    s_sext_i32_i16 s5, s3
 ; GFX10-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX10-NEXT:    s_sext_i32_i16 s5, s3
 ; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX10-NEXT:    s_ashr_i32 s2, s2, s5
 ; GFX10-NEXT:    s_ashr_i32 s1, s1, s3
@@ -1125,28 +1125,28 @@ define <4 x float> @v_ashr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, v8, v3
 ; GFX6-NEXT:    v_and_b32_e32 v8, s4, v12
 ; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 16
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_and_b32_e32 v8, s4, v13
 ; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT:    v_and_b32_e32 v1, v1, v16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, v8, v5
 ; GFX6-NEXT:    v_and_b32_e32 v8, s4, v14
 ; GFX6-NEXT:    v_bfe_i32 v6, v6, 0, 16
 ; GFX6-NEXT:    v_and_b32_e32 v0, v0, v16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v6, v8, v6
+; GFX6-NEXT:    v_and_b32_e32 v8, v15, v16
+; GFX6-NEXT:    v_bfe_i32 v7, v7, 0, 16
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, v2, v16
 ; GFX6-NEXT:    v_and_b32_e32 v2, v3, v16
-; GFX6-NEXT:    v_and_b32_e32 v8, v15, v16
-; GFX6-NEXT:    v_bfe_i32 v7, v7, 0, 16
-; GFX6-NEXT:    v_and_b32_e32 v3, v5, v16
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, v8, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, v5, v16
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, v4, v16
-; GFX6-NEXT:    v_and_b32_e32 v4, v7, v16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_and_b32_e32 v4, v7, v16
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_and_b32_e32 v3, v6, v16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -1211,28 +1211,28 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, s8
 ; GFX6-NEXT:    s_and_b32 s8, s12, s16
 ; GFX6-NEXT:    s_sext_i32_i16 s4, s4
-; GFX6-NEXT:    s_and_b32 s1, s1, s16
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, s8
 ; GFX6-NEXT:    s_and_b32 s8, s13, s16
 ; GFX6-NEXT:    s_sext_i32_i16 s5, s5
+; GFX6-NEXT:    s_and_b32 s1, s1, s16
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, s8
 ; GFX6-NEXT:    s_and_b32 s8, s14, s16
 ; GFX6-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX6-NEXT:    s_and_b32 s0, s0, s16
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_ashr_i32 s6, s6, s8
+; GFX6-NEXT:    s_and_b32 s8, s15, s16
+; GFX6-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, s16
 ; GFX6-NEXT:    s_and_b32 s2, s3, s16
-; GFX6-NEXT:    s_and_b32 s8, s15, s16
-; GFX6-NEXT:    s_sext_i32_i16 s7, s7
-; GFX6-NEXT:    s_and_b32 s3, s5, s16
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s7, s7, s8
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_and_b32 s3, s5, s16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, s16
-; GFX6-NEXT:    s_and_b32 s4, s7, s16
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX6-NEXT:    s_and_b32 s4, s7, s16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_and_b32 s3, s6, s16
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
@@ -1243,38 +1243,38 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_mov_b32 s9, 0x100010
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s0
-; GFX8-NEXT:    s_sext_i32_i16 s13, s4
+; GFX8-NEXT:    s_bfe_i32 s0, s0, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s10, s1
+; GFX8-NEXT:    s_bfe_i32 s1, s1, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s3
-; GFX8-NEXT:    s_sext_i32_i16 s14, s5
-; GFX8-NEXT:    s_sext_i32_i16 s16, s7
-; GFX8-NEXT:    s_bfe_i32 s0, s0, s9
+; GFX8-NEXT:    s_bfe_i32 s3, s3, s9
+; GFX8-NEXT:    s_sext_i32_i16 s13, s4
 ; GFX8-NEXT:    s_bfe_i32 s4, s4, s9
-; GFX8-NEXT:    s_bfe_i32 s1, s1, s9
+; GFX8-NEXT:    s_sext_i32_i16 s14, s5
 ; GFX8-NEXT:    s_bfe_i32 s5, s5, s9
-; GFX8-NEXT:    s_bfe_i32 s3, s3, s9
+; GFX8-NEXT:    s_sext_i32_i16 s16, s7
 ; GFX8-NEXT:    s_bfe_i32 s7, s7, s9
-; GFX8-NEXT:    s_ashr_i32 s0, s0, s4
-; GFX8-NEXT:    s_ashr_i32 s3, s3, s7
-; GFX8-NEXT:    s_ashr_i32 s1, s1, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s11, s2
-; GFX8-NEXT:    s_sext_i32_i16 s15, s6
 ; GFX8-NEXT:    s_bfe_i32 s2, s2, s9
+; GFX8-NEXT:    s_sext_i32_i16 s15, s6
 ; GFX8-NEXT:    s_bfe_i32 s6, s6, s9
+; GFX8-NEXT:    s_ashr_i32 s0, s0, s4
 ; GFX8-NEXT:    s_ashr_i32 s4, s10, s14
+; GFX8-NEXT:    s_ashr_i32 s1, s1, s5
+; GFX8-NEXT:    s_ashr_i32 s3, s3, s7
 ; GFX8-NEXT:    s_mov_b32 s7, 0xffff
-; GFX8-NEXT:    s_ashr_i32 s2, s2, s6
 ; GFX8-NEXT:    s_ashr_i32 s5, s11, s15
+; GFX8-NEXT:    s_ashr_i32 s2, s2, s6
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_and_b32 s4, s4, s7
 ; GFX8-NEXT:    s_ashr_i32 s8, s8, s13
-; GFX8-NEXT:    s_or_b32 s1, s1, s4
 ; GFX8-NEXT:    s_ashr_i32 s6, s12, s16
+; GFX8-NEXT:    s_or_b32 s1, s1, s4
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NEXT:    s_and_b32 s4, s5, s7
-; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_and_b32 s8, s8, s7
+; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX8-NEXT:    s_and_b32 s4, s6, s7
 ; GFX8-NEXT:    s_or_b32 s0, s0, s8
@@ -1284,29 +1284,29 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX9-LABEL: s_ashr_v8i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sext_i32_i16 s8, s0
-; GFX9-NEXT:    s_sext_i32_i16 s9, s4
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s9, s4
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_ashr_i32 s0, s0, s4
 ; GFX9-NEXT:    s_ashr_i32 s8, s8, s9
+; GFX9-NEXT:    s_ashr_i32 s0, s0, s4
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s8, s0
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s1
-; GFX9-NEXT:    s_sext_i32_i16 s8, s5
 ; GFX9-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX9-NEXT:    s_sext_i32_i16 s8, s5
 ; GFX9-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX9-NEXT:    s_ashr_i32 s1, s1, s5
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, s8
+; GFX9-NEXT:    s_ashr_i32 s1, s1, s5
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s4, s1
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s2
-; GFX9-NEXT:    s_sext_i32_i16 s5, s6
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX9-NEXT:    s_sext_i32_i16 s5, s6
 ; GFX9-NEXT:    s_ashr_i32 s6, s6, 16
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, s5
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s4, s2
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s3
-; GFX9-NEXT:    s_sext_i32_i16 s5, s7
 ; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
+; GFX9-NEXT:    s_sext_i32_i16 s5, s7
 ; GFX9-NEXT:    s_ashr_i32 s6, s7, 16
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, s5
 ; GFX9-NEXT:    s_ashr_i32 s3, s3, s6
@@ -1316,14 +1316,14 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX10-LABEL: s_ashr_v8i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_sext_i32_i16 s8, s0
-; GFX10-NEXT:    s_sext_i32_i16 s9, s4
 ; GFX10-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX10-NEXT:    s_sext_i32_i16 s9, s4
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX10-NEXT:    s_ashr_i32 s8, s8, s9
 ; GFX10-NEXT:    s_ashr_i32 s0, s0, s4
 ; GFX10-NEXT:    s_sext_i32_i16 s4, s1
-; GFX10-NEXT:    s_sext_i32_i16 s9, s5
 ; GFX10-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX10-NEXT:    s_sext_i32_i16 s9, s5
 ; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, s9
 ; GFX10-NEXT:    s_ashr_i32 s1, s1, s5
@@ -1335,8 +1335,8 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, s5
 ; GFX10-NEXT:    s_ashr_i32 s2, s2, s6
 ; GFX10-NEXT:    s_sext_i32_i16 s5, s3
-; GFX10-NEXT:    s_sext_i32_i16 s6, s7
 ; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
+; GFX10-NEXT:    s_sext_i32_i16 s6, s7
 ; GFX10-NEXT:    s_ashr_i32 s7, s7, 16
 ; GFX10-NEXT:    s_ashr_i32 s5, s5, s6
 ; GFX10-NEXT:    s_ashr_i32 s3, s3, s7

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
index 75af85813f2be..3e70221e1a690 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
@@ -92,8 +92,8 @@ define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) {
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_mov_b32 s0, 0x10203
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s0
 ; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s0
+; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -103,8 +103,8 @@ define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_mov_b32 s0, 0x10203
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s0
 ; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s0
+; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -285,9 +285,9 @@ define amdgpu_ps <2 x i64> @s_bswap_v2i64(<2 x i64> inreg %src) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s1
+; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s1
 ; GFX8-NEXT:    v_perm_b32 v2, 0, v2, s1
 ; GFX8-NEXT:    v_perm_b32 v3, 0, v3, s1
-; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s1
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
@@ -302,9 +302,9 @@ define amdgpu_ps <2 x i64> @s_bswap_v2i64(<2 x i64> inreg %src) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s1
+; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s1
 ; GFX9-NEXT:    v_perm_b32 v2, 0, v2, s1
 ; GFX9-NEXT:    v_perm_b32 v3, 0, v3, s1
-; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s1
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 2135d94bc0faf..26d2c8e07a28e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -181,8 +181,8 @@ define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; SI-NEXT:    v_bfe_u32 v1, v0, 8, 8
 ; SI-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, v3
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -205,12 +205,12 @@ define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
-; SI-NEXT:    v_bfe_u32 v2, v0, 16, 8
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v1
 ; SI-NEXT:    v_bfe_u32 v1, v0, 8, 8
-; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
+; SI-NEXT:    v_bfe_u32 v2, v0, 16, 8
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
+; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, v4
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -233,12 +233,12 @@ define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
-; SI-NEXT:    v_bfe_u32 v2, v0, 16, 8
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v1
 ; SI-NEXT:    v_bfe_u32 v1, v0, 8, 8
-; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
+; SI-NEXT:    v_bfe_u32 v2, v0, 16, 8
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
+; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, v4
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
index 57410918e0c2e..fa500054e058e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
@@ -41,10 +41,12 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(<4 x i128> addrspace(
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v18, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, v3
 ; GFX9-NEXT:    s_set_gpr_idx_off
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, v3
+; GFX9-NEXT:    s_set_gpr_idx_off
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v18
 ; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -128,8 +130,8 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr,
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
 ; GFX9-NEXT:    v_add_u32_e32 v17, 1, v16
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v17
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 6, v16
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 7, v16
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
@@ -187,8 +189,8 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr,
 ; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[3:4]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 1, v16
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v17
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 6, v16
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 7, v16
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
@@ -251,8 +253,8 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr,
 ; GFX7-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[8:11], 0 addr64 offset:16
 ; GFX7-NEXT:    v_add_i32_e32 v17, vcc, 1, v16
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v17
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 6, v16
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_cndmask_b32_e64 v10, v2, v4, s[4:5]
@@ -310,8 +312,8 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr,
 ; GFX10-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off offset:32
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v19, 1, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v19
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v15, v3, v5, vcc_lo
@@ -324,15 +326,15 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr,
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v15, v7, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v16, v8, vcc_lo
 ; GFX10-NEXT:    global_load_dwordx4 v[15:18], v[0:1], off offset:48
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 3, v19
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 4, v19
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v11, vcc_lo
@@ -350,8 +352,8 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr,
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v15, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v16, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 7, v19
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v17, vcc_lo
@@ -408,13 +410,13 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(<4 x i128> addrspace(
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v19
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s14
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v16, s15
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 3, v19
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v1, v7, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v17, v15, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v1, v7, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v18, v16, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v19
@@ -442,8 +444,8 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(<4 x i128> addrspace(
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s5
@@ -478,13 +480,13 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(<4 x i128> addrspace(
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v19
 ; GFX8-NEXT:    v_mov_b32_e32 v15, s14
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v16, s15
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 3, v19
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v1, v7, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v15, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v1, v7, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v16, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v19
@@ -512,8 +514,8 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(<4 x i128> addrspace(
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s5
@@ -548,13 +550,13 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(<4 x i128> addrspace(
 ; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v19
 ; GFX7-NEXT:    v_mov_b32_e32 v15, s14
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v16, s15
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 3, v19
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v1, v7, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v17, v15, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v1, v7, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v18, v16, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v19
@@ -621,12 +623,12 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(<4 x i128> addrspace(
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s16, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s17, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 7, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, s19, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, s18, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, s19, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s18, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s19, s0
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
index 3a88af6fb5dc2..1316c7f303dd4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
@@ -38,8 +38,8 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(<4 x i16> addrspace(1)*
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
 ; GFX9-NEXT:    s_and_b32 s1, s2, 1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
 ; GFX9-NEXT:    s_lshl_b32 s0, s1, 4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -51,8 +51,8 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(<4 x i16> addrspace(1)*
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
 ; GFX8-NEXT:    s_and_b32 s1, s2, 1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
 ; GFX8-NEXT:    s_lshl_b32 s0, s1, 4
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -64,8 +64,8 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(<4 x i16> addrspace(1)*
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX7-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
 ; GFX7-NEXT:    s_and_b32 s1, s2, 1
+; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
 ; GFX7-NEXT:    s_lshl_b32 s0, s1, 4
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -96,8 +96,8 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(<4 x i16> addrspace(1)* %ptr, i32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
@@ -109,8 +109,8 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(<4 x i16> addrspace(1)* %ptr, i32
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
@@ -122,8 +122,8 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(<4 x i16> addrspace(1)* %ptr, i32
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
@@ -153,8 +153,8 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_vgpr_idx(<4 x i16> addrspace(4)*
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v0
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
@@ -605,11 +605,11 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_vgpr_idx(<8 x i16> addrspace(4)*
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v0, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
index 2e108ee65a252..04f10b2a548c3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
@@ -12,8 +12,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(<4 x i8> addrspace(4)* in
 ; GCN-NEXT:    s_bfe_u32 s3, s0, 0x80008
 ; GCN-NEXT:    s_lshr_b32 s1, s0, 24
 ; GCN-NEXT:    s_and_b32 s2, s0, 0xff
-; GCN-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GCN-NEXT:    s_lshl_b32 s3, s3, 8
+; GCN-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GCN-NEXT:    s_or_b32 s2, s2, s3
 ; GCN-NEXT:    s_lshl_b32 s0, s0, 16
 ; GCN-NEXT:    s_or_b32 s0, s2, s0
@@ -52,8 +52,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(<4 x i8> addrspace(1)* %p
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    s_and_b32 s0, s2, 3
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -137,8 +137,8 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 %i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 8
-; GFX9-NEXT:    v_mov_b32_e32 v4, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 16
 ; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
@@ -226,8 +226,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(<4 x i8> addrspace(4)* in
 ; GFX9-NEXT:    s_bfe_u32 s3, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s1, s0, 24
 ; GFX9-NEXT:    s_and_b32 s2, s0, 0xff
-; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX9-NEXT:    s_or_b32 s2, s2, s3
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s2, s0
@@ -246,8 +246,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(<4 x i8> addrspace(4)* in
 ; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 24
 ; GFX8-NEXT:    s_and_b32 s2, s0, 0xff
-; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX8-NEXT:    s_or_b32 s2, s2, s3
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_or_b32 s0, s2, s0
@@ -266,8 +266,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(<4 x i8> addrspace(4)* in
 ; GFX7-NEXT:    s_bfe_u32 s3, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
 ; GFX7-NEXT:    s_and_b32 s2, s0, 0xff
-; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX7-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX7-NEXT:    s_or_b32 s2, s2, s3
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX7-NEXT:    s_or_b32 s0, s2, s0
@@ -284,14 +284,14 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(<4 x i8> addrspace(4)* in
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s2, s0, 0x80008
-; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_and_b32 s1, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX10-NEXT:    s_or_b32 s1, s1, s2
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s1, s1, s3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s0, s1, s0
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, s0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
@@ -323,14 +323,14 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx0(<4 x i8> addrspace(4)* inreg
 ; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s2, s0, 0x80008
-; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_and_b32 s1, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX10-NEXT:    s_or_b32 s1, s1, s2
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s1, s1, s3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s0, s1, s0
 ; GFX10-NEXT:    ; return to shader part epilog
   %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
@@ -361,14 +361,14 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx1(<4 x i8> addrspace(4)* inreg
 ; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s2, s0, 0x80008
-; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_and_b32 s1, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX10-NEXT:    s_or_b32 s1, s1, s2
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s1, s1, s3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s0, s1, s0
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 8
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -400,14 +400,14 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx2(<4 x i8> addrspace(4)* inreg
 ; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s2, s0, 0x80008
-; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_and_b32 s1, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX10-NEXT:    s_or_b32 s1, s1, s2
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s1, s1, s3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s0, s1, s0
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -439,14 +439,14 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx3(<4 x i8> addrspace(4)* inreg
 ; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s2, s0, 0x80008
-; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_and_b32 s1, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX10-NEXT:    s_or_b32 s1, s1, s2
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s1, s1, s3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s0, s1, s0
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -461,8 +461,8 @@ define i8 @extractelement_vgpr_v4i8_idx0(<4 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -534,8 +534,8 @@ define i8 @extractelement_vgpr_v4i8_idx1(<4 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 8
-; GFX9-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -688,8 +688,8 @@ define i8 @extractelement_vgpr_v4i8_idx3(<4 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -780,8 +780,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(<8 x i8> addrspace(4)* in
 ; GCN-NEXT:    s_and_b32 s2, s1, s5
 ; GCN-NEXT:    s_bfe_u32 s5, s1, s7
 ; GCN-NEXT:    s_lshr_b32 s3, s1, 24
-; GCN-NEXT:    s_bfe_u32 s1, s1, s8
 ; GCN-NEXT:    s_lshl_b32 s5, s5, 8
+; GCN-NEXT:    s_bfe_u32 s1, s1, s8
 ; GCN-NEXT:    s_or_b32 s2, s2, s5
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s1, s2, s1
@@ -814,12 +814,12 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(<8 x i8> addrspace(4)* in
 ; GFX10-NEXT:    s_lshl_b32 s5, s10, 8
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX10-NEXT:    s_or_b32 s5, s9, s5
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX10-NEXT:    s_or_b32 s5, s9, s5
 ; GFX10-NEXT:    s_or_b32 s2, s2, s3
 ; GFX10-NEXT:    s_lshl_b32 s7, s7, 24
-; GFX10-NEXT:    s_or_b32 s0, s5, s0
 ; GFX10-NEXT:    s_lshl_b32 s8, s8, 24
+; GFX10-NEXT:    s_or_b32 s0, s5, s0
 ; GFX10-NEXT:    s_or_b32 s1, s2, s1
 ; GFX10-NEXT:    s_or_b32 s0, s0, s7
 ; GFX10-NEXT:    s_or_b32 s1, s1, s8
@@ -876,14 +876,14 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -913,13 +913,13 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_or_b32_e32 v5, v6, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -938,8 +938,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s3, v3
@@ -947,8 +947,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p
 ; GFX10-NEXT:    v_and_or_b32 v1, v1, s3, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX10-NEXT:    s_lshr_b32 s0, s2, 2
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v6, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
 ; GFX10-NEXT:    v_or3_b32 v1, v1, v7, v3
 ; GFX10-NEXT:    s_and_b32 s0, s2, 3
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 3
@@ -1003,14 +1003,14 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v7
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -1041,13 +1041,13 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-NEXT:    v_or_b32_e32 v6, v6, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v6, v6, v7
 ; GFX7-NEXT:    v_or_b32_e32 v7, v8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v0, v6, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v6, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v7, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
@@ -1073,8 +1073,8 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s6, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
 ; GFX10-NEXT:    v_and_or_b32 v1, v1, s6, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 2, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v7, v3
@@ -1096,7 +1096,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* in
 ; GCN-NEXT:    s_mov_b32 s6, 0x80008
 ; GCN-NEXT:    s_movk_i32 s4, 0xff
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
-; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_bfe_u32 s7, s0, s6
 ; GCN-NEXT:    s_and_b32 s5, s0, s4
@@ -1112,8 +1112,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* in
 ; GCN-NEXT:    s_and_b32 s2, s1, s4
 ; GCN-NEXT:    s_bfe_u32 s4, s1, s6
 ; GCN-NEXT:    s_lshr_b32 s3, s1, 24
-; GCN-NEXT:    s_bfe_u32 s1, s1, s7
 ; GCN-NEXT:    s_lshl_b32 s4, s4, 8
+; GCN-NEXT:    s_bfe_u32 s1, s1, s7
 ; GCN-NEXT:    s_or_b32 s2, s2, s4
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s1, s2, s1
@@ -1121,7 +1121,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* in
 ; GCN-NEXT:    s_or_b32 s1, s1, s2
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
@@ -1141,8 +1141,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* in
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s8, s0, s3
 ; GFX10-NEXT:    s_bfe_u32 s3, s1, s3
-; GFX10-NEXT:    s_and_b32 s7, s0, s2
 ; GFX10-NEXT:    s_lshr_b32 s6, s1, 24
+; GFX10-NEXT:    s_and_b32 s7, s0, s2
 ; GFX10-NEXT:    s_and_b32 s2, s1, s2
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, s4
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
@@ -1191,14 +1191,14 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx0(<8 x i8> addrspace(4)* inreg
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s2, s0, 0x80008
-; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_and_b32 s1, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX10-NEXT:    s_or_b32 s1, s1, s2
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s1, s1, s3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s0, s1, s0
 ; GFX10-NEXT:    ; return to shader part epilog
   %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
@@ -1229,14 +1229,14 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx1(<8 x i8> addrspace(4)* inreg
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s2, s0, 0x80008
-; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_and_b32 s1, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX10-NEXT:    s_or_b32 s1, s1, s2
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s1, s1, s3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s0, s1, s0
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 8
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -1268,14 +1268,14 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx2(<8 x i8> addrspace(4)* inreg
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s2, s0, 0x80008
-; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_and_b32 s1, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX10-NEXT:    s_or_b32 s1, s1, s2
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s1, s1, s3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s0, s1, s0
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -1307,14 +1307,14 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx3(<8 x i8> addrspace(4)* inreg
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s2, s0, 0x80008
-; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_and_b32 s1, s0, 0xff
+; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX10-NEXT:    s_or_b32 s1, s1, s2
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s1, s1, s3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX10-NEXT:    s_or_b32 s0, s1, s0
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -1345,14 +1345,14 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx4(<8 x i8> addrspace(4)* inreg
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s2, s1, 0x80008
-; GFX10-NEXT:    s_bfe_u32 s3, s1, 0x80010
 ; GFX10-NEXT:    s_and_b32 s0, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s3, s1, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX10-NEXT:    s_or_b32 s0, s0, s2
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX10-NEXT:    s_or_b32 s0, s0, s3
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    ; return to shader part epilog
   %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
@@ -1383,14 +1383,14 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx5(<8 x i8> addrspace(4)* inreg
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s2, s1, 0x80008
-; GFX10-NEXT:    s_bfe_u32 s3, s1, 0x80010
 ; GFX10-NEXT:    s_and_b32 s0, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s3, s1, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX10-NEXT:    s_or_b32 s0, s0, s2
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX10-NEXT:    s_or_b32 s0, s0, s3
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 8
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -1422,14 +1422,14 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx6(<8 x i8> addrspace(4)* inreg
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s2, s1, 0x80008
-; GFX10-NEXT:    s_bfe_u32 s3, s1, 0x80010
 ; GFX10-NEXT:    s_and_b32 s0, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s3, s1, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX10-NEXT:    s_or_b32 s0, s0, s2
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX10-NEXT:    s_or_b32 s0, s0, s3
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -1461,14 +1461,14 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx7(<8 x i8> addrspace(4)* inreg
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s2, s1, 0x80008
-; GFX10-NEXT:    s_bfe_u32 s3, s1, 0x80010
 ; GFX10-NEXT:    s_and_b32 s0, s1, 0xff
+; GFX10-NEXT:    s_bfe_u32 s3, s1, 0x80010
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX10-NEXT:    s_or_b32 s0, s0, s2
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX10-NEXT:    s_or_b32 s0, s0, s3
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -1483,9 +1483,9 @@ define i8 @extractelement_vgpr_v8i8_idx0(<8 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -1556,9 +1556,9 @@ define i8 @extractelement_vgpr_v8i8_idx1(<8 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 8
-; GFX9-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -1710,9 +1710,9 @@ define i8 @extractelement_vgpr_v8i8_idx3(<8 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -1787,11 +1787,11 @@ define i8 @extractelement_vgpr_v8i8_idx4(<8 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, v0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
@@ -1860,11 +1860,11 @@ define i8 @extractelement_vgpr_v8i8_idx5(<8 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 8
-; GFX9-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, v0, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
@@ -1940,8 +1940,8 @@ define i8 @extractelement_vgpr_v8i8_idx6(<8 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_mov_b32 s4, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, v0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
@@ -2014,11 +2014,11 @@ define i8 @extractelement_vgpr_v8i8_idx7(<8 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, v0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
@@ -2101,33 +2101,33 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)*
 ; GCN-NEXT:    s_bfe_u32 s0, s0, s12
 ; GCN-NEXT:    s_lshl_b32 s0, s0, 16
 ; GCN-NEXT:    s_or_b32 s0, s10, s0
-; GCN-NEXT:    s_bfe_u32 s10, s1, s11
 ; GCN-NEXT:    s_lshl_b32 s5, s5, 24
-; GCN-NEXT:    s_or_b32 s0, s0, s5
+; GCN-NEXT:    s_bfe_u32 s10, s1, s11
 ; GCN-NEXT:    s_lshr_b32 s6, s1, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s5
 ; GCN-NEXT:    s_and_b32 s5, s1, s9
-; GCN-NEXT:    s_bfe_u32 s1, s1, s12
 ; GCN-NEXT:    s_lshl_b32 s10, s10, 8
+; GCN-NEXT:    s_bfe_u32 s1, s1, s12
 ; GCN-NEXT:    s_or_b32 s5, s5, s10
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s1, s5, s1
 ; GCN-NEXT:    s_lshl_b32 s5, s6, 24
 ; GCN-NEXT:    s_bfe_u32 s6, s2, s11
-; GCN-NEXT:    s_or_b32 s1, s1, s5
 ; GCN-NEXT:    s_lshr_b32 s7, s2, 24
+; GCN-NEXT:    s_or_b32 s1, s1, s5
 ; GCN-NEXT:    s_and_b32 s5, s2, s9
-; GCN-NEXT:    s_bfe_u32 s2, s2, s12
 ; GCN-NEXT:    s_lshl_b32 s6, s6, 8
+; GCN-NEXT:    s_bfe_u32 s2, s2, s12
 ; GCN-NEXT:    s_or_b32 s5, s5, s6
 ; GCN-NEXT:    s_lshl_b32 s2, s2, 16
-; GCN-NEXT:    s_bfe_u32 s6, s3, s11
 ; GCN-NEXT:    s_or_b32 s2, s5, s2
 ; GCN-NEXT:    s_lshl_b32 s5, s7, 24
-; GCN-NEXT:    s_or_b32 s2, s2, s5
+; GCN-NEXT:    s_bfe_u32 s6, s3, s11
 ; GCN-NEXT:    s_lshr_b32 s8, s3, 24
+; GCN-NEXT:    s_or_b32 s2, s2, s5
 ; GCN-NEXT:    s_and_b32 s5, s3, s9
-; GCN-NEXT:    s_bfe_u32 s3, s3, s12
 ; GCN-NEXT:    s_lshl_b32 s6, s6, 8
+; GCN-NEXT:    s_bfe_u32 s3, s3, s12
 ; GCN-NEXT:    s_or_b32 s5, s5, s6
 ; GCN-NEXT:    s_lshl_b32 s3, s3, 16
 ; GCN-NEXT:    s_or_b32 s3, s5, s3
@@ -2159,36 +2159,36 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)*
 ; GFX10-NEXT:    s_lshl_b32 s13, s13, 8
 ; GFX10-NEXT:    s_bfe_u32 s15, s1, s6
 ; GFX10-NEXT:    s_bfe_u32 s17, s2, s6
-; GFX10-NEXT:    s_bfe_u32 s6, s3, s6
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX10-NEXT:    s_or_b32 s12, s12, s13
+; GFX10-NEXT:    s_bfe_u32 s6, s3, s6
 ; GFX10-NEXT:    s_lshr_b32 s9, s1, 24
+; GFX10-NEXT:    s_lshr_b32 s10, s2, 24
+; GFX10-NEXT:    s_lshr_b32 s11, s3, 24
 ; GFX10-NEXT:    s_and_b32 s14, s1, s5
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, s7
 ; GFX10-NEXT:    s_and_b32 s16, s2, s5
-; GFX10-NEXT:    s_lshr_b32 s10, s2, 24
-; GFX10-NEXT:    s_bfe_u32 s2, s2, s7
+; GFX10-NEXT:    s_lshl_b32 s8, s8, 24
 ; GFX10-NEXT:    s_lshl_b32 s15, s15, 8
-; GFX10-NEXT:    s_lshr_b32 s11, s3, 24
+; GFX10-NEXT:    s_lshl_b32 s17, s17, 8
+; GFX10-NEXT:    s_or_b32 s0, s12, s0
+; GFX10-NEXT:    s_bfe_u32 s2, s2, s7
 ; GFX10-NEXT:    s_and_b32 s5, s3, s5
-; GFX10-NEXT:    s_bfe_u32 s3, s3, s7
 ; GFX10-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 24
-; GFX10-NEXT:    s_or_b32 s0, s12, s0
-; GFX10-NEXT:    s_lshl_b32 s17, s17, 8
-; GFX10-NEXT:    s_or_b32 s0, s0, s8
-; GFX10-NEXT:    s_or_b32 s5, s5, s6
-; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX10-NEXT:    s_bfe_u32 s3, s3, s7
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX10-NEXT:    s_or_b32 s13, s14, s15
+; GFX10-NEXT:    s_or_b32 s0, s0, s8
 ; GFX10-NEXT:    s_or_b32 s8, s16, s17
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX10-NEXT:    s_or_b32 s3, s5, s3
+; GFX10-NEXT:    s_or_b32 s5, s5, s6
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX10-NEXT:    s_lshl_b32 s9, s9, 24
+; GFX10-NEXT:    s_or_b32 s1, s13, s1
 ; GFX10-NEXT:    s_or_b32 s2, s8, s2
 ; GFX10-NEXT:    s_lshl_b32 s8, s10, 24
+; GFX10-NEXT:    s_or_b32 s3, s5, s3
 ; GFX10-NEXT:    s_lshl_b32 s5, s11, 24
-; GFX10-NEXT:    s_lshl_b32 s9, s9, 24
-; GFX10-NEXT:    s_or_b32 s1, s13, s1
 ; GFX10-NEXT:    s_lshr_b32 s6, s4, 2
 ; GFX10-NEXT:    s_or_b32 s1, s1, s9
 ; GFX10-NEXT:    s_or_b32 s2, s2, s8
@@ -2214,11 +2214,11 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)*
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    s_mov_b32 s1, 16
-; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    s_movk_i32 s3, 0xff
-; GFX9-NEXT:    s_lshr_b32 s4, s2, 2
-; GFX9-NEXT:    v_mov_b32_e32 v6, 16
+; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v6, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX9-NEXT:    s_and_b32 s2, s2, 3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -2234,8 +2234,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)*
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v13
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_and_or_b32 v2, v2, v4, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
@@ -2244,11 +2244,11 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)*
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v3, v3, v4, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_or3_b32 v2, v2, v16, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_or3_b32 v3, v3, v6, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    s_lshl_b32 s0, s2, 3
@@ -2270,35 +2270,35 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)*
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v13
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v10
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v15
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v11
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 2
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    s_lshl_b32 s0, s1, 3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
@@ -2321,45 +2321,45 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)*
 ; GFX7-NEXT:    v_bfe_u32 v12, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
-; GFX7-NEXT:    v_bfe_u32 v14, v2, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v9, s0, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_and_b32_e32 v11, s0, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v14, v2, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
 ; GFX7-NEXT:    v_and_b32_e32 v13, v2, v4
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
 ; GFX7-NEXT:    v_bfe_u32 v15, v3, 8, 8
-; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
 ; GFX7-NEXT:    v_and_b32_e32 v4, v3, v4
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX7-NEXT:    v_or_b32_e32 v1, v10, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
 ; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
+; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v10, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX7-NEXT:    v_or_b32_e32 v2, v11, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v15
+; GFX7-NEXT:    v_or_b32_e32 v2, v11, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 2
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 3
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX7-NEXT:    s_lshl_b32 s0, s2, 3
@@ -2374,8 +2374,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)*
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX10-NEXT:    s_mov_b32 s1, 16
 ; GFX10-NEXT:    s_movk_i32 s3, 0xff
-; GFX10-NEXT:    v_mov_b32_e32 v6, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v6, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
@@ -2398,8 +2398,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)*
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v11, v7
 ; GFX10-NEXT:    v_or3_b32 v1, v1, v13, v8
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX10-NEXT:    v_and_or_b32 v4, v3, v4, v5
 ; GFX10-NEXT:    v_or3_b32 v2, v2, v15, v9
+; GFX10-NEXT:    v_and_or_b32 v4, v3, v4, v5
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v10
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -2425,11 +2425,11 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 8
 ; GFX9-NEXT:    s_mov_b32 s5, 16
-; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    s_movk_i32 s6, 0xff
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 2, v2
-; GFX9-NEXT:    v_mov_b32_e32 v7, 16
+; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v7, 16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 2, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
 ; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -2438,26 +2438,26 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v6
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v6
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v13
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v5, v5, v0, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
 ; GFX9-NEXT:    v_and_or_b32 v0, v6, v0, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v12
 ; GFX9-NEXT:    v_or3_b32 v3, v3, v14, v9
 ; GFX9-NEXT:    v_or3_b32 v4, v4, v16, v10
+; GFX9-NEXT:    v_or3_b32 v5, v5, v18, v11
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v7, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_or3_b32 v5, v5, v18, v11
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
@@ -2481,34 +2481,34 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v3
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v11
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v11
+; GFX8-NEXT:    v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v15
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v6
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
 ; GFX8-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v1, v5, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v10
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 24, v13
 ; GFX8-NEXT:    v_or_b32_e32 v5, v6, v8
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_or_b32_e32 v4, v5, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
@@ -2532,45 +2532,45 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32
 ; GFX7-NEXT:    v_bfe_u32 v13, v4, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX7-NEXT:    v_bfe_u32 v15, v5, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v10, s4, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_and_b32_e32 v12, s4, v4
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v15, v5, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v5
 ; GFX7-NEXT:    v_and_b32_e32 v14, v5, v0
 ; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
 ; GFX7-NEXT:    v_bfe_u32 v16, v6, 8, 8
-; GFX7-NEXT:    v_or_b32_e32 v10, v10, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_or_b32_e32 v11, v12, v13
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
+; GFX7-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX7-NEXT:    v_or_b32_e32 v11, v12, v13
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
 ; GFX7-NEXT:    v_and_b32_e32 v0, v6, v0
 ; GFX7-NEXT:    v_bfe_u32 v6, v6, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX7-NEXT:    v_or_b32_e32 v3, v10, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
 ; GFX7-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX7-NEXT:    v_or_b32_e32 v3, v10, v3
 ; GFX7-NEXT:    v_or_b32_e32 v4, v11, v4
-; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX7-NEXT:    v_or_b32_e32 v3, v4, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX7-NEXT:    v_or_b32_e32 v5, v12, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v16
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    v_or_b32_e32 v5, v12, v5
+; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT:    v_or_b32_e32 v3, v4, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
 ; GFX7-NEXT:    v_or_b32_e32 v4, v5, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v17
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v17
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
@@ -2586,9 +2586,9 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-NEXT:    s_mov_b32 s5, 16
 ; GFX10-NEXT:    s_movk_i32 s6, 0xff
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 2, v2
-; GFX10-NEXT:    v_mov_b32_e32 v7, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v7, 16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 2, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -2604,15 +2604,15 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_and_or_b32 v4, v4, s6, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v18, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v6
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v18, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_and_or_b32 v5, v5, v0, v17
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
 ; GFX10-NEXT:    v_or3_b32 v3, v3, v14, v9
 ; GFX10-NEXT:    v_or3_b32 v4, v4, v16, v10
-; GFX10-NEXT:    v_and_or_b32 v0, v6, v0, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v0, v6, v0, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 24, v12
 ; GFX10-NEXT:    v_or3_b32 v5, v5, v18, v11
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
@@ -2647,33 +2647,33 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)*
 ; GCN-NEXT:    s_bfe_u32 s0, s0, s11
 ; GCN-NEXT:    s_lshl_b32 s0, s0, 16
 ; GCN-NEXT:    s_or_b32 s0, s9, s0
-; GCN-NEXT:    s_bfe_u32 s9, s1, s10
 ; GCN-NEXT:    s_lshl_b32 s4, s4, 24
-; GCN-NEXT:    s_or_b32 s0, s0, s4
+; GCN-NEXT:    s_bfe_u32 s9, s1, s10
 ; GCN-NEXT:    s_lshr_b32 s5, s1, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s4
 ; GCN-NEXT:    s_and_b32 s4, s1, s8
-; GCN-NEXT:    s_bfe_u32 s1, s1, s11
 ; GCN-NEXT:    s_lshl_b32 s9, s9, 8
+; GCN-NEXT:    s_bfe_u32 s1, s1, s11
 ; GCN-NEXT:    s_or_b32 s4, s4, s9
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s1, s4, s1
 ; GCN-NEXT:    s_lshl_b32 s4, s5, 24
 ; GCN-NEXT:    s_bfe_u32 s5, s2, s10
-; GCN-NEXT:    s_or_b32 s1, s1, s4
 ; GCN-NEXT:    s_lshr_b32 s6, s2, 24
+; GCN-NEXT:    s_or_b32 s1, s1, s4
 ; GCN-NEXT:    s_and_b32 s4, s2, s8
-; GCN-NEXT:    s_bfe_u32 s2, s2, s11
 ; GCN-NEXT:    s_lshl_b32 s5, s5, 8
+; GCN-NEXT:    s_bfe_u32 s2, s2, s11
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
 ; GCN-NEXT:    s_lshl_b32 s2, s2, 16
-; GCN-NEXT:    s_bfe_u32 s5, s3, s10
 ; GCN-NEXT:    s_or_b32 s2, s4, s2
 ; GCN-NEXT:    s_lshl_b32 s4, s6, 24
-; GCN-NEXT:    s_or_b32 s2, s2, s4
+; GCN-NEXT:    s_bfe_u32 s5, s3, s10
 ; GCN-NEXT:    s_lshr_b32 s7, s3, 24
+; GCN-NEXT:    s_or_b32 s2, s2, s4
 ; GCN-NEXT:    s_and_b32 s4, s3, s8
-; GCN-NEXT:    s_bfe_u32 s3, s3, s11
 ; GCN-NEXT:    s_lshl_b32 s5, s5, 8
+; GCN-NEXT:    s_bfe_u32 s3, s3, s11
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
 ; GCN-NEXT:    s_lshl_b32 s3, s3, 16
 ; GCN-NEXT:    s_or_b32 s3, s4, s3
@@ -2681,13 +2681,13 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)*
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    s_or_b32 s3, s3, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
@@ -2708,13 +2708,13 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)*
 ; GFX10-NEXT:    s_bfe_u32 s12, s0, s5
 ; GFX10-NEXT:    s_bfe_u32 s14, s1, s5
 ; GFX10-NEXT:    s_lshr_b32 s8, s1, 24
+; GFX10-NEXT:    s_and_b32 s11, s0, s4
 ; GFX10-NEXT:    s_and_b32 s13, s1, s4
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, s6
-; GFX10-NEXT:    s_and_b32 s11, s0, s4
 ; GFX10-NEXT:    s_lshl_b32 s12, s12, 8
 ; GFX10-NEXT:    s_lshl_b32 s14, s14, 8
-; GFX10-NEXT:    s_or_b32 s11, s11, s12
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX10-NEXT:    s_or_b32 s11, s11, s12
 ; GFX10-NEXT:    s_or_b32 s12, s13, s14
 ; GFX10-NEXT:    s_lshl_b32 s8, s8, 24
 ; GFX10-NEXT:    s_or_b32 s1, s12, s1
@@ -2726,10 +2726,10 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)*
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-NEXT:    s_lshl_b32 s7, s7, 24
 ; GFX10-NEXT:    s_or_b32 s0, s11, s0
-; GFX10-NEXT:    s_and_b32 s15, s2, s4
 ; GFX10-NEXT:    s_lshr_b32 s9, s2, 24
-; GFX10-NEXT:    s_bfe_u32 s2, s2, s6
+; GFX10-NEXT:    s_and_b32 s15, s2, s4
 ; GFX10-NEXT:    s_lshl_b32 s16, s16, 8
+; GFX10-NEXT:    s_bfe_u32 s2, s2, s6
 ; GFX10-NEXT:    s_or_b32 s0, s0, s7
 ; GFX10-NEXT:    s_or_b32 s7, s15, s16
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
@@ -2738,9 +2738,9 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)*
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v1
 ; GFX10-NEXT:    s_or_b32 s2, s7, s2
 ; GFX10-NEXT:    s_lshl_b32 s7, s9, 24
-; GFX10-NEXT:    s_bfe_u32 s1, s3, s6
 ; GFX10-NEXT:    s_and_b32 s4, s3, s4
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX10-NEXT:    s_bfe_u32 s1, s3, s6
 ; GFX10-NEXT:    s_or_b32 s2, s2, s7
 ; GFX10-NEXT:    s_lshr_b32 s10, s3, 24
 ; GFX10-NEXT:    s_or_b32 s3, s4, s5
@@ -2766,8 +2766,8 @@ define i8 @extractelement_vgpr_v16i8_idx0(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -2839,8 +2839,8 @@ define i8 @extractelement_vgpr_v16i8_idx1(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -2993,8 +2993,8 @@ define i8 @extractelement_vgpr_v16i8_idx3(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -3070,10 +3070,10 @@ define i8 @extractelement_vgpr_v16i8_idx4(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, v0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
@@ -3143,10 +3143,10 @@ define i8 @extractelement_vgpr_v16i8_idx5(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, v0, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
@@ -3222,8 +3222,8 @@ define i8 @extractelement_vgpr_v16i8_idx6(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX9-NEXT:    s_mov_b32 s4, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, v0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
@@ -3297,10 +3297,10 @@ define i8 @extractelement_vgpr_v16i8_idx7(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, v0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
@@ -3374,12 +3374,12 @@ define i8 @extractelement_vgpr_v16i8_idx8(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
-; GFX9-NEXT:    v_and_or_b32 v0, v2, v0, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v2, v0, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3447,10 +3447,10 @@ define i8 @extractelement_vgpr_v16i8_idx9(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v0, v2, v0, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
@@ -3526,10 +3526,10 @@ define i8 @extractelement_vgpr_v16i8_idx10(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    s_mov_b32 s4, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
-; GFX9-NEXT:    v_and_or_b32 v0, v2, v0, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v2, v0, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v4, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -3601,12 +3601,12 @@ define i8 @extractelement_vgpr_v16i8_idx11(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
-; GFX9-NEXT:    v_and_or_b32 v0, v2, v0, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v2, v0, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
@@ -3678,12 +3678,12 @@ define i8 @extractelement_vgpr_v16i8_idx12(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
-; GFX9-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
-; GFX9-NEXT:    v_and_or_b32 v0, v3, v0, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v3, v0, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3751,8 +3751,8 @@ define i8 @extractelement_vgpr_v16i8_idx13(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -3830,10 +3830,10 @@ define i8 @extractelement_vgpr_v16i8_idx14(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    s_mov_b32 s4, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
-; GFX9-NEXT:    v_and_or_b32 v0, v3, v0, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v3, v0, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v4, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -3905,12 +3905,12 @@ define i8 @extractelement_vgpr_v16i8_idx15(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
-; GFX9-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
-; GFX9-NEXT:    v_and_or_b32 v0, v3, v0, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v3, v0, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 548debc54788b..614236a115870 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -8,22 +8,22 @@ define float @dyn_extract_v8f32_const_s_v(i32 %sel) {
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 1.0, 2.0, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 1.0, 2.0, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 4.0, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x40a00000
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 4.0, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0x40c00000
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0x40e00000
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v5, 0x41000000
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v5, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -105,23 +105,23 @@ define amdgpu_ps float @dyn_extract_v8f32_s_v(<8 x float> inreg %vec, i32 %sel)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    s_mov_b32 s3, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v3, s2
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v5, s6
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v6, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v7, s8
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v8, s9
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v8, vcc
 ; GCN-NEXT:    ; return to shader part epilog
@@ -291,21 +291,21 @@ define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b64 s[4:5], 1
 ; GCN-NEXT:    s_mov_b64 s[6:7], 2
-; GCN-NEXT:    s_mov_b64 s[8:9], 3
 ; GCN-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NEXT:    v_mov_b32_e32 v3, s6
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v4, s7
-; GCN-NEXT:    s_mov_b64 s[10:11], 4
+; GCN-NEXT:    s_mov_b64 s[8:9], 3
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN-NEXT:    v_mov_b32_e32 v6, s9
+; GCN-NEXT:    s_mov_b64 s[10:11], 4
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
-; GCN-NEXT:    s_mov_b64 s[12:13], 5
 ; GCN-NEXT:    v_mov_b32_e32 v7, s10
 ; GCN-NEXT:    v_mov_b32_e32 v8, s11
+; GCN-NEXT:    s_mov_b64 s[12:13], 5
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
@@ -446,8 +446,8 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) {
 ; GPRIDX-NEXT:    v_mov_b32_e32 v1, s0
 ; GPRIDX-NEXT:    v_mov_b32_e32 v2, s1
 ; GPRIDX-NEXT:    v_mov_b32_e32 v3, s2
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GPRIDX-NEXT:    v_mov_b32_e32 v4, s3
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GPRIDX-NEXT:    s_mov_b32 s6, s8
 ; GPRIDX-NEXT:    s_mov_b32 s7, s9
 ; GPRIDX-NEXT:    v_mov_b32_e32 v5, s4
@@ -500,8 +500,8 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) {
 ; MOVREL-NEXT:    v_mov_b32_e32 v1, s0
 ; MOVREL-NEXT:    v_mov_b32_e32 v2, s1
 ; MOVREL-NEXT:    v_mov_b32_e32 v3, s2
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; MOVREL-NEXT:    v_mov_b32_e32 v4, s3
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; MOVREL-NEXT:    s_mov_b32 s6, s8
 ; MOVREL-NEXT:    s_mov_b32 s7, s9
 ; MOVREL-NEXT:    v_mov_b32_e32 v5, s4
@@ -545,11 +545,11 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) {
 ;
 ; GFX10-LABEL: dyn_extract_v8i64_s_v:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_mov_b32 s19, s5
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    v_mov_b32_e32 v2, s19
+; GFX10-NEXT:    s_mov_b32 s19, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    v_mov_b32_e32 v2, s19
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s4, s6
@@ -1936,17 +1936,17 @@ define amdgpu_ps float @dyn_extract_v6f32_s_v(<6 x float> inreg %vec, i32 %sel)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v4, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v5, s6
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v6, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v6, vcc
 ; GCN-NEXT:    ; return to shader part epilog
@@ -2089,20 +2089,20 @@ define amdgpu_ps float @dyn_extract_v7f32_s_v(<7 x float> inreg %vec, i32 %sel)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v3, s2
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v4, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v5, s6
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v6, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v7, s8
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v7, vcc
 ; GCN-NEXT:    ; return to shader part epilog
@@ -2263,8 +2263,8 @@ define amdgpu_ps double @dyn_extract_v6f64_s_v(<6 x double> inreg %vec, i32 %sel
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    s_mov_b32 s6, s8
 ; GCN-NEXT:    s_mov_b32 s7, s9
 ; GCN-NEXT:    v_mov_b32_e32 v5, s4
@@ -2295,11 +2295,11 @@ define amdgpu_ps double @dyn_extract_v6f64_s_v(<6 x double> inreg %vec, i32 %sel
 ;
 ; GFX10-LABEL: dyn_extract_v6f64_s_v:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_mov_b32 s15, s5
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    v_mov_b32_e32 v2, s15
+; GFX10-NEXT:    s_mov_b32 s15, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    v_mov_b32_e32 v2, s15
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s4, s6
@@ -2465,8 +2465,8 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    s_mov_b32 s6, s8
 ; GCN-NEXT:    s_mov_b32 s7, s9
 ; GCN-NEXT:    v_mov_b32_e32 v5, s4
@@ -2504,11 +2504,11 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel
 ;
 ; GFX10-LABEL: dyn_extract_v7f64_s_v:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_mov_b32 s19, s5
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    v_mov_b32_e32 v2, s19
+; GFX10-NEXT:    s_mov_b32 s19, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    v_mov_b32_e32 v2, s19
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s4, s6
@@ -2963,43 +2963,43 @@ define float @dyn_extract_v15f32_const_s_v(i32 %sel) {
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v13, 1.0, 2.0, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT:    v_cndmask_b32_e64 v13, 1.0, 2.0, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 4.0, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x40a00000
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 4.0, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0x40c00000
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0x40e00000
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v5, 0x41000000
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v6, 0x41100000
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v7, 0x41200000
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0x41300000
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v9, 0x41400000
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v10, 0x41500000
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0x41600000
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v12, 0x41700000
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v12, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -3101,51 +3101,51 @@ define amdgpu_ps float @dyn_extract_v15f32_s_v(<15 x float> inreg %vec, i32 %sel
 ; GCN-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    s_mov_b32 s3, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v3, s2
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
 ; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
 ; GCN-NEXT:    s_mov_b32 s5, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v5, s4
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
 ; GCN-NEXT:    s_mov_b32 s6, s8
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v6, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
 ; GCN-NEXT:    s_mov_b32 s7, s9
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v7, s6
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
 ; GCN-NEXT:    s_mov_b32 s8, s10
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v8, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
 ; GCN-NEXT:    s_mov_b32 s9, s11
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v9, s8
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v0
 ; GCN-NEXT:    s_mov_b32 s10, s12
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v10, s9
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v11, s10
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v12, s13
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v13, s14
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v14, s15
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v15, s16
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v15, vcc
 ; GCN-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
index aa6a244a1254b..110d9c6c7702d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
@@ -632,8 +632,8 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v6, v5
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v4, -v5, v6, 1.0
@@ -657,16 +657,16 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v8, v4
 ; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v9, v5
 ; GFX89-IEEE-NEXT:    v_fma_f32 v10, -v4, v8, 1.0
-; GFX89-IEEE-NEXT:    v_fma_f32 v11, -v5, v9, 1.0
 ; GFX89-IEEE-NEXT:    v_fma_f32 v8, v10, v8, v8
+; GFX89-IEEE-NEXT:    v_fma_f32 v11, -v5, v9, 1.0
 ; GFX89-IEEE-NEXT:    v_fma_f32 v9, v11, v9, v9
 ; GFX89-IEEE-NEXT:    v_mul_f32_e32 v10, v6, v8
-; GFX89-IEEE-NEXT:    v_fma_f32 v12, -v4, v10, v6
 ; GFX89-IEEE-NEXT:    v_mul_f32_e32 v11, v7, v9
+; GFX89-IEEE-NEXT:    v_fma_f32 v12, -v4, v10, v6
 ; GFX89-IEEE-NEXT:    v_fma_f32 v13, -v5, v11, v7
 ; GFX89-IEEE-NEXT:    v_fma_f32 v10, v12, v8, v10
-; GFX89-IEEE-NEXT:    v_fma_f32 v4, -v4, v10, v6
 ; GFX89-IEEE-NEXT:    v_fma_f32 v11, v13, v9, v11
+; GFX89-IEEE-NEXT:    v_fma_f32 v4, -v4, v10, v6
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v8, v10
 ; GFX89-IEEE-NEXT:    v_fma_f32 v5, -v5, v11, v7
 ; GFX89-IEEE-NEXT:    s_mov_b64 vcc, s[4:5]
@@ -689,11 +689,11 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX89-FLUSH-NEXT:    v_fma_f32 v7, v8, v6, v7
 ; GFX89-FLUSH-NEXT:    v_fma_f32 v4, -v4, v7, v5
 ; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
+; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v6, vcc, v1, v3, v1
-; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
 ; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v7, v5
+; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
 ; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX89-FLUSH-NEXT:    v_fma_f32 v2, -v5, v7, 1.0
 ; GFX89-FLUSH-NEXT:    v_fma_f32 v2, v2, v7, v7
@@ -718,8 +718,8 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX10-IEEE-NEXT:    v_fma_f32 v8, -v4, v6, 1.0
 ; GFX10-IEEE-NEXT:    v_fma_f32 v9, -v5, v7, 1.0
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v6, v8, v6
-; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v7, v9, v7
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v8, s4, v1, v3, v1
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v7, v9, v7
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v10, v6
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v11, v8, v7
 ; GFX10-IEEE-NEXT:    v_fma_f32 v12, v9, -v4, v10
@@ -850,16 +850,16 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v8, v4
 ; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v9, v5
 ; GFX89-IEEE-NEXT:    v_fma_f32 v10, -v4, v8, 1.0
-; GFX89-IEEE-NEXT:    v_fma_f32 v11, -v5, v9, 1.0
 ; GFX89-IEEE-NEXT:    v_fma_f32 v8, v10, v8, v8
+; GFX89-IEEE-NEXT:    v_fma_f32 v11, -v5, v9, 1.0
 ; GFX89-IEEE-NEXT:    v_fma_f32 v9, v11, v9, v9
 ; GFX89-IEEE-NEXT:    v_mul_f32_e32 v10, v6, v8
-; GFX89-IEEE-NEXT:    v_fma_f32 v12, -v4, v10, v6
 ; GFX89-IEEE-NEXT:    v_mul_f32_e32 v11, v7, v9
+; GFX89-IEEE-NEXT:    v_fma_f32 v12, -v4, v10, v6
 ; GFX89-IEEE-NEXT:    v_fma_f32 v13, -v5, v11, v7
 ; GFX89-IEEE-NEXT:    v_fma_f32 v10, v12, v8, v10
-; GFX89-IEEE-NEXT:    v_fma_f32 v4, -v4, v10, v6
 ; GFX89-IEEE-NEXT:    v_fma_f32 v11, v13, v9, v11
+; GFX89-IEEE-NEXT:    v_fma_f32 v4, -v4, v10, v6
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v8, v10
 ; GFX89-IEEE-NEXT:    v_fma_f32 v5, -v5, v11, v7
 ; GFX89-IEEE-NEXT:    s_mov_b64 vcc, s[4:5]
@@ -880,8 +880,8 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX10-IEEE-NEXT:    v_fma_f32 v8, -v4, v6, 1.0
 ; GFX10-IEEE-NEXT:    v_fma_f32 v9, -v5, v7, 1.0
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v6, v8, v6
-; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v7, v9, v7
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v8, s4, v1, v3, v1
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v7, v9, v7
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v10, v6
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v11, v8, v7
 ; GFX10-IEEE-NEXT:    v_fma_f32 v12, v9, -v4, v10
@@ -964,8 +964,8 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v1, 1.0
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
@@ -989,18 +989,18 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v6, v2
 ; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v7, v3
 ; GFX89-IEEE-NEXT:    v_fma_f32 v8, -v2, v6, 1.0
-; GFX89-IEEE-NEXT:    v_fma_f32 v9, -v3, v7, 1.0
 ; GFX89-IEEE-NEXT:    v_fma_f32 v6, v8, v6, v6
+; GFX89-IEEE-NEXT:    v_fma_f32 v9, -v3, v7, 1.0
 ; GFX89-IEEE-NEXT:    v_fma_f32 v7, v9, v7, v7
 ; GFX89-IEEE-NEXT:    v_mul_f32_e32 v8, v4, v6
-; GFX89-IEEE-NEXT:    v_fma_f32 v10, -v2, v8, v4
 ; GFX89-IEEE-NEXT:    v_mul_f32_e32 v9, v5, v7
+; GFX89-IEEE-NEXT:    v_fma_f32 v10, -v2, v8, v4
 ; GFX89-IEEE-NEXT:    v_fma_f32 v11, -v3, v9, v5
 ; GFX89-IEEE-NEXT:    v_fma_f32 v8, v10, v6, v8
-; GFX89-IEEE-NEXT:    v_fma_f32 v2, -v2, v8, v4
 ; GFX89-IEEE-NEXT:    v_fma_f32 v9, v11, v7, v9
-; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v6, v8
+; GFX89-IEEE-NEXT:    v_fma_f32 v2, -v2, v8, v4
 ; GFX89-IEEE-NEXT:    v_fma_f32 v3, -v3, v9, v5
+; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v6, v8
 ; GFX89-IEEE-NEXT:    s_mov_b64 vcc, s[4:5]
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v7, v9
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
@@ -1021,11 +1021,11 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX89-FLUSH-NEXT:    v_fma_f32 v5, v6, v4, v5
 ; GFX89-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
-; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX89-FLUSH-NEXT:    v_fma_f32 v2, -v3, v5, 1.0
 ; GFX89-FLUSH-NEXT:    v_fma_f32 v2, v2, v5, v5
@@ -1050,8 +1050,8 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX10-IEEE-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
 ; GFX10-IEEE-NEXT:    v_fma_f32 v7, -v3, v5, 1.0
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v4, v6, v4
-; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v5, v7, v5
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v6, s4, 1.0, v1, 1.0
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v5, v7, v5
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v7, v8, v4
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v6, v5
 ; GFX10-IEEE-NEXT:    v_fma_f32 v10, v7, -v2, v8
@@ -1146,8 +1146,8 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v1, 1.0
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
@@ -1171,18 +1171,18 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v6, v2
 ; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v7, v3
 ; GFX89-IEEE-NEXT:    v_fma_f32 v8, -v2, v6, 1.0
-; GFX89-IEEE-NEXT:    v_fma_f32 v9, -v3, v7, 1.0
 ; GFX89-IEEE-NEXT:    v_fma_f32 v6, v8, v6, v6
+; GFX89-IEEE-NEXT:    v_fma_f32 v9, -v3, v7, 1.0
 ; GFX89-IEEE-NEXT:    v_fma_f32 v7, v9, v7, v7
 ; GFX89-IEEE-NEXT:    v_mul_f32_e32 v8, v4, v6
-; GFX89-IEEE-NEXT:    v_fma_f32 v10, -v2, v8, v4
 ; GFX89-IEEE-NEXT:    v_mul_f32_e32 v9, v5, v7
+; GFX89-IEEE-NEXT:    v_fma_f32 v10, -v2, v8, v4
 ; GFX89-IEEE-NEXT:    v_fma_f32 v11, -v3, v9, v5
 ; GFX89-IEEE-NEXT:    v_fma_f32 v8, v10, v6, v8
-; GFX89-IEEE-NEXT:    v_fma_f32 v2, -v2, v8, v4
 ; GFX89-IEEE-NEXT:    v_fma_f32 v9, v11, v7, v9
-; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v6, v8
+; GFX89-IEEE-NEXT:    v_fma_f32 v2, -v2, v8, v4
 ; GFX89-IEEE-NEXT:    v_fma_f32 v3, -v3, v9, v5
+; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v6, v8
 ; GFX89-IEEE-NEXT:    s_mov_b64 vcc, s[4:5]
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v7, v9
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
@@ -1203,11 +1203,11 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX89-FLUSH-NEXT:    v_fma_f32 v5, v6, v4, v5
 ; GFX89-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
-; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX89-FLUSH-NEXT:    v_fma_f32 v2, -v3, v5, 1.0
 ; GFX89-FLUSH-NEXT:    v_fma_f32 v2, v2, v5, v5
@@ -1232,8 +1232,8 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX10-IEEE-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
 ; GFX10-IEEE-NEXT:    v_fma_f32 v7, -v3, v5, 1.0
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v4, v6, v4
-; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v5, v7, v5
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v6, s4, 1.0, v1, 1.0
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v5, v7, v5
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v7, v8, v4
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v6, v5
 ; GFX10-IEEE-NEXT:    v_fma_f32 v10, v7, -v2, v8
@@ -1441,16 +1441,16 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v8, v4
 ; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v9, v5
 ; GFX89-IEEE-NEXT:    v_fma_f32 v10, -v4, v8, 1.0
-; GFX89-IEEE-NEXT:    v_fma_f32 v11, -v5, v9, 1.0
 ; GFX89-IEEE-NEXT:    v_fma_f32 v8, v10, v8, v8
+; GFX89-IEEE-NEXT:    v_fma_f32 v11, -v5, v9, 1.0
 ; GFX89-IEEE-NEXT:    v_fma_f32 v9, v11, v9, v9
 ; GFX89-IEEE-NEXT:    v_mul_f32_e32 v10, v6, v8
-; GFX89-IEEE-NEXT:    v_fma_f32 v12, -v4, v10, v6
 ; GFX89-IEEE-NEXT:    v_mul_f32_e32 v11, v7, v9
+; GFX89-IEEE-NEXT:    v_fma_f32 v12, -v4, v10, v6
 ; GFX89-IEEE-NEXT:    v_fma_f32 v13, -v5, v11, v7
 ; GFX89-IEEE-NEXT:    v_fma_f32 v10, v12, v8, v10
-; GFX89-IEEE-NEXT:    v_fma_f32 v4, -v4, v10, v6
 ; GFX89-IEEE-NEXT:    v_fma_f32 v11, v13, v9, v11
+; GFX89-IEEE-NEXT:    v_fma_f32 v4, -v4, v10, v6
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v8, v10
 ; GFX89-IEEE-NEXT:    v_fma_f32 v5, -v5, v11, v7
 ; GFX89-IEEE-NEXT:    s_mov_b64 vcc, s[4:5]
@@ -1471,8 +1471,8 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX10-IEEE-NEXT:    v_fma_f32 v8, -v4, v6, 1.0
 ; GFX10-IEEE-NEXT:    v_fma_f32 v9, -v5, v7, 1.0
 ; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v6, v8, v6
-; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v7, v9, v7
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v8, s4, v1, v3, v1
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v7, v9, v7
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v10, v6
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v11, v8, v7
 ; GFX10-IEEE-NEXT:    v_fma_f32 v12, v9, -v4, v10

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
index ca836897baa41..5431744849247 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
@@ -18,8 +18,8 @@ define double @v_fdiv_f64(double %a, double %b) {
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v11
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v11
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
@@ -121,8 +121,8 @@ define double @v_fdiv_f64_ulp25(double %a, double %b) {
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v11
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v11
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
@@ -192,12 +192,12 @@ define double @v_rcp_f64(double %x) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
-; GFX6-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
 ; GFX6-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX6-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v9
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v3
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
 ; GFX6-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -265,12 +265,12 @@ define double @v_rcp_f64_arcp(double %x) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
-; GFX6-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
 ; GFX6-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX6-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v9
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v3
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
 ; GFX6-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -369,12 +369,12 @@ define double @v_rcp_f64_ulp25(double %x) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
-; GFX6-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
 ; GFX6-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX6-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v9
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v3
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
 ; GFX6-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -474,8 +474,8 @@ define double @v_fdiv_f64_arcp_ulp25(double %a, double %b) {
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v11
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v11
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
@@ -545,18 +545,18 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3]
-; GFX6-NEXT:    v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
+; GFX6-NEXT:    v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[16:17], v[14:15]
+; GFX6-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v19
+; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v9
 ; GFX6-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v15
-; GFX6-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
 ; GFX6-NEXT:    v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v15
 ; GFX6-NEXT:    v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17]
 ; GFX6-NEXT:    v_mul_f64 v[16:17], v[18:19], v[10:11]
 ; GFX6-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19]
@@ -646,8 +646,8 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
 ; GFX10-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
-; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
 ; GFX10-NEXT:    v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3]
+; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
 ; GFX10-NEXT:    v_mul_f64 v[18:19], v[20:21], v[12:13]
 ; GFX10-NEXT:    v_mul_f64 v[22:23], v[16:17], v[14:15]
 ; GFX10-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21]
@@ -715,18 +715,18 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3]
-; GFX6-NEXT:    v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
+; GFX6-NEXT:    v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[16:17], v[14:15]
+; GFX6-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v19
+; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v9
 ; GFX6-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v15
-; GFX6-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
 ; GFX6-NEXT:    v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v15
 ; GFX6-NEXT:    v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17]
 ; GFX6-NEXT:    v_mul_f64 v[16:17], v[18:19], v[10:11]
 ; GFX6-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19]
@@ -816,8 +816,8 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX10-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
-; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
 ; GFX10-NEXT:    v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3]
+; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
 ; GFX10-NEXT:    v_mul_f64 v[18:19], v[20:21], v[12:13]
 ; GFX10-NEXT:    v_mul_f64 v[22:23], v[16:17], v[14:15]
 ; GFX10-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21]
@@ -838,8 +838,8 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], 1.0, v[0:1], 1.0
-; GFX6-NEXT:    v_mov_b32_e32 v18, 0x3ff00000
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v18, 0x3ff00000
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v11
 ; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
@@ -878,8 +878,8 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
 ; GFX8-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; GFX8-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX8-NEXT:    v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0
+; GFX8-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX8-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9]
@@ -906,8 +906,8 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
 ; GFX9-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX9-NEXT:    v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0
+; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX9-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9]
@@ -939,8 +939,8 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
 ; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX10-NEXT:    v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX10-NEXT:    v_mul_f64 v[14:15], v[16:17], v[8:9]
 ; GFX10-NEXT:    v_mul_f64 v[18:19], v[12:13], v[10:11]
 ; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
@@ -961,8 +961,8 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], 1.0, v[0:1], 1.0
-; GFX6-NEXT:    v_mov_b32_e32 v18, 0x3ff00000
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v18, 0x3ff00000
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v11
 ; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
@@ -1001,8 +1001,8 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
 ; GFX8-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; GFX8-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX8-NEXT:    v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0
+; GFX8-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX8-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9]
@@ -1029,8 +1029,8 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
 ; GFX9-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX9-NEXT:    v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0
+; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX9-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9]
@@ -1062,8 +1062,8 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
 ; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX10-NEXT:    v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX10-NEXT:    v_mul_f64 v[14:15], v[16:17], v[8:9]
 ; GFX10-NEXT:    v_mul_f64 v[18:19], v[12:13], v[10:11]
 ; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
@@ -1131,8 +1131,8 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], 1.0, v[0:1], 1.0
-; GFX6-NEXT:    v_mov_b32_e32 v18, 0x3ff00000
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v18, 0x3ff00000
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v18, v11
 ; GFX6-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
 ; GFX6-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
@@ -1171,8 +1171,8 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
 ; GFX8-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; GFX8-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX8-NEXT:    v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0
+; GFX8-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX8-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0
 ; GFX8-NEXT:    v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9]
@@ -1199,8 +1199,8 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
 ; GFX9-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX9-NEXT:    v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0
+; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX9-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0
 ; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9]
@@ -1232,8 +1232,8 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
 ; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX10-NEXT:    v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX10-NEXT:    v_mul_f64 v[14:15], v[16:17], v[8:9]
 ; GFX10-NEXT:    v_mul_f64 v[18:19], v[12:13], v[10:11]
 ; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
@@ -1301,18 +1301,18 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3]
-; GFX6-NEXT:    v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
+; GFX6-NEXT:    v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[16:17], v[14:15]
+; GFX6-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v19
+; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v9
 ; GFX6-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v15
-; GFX6-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; GFX6-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
 ; GFX6-NEXT:    v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v15
 ; GFX6-NEXT:    v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17]
 ; GFX6-NEXT:    v_mul_f64 v[16:17], v[18:19], v[10:11]
 ; GFX6-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19]
@@ -1402,8 +1402,8 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX10-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
-; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
 ; GFX10-NEXT:    v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3]
+; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
 ; GFX10-NEXT:    v_mul_f64 v[18:19], v[20:21], v[12:13]
 ; GFX10-NEXT:    v_mul_f64 v[22:23], v[16:17], v[14:15]
 ; GFX10-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 0be99514c7498..20bb529566d5e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -12,8 +12,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
 ; GFX9-NEXT:    s_and_b32 s0, s0, 15
-; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-NEXT:    s_add_i32 s1, s1, 4
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-NEXT:    scratch_store_dword off, v0, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_add_i32 s0, s0, 4
@@ -58,12 +58,12 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v3, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -109,11 +109,11 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s32
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v3, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -183,9 +183,9 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
 ; GFX9-NEXT:    s_and_b32 s0, s0, 15
-; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-NEXT:    s_addk_i32 s1, 0x104
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-NEXT:    scratch_store_dword off, v0, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_addk_i32 s0, 0x104
@@ -239,11 +239,11 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x104
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v3, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -297,11 +297,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, vcc_hi
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v3, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -355,9 +355,9 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
 ; GFX9-NEXT:    s_and_b32 s0, s0, 15
-; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-NEXT:    s_addk_i32 s1, 0x4004
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-NEXT:    scratch_store_dword off, v0, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_addk_i32 s0, 0x4004
@@ -411,11 +411,11 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4004
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v3, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -469,11 +469,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, vcc_hi
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v3, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -521,9 +521,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-NEXT:    s_movk_i32 s0, 0x3e80
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
+; GFX9-NEXT:    s_movk_i32 s0, 0x3e80
 ; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
@@ -566,8 +566,8 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX9-LABEL: store_load_large_imm_offset_foo:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s0, 0x3e80
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 13
+; GFX9-NEXT:    s_movk_i32 s0, 0x3e80
 ; GFX9-NEXT:    scratch_store_dword off, v0, s32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
index d7ea1404b1175..a4f5948da2c69 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
@@ -7,8 +7,8 @@ define double @v_floor_f64_ieee(double %x) {
 ; GFX6-LABEL: v_floor_f64_ieee:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
+; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
 ; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
 ; GFX6-NEXT:    v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@@ -30,8 +30,8 @@ define double @v_floor_f64_ieee_nnan(double %x) {
 ; GFX6-LABEL: v_floor_f64_ieee_nnan:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
+; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
 ; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
 ; GFX6-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
@@ -50,8 +50,8 @@ define double @v_floor_f64_ieee_fneg(double %x) {
 ; GFX6-LABEL: v_floor_f64_ieee_fneg:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    v_fract_f64_e64 v[2:3], -v[0:1]
+; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
 ; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
 ; GFX6-NEXT:    v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@@ -74,8 +74,8 @@ define double @v_floor_f64_nonieee(double %x) #1 {
 ; GFX6-LABEL: v_floor_f64_nonieee:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
+; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
 ; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
 ; GFX6-NEXT:    v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@@ -97,8 +97,8 @@ define double @v_floor_f64_nonieee_nnan(double %x) #1 {
 ; GFX6-LABEL: v_floor_f64_nonieee_nnan:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
+; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
 ; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
 ; GFX6-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
@@ -117,8 +117,8 @@ define double @v_floor_f64_non_ieee_fneg(double %x) #1 {
 ; GFX6-LABEL: v_floor_f64_non_ieee_fneg:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    v_fract_f64_e64 v[2:3], -v[0:1]
+; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
 ; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
 ; GFX6-NEXT:    v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@@ -141,8 +141,8 @@ define double @v_floor_f64_fabs(double %x) {
 ; GFX6-LABEL: v_floor_f64_fabs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    v_fract_f64_e64 v[2:3], |v[0:1]|
+; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
 ; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
 ; GFX6-NEXT:    v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@@ -170,8 +170,8 @@ define double @v_floor_f64_fneg_fabs(double %x) {
 ; GFX6-LABEL: v_floor_f64_fneg_fabs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    v_fract_f64_e64 v[2:3], -|v[0:1]|
+; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
 ; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
 ; GFX6-NEXT:    v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@@ -194,8 +194,8 @@ define double @v_floor_f64_fneg_fabs(double %x) {
 define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) {
 ; GFX6-LABEL: s_floor_f64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    v_fract_f64_e32 v[0:1], s[2:3]
+; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_mov_b32 s1, 0x3fefffff
 ; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], s[0:1]
 ; GFX6-NEXT:    v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
@@ -218,8 +218,8 @@ define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) {
 define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) {
 ; GFX6-LABEL: s_floor_f64_fneg:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    v_fract_f64_e64 v[0:1], -s[2:3]
+; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_mov_b32 s1, 0x3fefffff
 ; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], s[0:1]
 ; GFX6-NEXT:    v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
@@ -243,8 +243,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) {
 define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) {
 ; GFX6-LABEL: s_floor_f64_fabs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    v_fract_f64_e64 v[0:1], |s[2:3]|
+; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_mov_b32 s1, 0x3fefffff
 ; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], s[0:1]
 ; GFX6-NEXT:    v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
@@ -268,8 +268,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) {
 define amdgpu_ps <2 x float> @s_floor_f64_fneg_fabs(double inreg %x) {
 ; GFX6-LABEL: s_floor_f64_fneg_fabs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    v_fract_f64_e64 v[0:1], -|s[2:3]|
+; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_mov_b32 s1, 0x3fefffff
 ; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], s[0:1]
 ; GFX6-NEXT:    v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
index cc7dd885c9940..4e7663838db0b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
@@ -259,8 +259,8 @@ define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x h
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX6-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX6-NEXT:    s_mov_b32 s4, 0x80008000
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_mov_b32 s4, 0x80008000
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -328,15 +328,15 @@ define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v9, v9
 ; GFX6-NEXT:    v_fma_f32 v0, v0, v4, v8
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v6
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v7
 ; GFX6-NEXT:    v_fma_f32 v1, v1, v5, v9
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v10
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v7
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v11
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_fma_f32 v2, v2, v4, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_fma_f32 v3, v3, v6, v7
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -349,15 +349,15 @@ define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX8-NEXT:    v_fma_f16 v0, v0, v2, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
+; GFX8-NEXT:    v_fma_f16 v0, v0, v2, v4
 ; GFX8-NEXT:    v_fma_f16 v2, v6, v8, v10
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 16
 ; GFX8-NEXT:    v_fma_f16 v1, v1, v3, v5
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_fma_f16 v3, v7, v9, v11
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
index 295c3645dac6d..9f327a06919da 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
@@ -36,12 +36,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrs
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
-; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v8
-; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    v_mov_b32_e32 v5, s7
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
@@ -146,12 +146,12 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
-; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
-; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    v_mov_b32_e32 v5, s7
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
@@ -270,12 +270,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
-; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
-; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    v_mov_b32_e32 v5, s7
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
@@ -285,8 +285,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    flat_load_dword v3, v[4:5] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_mov_b32 s2, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -390,12 +390,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
-; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
-; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    v_mov_b32_e32 v5, s7
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
@@ -514,12 +514,12 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out,
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
-; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
-; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    v_mov_b32_e32 v5, s7
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
@@ -642,12 +642,12 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
-; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
-; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    v_mov_b32_e32 v5, s7
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
@@ -663,8 +663,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mul_f32_e32 v4, 1.0, v7
 ; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; VI-NEXT:    v_min_f32_e32 v5, v4, v2
 ; VI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; VI-NEXT:    v_min_f32_e32 v5, v4, v2
 ; VI-NEXT:    v_max_f32_e32 v2, v4, v2
 ; VI-NEXT:    v_min_f32_e32 v2, v2, v3
 ; VI-NEXT:    v_max_f32_e32 v2, v5, v2
@@ -686,9 +686,9 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX9-NEXT:    v_min_f32_e32 v4, v1, v2
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX9-NEXT:    global_store_dword v[0:1], v4, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
index ec22e3451ba22..8e7d2db7a2b7f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
@@ -265,8 +265,8 @@ define <4 x half> @v_fmul_v4f16_fneg_lhs_fneg_rhs(<4 x half> %a, <4 x half> %b)
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0x80008000
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_xor_b32_e32 v2, s4, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX8-NEXT:    v_xor_b32_e32 v2, s4, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v3, s4, v3
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -436,9 +436,9 @@ define <6 x half> @v_fmul_v6f16_fneg_lhs_fneg_rhs(<6 x half> %a, <6 x half> %b)
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0x80008000
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_xor_b32_e32 v3, s4, v3
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s4, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s4, v2
+; GFX8-NEXT:    v_xor_b32_e32 v3, s4, v3
 ; GFX8-NEXT:    v_xor_b32_e32 v4, s4, v4
 ; GFX8-NEXT:    v_xor_b32_e32 v5, s4, v5
 ; GFX8-NEXT:    v_mul_f16_e32 v6, v0, v3
@@ -636,10 +636,10 @@ define <8 x half> @v_fmul_v8f16_fneg_lhs_fneg_rhs(<8 x half> %a, <8 x half> %b)
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0x80008000
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_xor_b32_e32 v4, s4, v4
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s4, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s4, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v3, s4, v3
+; GFX8-NEXT:    v_xor_b32_e32 v4, s4, v4
 ; GFX8-NEXT:    v_xor_b32_e32 v5, s4, v5
 ; GFX8-NEXT:    v_xor_b32_e32 v6, s4, v6
 ; GFX8-NEXT:    v_xor_b32_e32 v7, s4, v7

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
index 51a3efcc40292..bc2c5c7371046 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -188,10 +188,10 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX9-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_exp_f16_e32 v1, v2
+; GFX9-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, v2, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -274,10 +274,10 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX9-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_exp_f16_e32 v1, v2
+; GFX9-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, v2, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -310,10 +310,10 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
@@ -340,8 +340,8 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v2, v3
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX8-NEXT:    v_exp_f16_e32 v0, v0
@@ -437,8 +437,8 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v2, v3
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX8-NEXT:    v_exp_f16_e32 v0, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index d8abab5e208d6..29190d7931902 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -543,8 +543,8 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX6-NEXT:    s_lshr_b32 s4, s2, 8
 ; GFX6-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX6-NEXT:    s_lshr_b32 s5, s5, 1
-; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x80008
 ; GFX6-NEXT:    s_lshr_b32 s2, s5, s2
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x80008
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, 7
 ; GFX6-NEXT:    s_andn2_b32 s4, 7, s4
@@ -622,11 +622,11 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX10-NEXT:    s_movk_i32 s6, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s5, s2, 8
 ; GFX10-NEXT:    s_and_b32 s4, s4, s6
+; GFX10-NEXT:    s_and_b32 s7, s2, 7
 ; GFX10-NEXT:    s_and_b32 s1, s1, s6
 ; GFX10-NEXT:    s_bfe_u32 s4, s4, 0x100000
-; GFX10-NEXT:    s_and_b32 s7, s2, 7
-; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s7
 ; GFX10-NEXT:    s_and_b32 s7, s5, 7
 ; GFX10-NEXT:    s_andn2_b32 s5, 7, s5
@@ -694,11 +694,11 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
-; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v5
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, 7, v5
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, v1, v3
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v5
 ; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v2, v3
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -718,13 +718,13 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
-; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v6, v0
-; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v5
+; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 7, v5
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v1, v3
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v5
 ; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v1, v3
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v3, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v2, v3
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -740,11 +740,11 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
 ; GFX10-NEXT:    s_movk_i32 s4, 0xff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v3
 ; GFX10-NEXT:    v_and_b32_e32 v4, s4, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 7, v3
 ; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX10-NEXT:    v_and_b32_e32 v6, 7, v6
@@ -798,11 +798,11 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX6-NEXT:    s_andn2_b32 s6, 7, s7
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s6
-; GFX6-NEXT:    s_and_b32 s2, s2, s10
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    s_and_b32 s4, s8, 7
 ; GFX6-NEXT:    s_andn2_b32 s6, 7, s8
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 25
+; GFX6-NEXT:    s_and_b32 s2, s2, s10
 ; GFX6-NEXT:    s_lshl_b32 s4, s5, s4
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, s6
 ; GFX6-NEXT:    s_and_b32 s0, s0, s10
@@ -810,8 +810,8 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX6-NEXT:    s_or_b32 s1, s4, s1
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s3, s10
-; GFX6-NEXT:    s_and_b32 s1, s1, s10
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_and_b32 s1, s1, s10
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -831,11 +831,11 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX8-NEXT:    s_and_b32 s12, s2, 7
 ; GFX8-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s5, s0, 24
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s12
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s9, 7
 ; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
@@ -852,19 +852,19 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX8-NEXT:    s_andn2_b32 s3, 7, s10
 ; GFX8-NEXT:    s_lshr_b32 s4, s4, 1
 ; GFX8-NEXT:    s_lshr_b32 s3, s4, s3
-; GFX8-NEXT:    s_and_b32 s1, s1, s13
 ; GFX8-NEXT:    s_or_b32 s2, s2, s3
 ; GFX8-NEXT:    s_and_b32 s3, s11, 7
-; GFX8-NEXT:    s_lshl_b32 s3, s5, s3
+; GFX8-NEXT:    s_and_b32 s1, s1, s13
 ; GFX8-NEXT:    s_andn2_b32 s4, 7, s11
+; GFX8-NEXT:    s_lshl_b32 s3, s5, s3
 ; GFX8-NEXT:    s_lshr_b32 s5, s8, 1
 ; GFX8-NEXT:    s_and_b32 s0, s0, s13
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_lshr_b32 s4, s5, s4
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s2, s13
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s3, s3, s4
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s3, s13
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
@@ -885,11 +885,11 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX9-NEXT:    s_and_b32 s12, s2, 7
 ; GFX9-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s5, s0, 24
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s12
+; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_and_b32 s1, s9, 7
 ; GFX9-NEXT:    s_lshl_b32 s1, s3, s1
@@ -906,19 +906,19 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX9-NEXT:    s_andn2_b32 s3, 7, s10
 ; GFX9-NEXT:    s_lshr_b32 s4, s4, 1
 ; GFX9-NEXT:    s_lshr_b32 s3, s4, s3
-; GFX9-NEXT:    s_and_b32 s1, s1, s13
 ; GFX9-NEXT:    s_or_b32 s2, s2, s3
 ; GFX9-NEXT:    s_and_b32 s3, s11, 7
-; GFX9-NEXT:    s_lshl_b32 s3, s5, s3
+; GFX9-NEXT:    s_and_b32 s1, s1, s13
 ; GFX9-NEXT:    s_andn2_b32 s4, 7, s11
+; GFX9-NEXT:    s_lshl_b32 s3, s5, s3
 ; GFX9-NEXT:    s_lshr_b32 s5, s8, 1
 ; GFX9-NEXT:    s_and_b32 s0, s0, s13
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX9-NEXT:    s_lshr_b32 s4, s5, s4
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_and_b32 s1, s2, s13
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s3, s3, s4
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_and_b32 s1, s3, s13
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
@@ -946,10 +946,10 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX10-NEXT:    s_andn2_b32 s9, 7, s9
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
-; GFX10-NEXT:    s_lshl_b32 s3, s3, s6
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s0, 24
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s13
+; GFX10-NEXT:    s_lshl_b32 s3, s3, s6
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, s9
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    s_or_b32 s1, s3, s2
@@ -963,13 +963,13 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX10-NEXT:    s_and_b32 s4, s12, 7
 ; GFX10-NEXT:    s_andn2_b32 s6, 7, s12
 ; GFX10-NEXT:    s_lshr_b32 s7, s8, 1
-; GFX10-NEXT:    s_or_b32 s2, s3, s2
-; GFX10-NEXT:    s_and_b32 s1, s1, s11
 ; GFX10-NEXT:    s_lshl_b32 s4, s5, s4
 ; GFX10-NEXT:    s_lshr_b32 s5, s7, s6
+; GFX10-NEXT:    s_or_b32 s2, s3, s2
+; GFX10-NEXT:    s_and_b32 s1, s1, s11
+; GFX10-NEXT:    s_or_b32 s3, s4, s5
 ; GFX10-NEXT:    s_and_b32 s0, s0, s11
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX10-NEXT:    s_or_b32 s3, s4, s5
 ; GFX10-NEXT:    s_and_b32 s2, s2, s11
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    s_lshl_b32 s1, s2, 16
@@ -990,12 +990,12 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX6-LABEL: v_fshl_v4i8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v10, 0xff, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX6-NEXT:    v_and_b32_e32 v9, 7, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX6-NEXT:    v_and_b32_e32 v10, 0xff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 1, v10
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
@@ -1021,11 +1021,11 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX6-NEXT:    v_mov_b32_e32 v9, 0xff
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v8
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v9
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 7, v8
 ; GFX6-NEXT:    v_and_b32_e32 v6, 7, v6
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 25, v1
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v9
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v6, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, v0, v9
@@ -1033,8 +1033,8 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX6-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, v3, v9
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v9
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, v1, v9
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1043,12 +1043,12 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX8-LABEL: v_fshl_v4i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v8, 7, v2
-; GFX8-NEXT:    v_mov_b32_e32 v10, 1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX8-NEXT:    v_and_b32_e32 v8, 7, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_mov_b32_e32 v10, 1
 ; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v8, v8, v0
@@ -1060,17 +1060,17 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX8-NEXT:    v_and_b32_e32 v5, 7, v5
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v5, v4
-; GFX8-NEXT:    v_lshlrev_b16_e32 v3, v8, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0xff
-; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v6
+; GFX8-NEXT:    v_lshlrev_b16_e32 v3, v8, v3
+; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v5, v4
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX8-NEXT:    v_and_b32_e32 v4, 7, v6
+; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v6
 ; GFX8-NEXT:    v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v5, 7, v5
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 1, v6
-; GFX8-NEXT:    v_lshrrev_b16_e32 v5, v5, v6
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v5, v5, v6
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_and_b32_e32 v5, 7, v7
 ; GFX8-NEXT:    v_xor_b32_e32 v6, -1, v7
@@ -1081,12 +1081,12 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v6, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 8
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -1095,12 +1095,12 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-LABEL: v_fshl_v4i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v8, 7, v2
-; GFX9-NEXT:    s_mov_b32 s5, 1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX9-NEXT:    v_and_b32_e32 v8, 7, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    s_mov_b32 s5, 1
 ; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v10, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v8, v8, v0
@@ -1112,17 +1112,17 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX9-NEXT:    v_and_b32_e32 v5, 7, v5
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshrrev_b16_e32 v4, v5, v4
-; GFX9-NEXT:    v_lshlrev_b16_e32 v3, v8, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0xff
-; GFX9-NEXT:    v_xor_b32_e32 v5, -1, v6
+; GFX9-NEXT:    v_lshlrev_b16_e32 v3, v8, v3
+; GFX9-NEXT:    v_lshrrev_b16_e32 v4, v5, v4
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_and_b32_e32 v4, 7, v6
+; GFX9-NEXT:    v_xor_b32_e32 v5, -1, v6
 ; GFX9-NEXT:    v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v5, 7, v5
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v6, 1, v6
-; GFX9-NEXT:    v_lshrrev_b16_e32 v5, v5, v6
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshrrev_b16_e32 v5, v5, v6
 ; GFX9-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX9-NEXT:    v_and_b32_e32 v5, 7, v7
 ; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v7
@@ -1135,9 +1135,9 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, s4, v1
 ; GFX9-NEXT:    v_and_b32_e32 v2, s4, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
@@ -1149,52 +1149,52 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
 ; GFX10-NEXT:    v_and_b32_e32 v11, 7, v2
-; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v11, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v8
 ; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v13, 0xff
 ; GFX10-NEXT:    s_movk_i32 s4, 0xff
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX10-NEXT:    v_and_b32_e32 v12, s4, v1
 ; GFX10-NEXT:    v_and_b32_e32 v6, s4, v6
-; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_xor_b32_e32 v13, -1, v2
 ; GFX10-NEXT:    v_lshlrev_b16 v3, v8, v3
 ; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v9
-; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX10-NEXT:    v_and_b32_e32 v9, 7, v9
+; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_xor_b32_e32 v13, -1, v2
 ; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
 ; GFX10-NEXT:    v_lshrrev_b16 v6, 1, v6
+; GFX10-NEXT:    v_and_b32_e32 v9, 7, v9
 ; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 1, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX10-NEXT:    v_and_b32_e32 v13, 7, v13
 ; GFX10-NEXT:    v_lshrrev_b16 v7, 1, v7
-; GFX10-NEXT:    v_lshrrev_b16 v6, v11, v6
-; GFX10-NEXT:    v_lshlrev_b16 v2, v2, v5
 ; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
 ; GFX10-NEXT:    v_lshrrev_b16 v12, 1, v12
-; GFX10-NEXT:    v_lshrrev_b16 v5, v13, v7
+; GFX10-NEXT:    v_lshrrev_b16 v6, v11, v6
 ; GFX10-NEXT:    v_lshlrev_b16 v4, v9, v4
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v8, v1
-; GFX10-NEXT:    v_or_b32_e32 v3, v3, v6
+; GFX10-NEXT:    v_lshlrev_b16 v2, v2, v5
+; GFX10-NEXT:    v_lshrrev_b16 v5, v13, v7
 ; GFX10-NEXT:    v_lshrrev_b16 v7, v10, v12
-; GFX10-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX10-NEXT:    v_or_b32_e32 v3, v3, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 8
 ; GFX10-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX10-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v7
-; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i32 %lhs.arg to <4 x i8>
@@ -1481,36 +1481,36 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    s_lshr_b32 s8, s1, 8
-; GFX6-NEXT:    s_and_b32 s1, s1, s9
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX6-NEXT:    s_and_b32 s1, s1, s9
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s6
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_and_b32 s6, s8, s9
 ; GFX6-NEXT:    s_or_b32 s1, s7, s1
 ; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX6-NEXT:    s_or_b32 s1, s1, s6
 ; GFX6-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s2, 24
 ; GFX6-NEXT:    s_and_b32 s10, s2, s9
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, s11
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v0
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 8
 ; GFX6-NEXT:    s_and_b32 s6, s6, s9
 ; GFX6-NEXT:    s_or_b32 s2, s10, s2
 ; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s8, s3, 8
-; GFX6-NEXT:    s_and_b32 s3, s3, s9
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX6-NEXT:    s_and_b32 s3, s3, s9
 ; GFX6-NEXT:    s_or_b32 s2, s2, s6
+; GFX6-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX6-NEXT:    s_and_b32 s6, s8, s9
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    s_or_b32 s3, s7, s3
 ; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX6-NEXT:    s_bfe_u32 s3, s3, 0x100000
@@ -1542,22 +1542,22 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v2, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    s_and_b32 s6, s8, s9
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    s_or_b32 s5, s7, s5
 ; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX6-NEXT:    s_or_b32 s5, s5, s6
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX6-NEXT:    s_mov_b32 s6, 0xffffff
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 23, v0
-; GFX6-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX6-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GFX6-NEXT:    v_lshl_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_lshr_b32 s0, s2, 1
 ; GFX6-NEXT:    v_and_b32_e32 v2, s6, v3
@@ -1575,13 +1575,13 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
 ; GFX6-NEXT:    s_lshr_b32 s0, s3, 1
 ; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
-; GFX6-NEXT:    v_bfe_u32 v3, v0, 8, 8
 ; GFX6-NEXT:    v_lshl_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_lshr_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_bfe_u32 v3, v0, 8, 8
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, s9, v0
-; GFX6-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
@@ -1612,17 +1612,17 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_lshr_b32 s9, s1, 8
-; GFX8-NEXT:    s_and_b32 s1, s1, s10
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX8-NEXT:    s_lshl_b32 s1, s1, s11
+; GFX8-NEXT:    s_and_b32 s1, s1, s10
 ; GFX8-NEXT:    s_or_b32 s0, s0, s6
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s11
 ; GFX8-NEXT:    s_and_b32 s6, s9, s10
 ; GFX8-NEXT:    s_or_b32 s1, s8, s1
 ; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    s_or_b32 s1, s1, s6
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 8
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -1637,17 +1637,17 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v1, v0
 ; GFX8-NEXT:    s_lshr_b32 s9, s3, 8
-; GFX8-NEXT:    s_and_b32 s3, s3, s10
 ; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX8-NEXT:    s_lshl_b32 s3, s3, s11
+; GFX8-NEXT:    s_and_b32 s3, s3, s10
 ; GFX8-NEXT:    s_or_b32 s2, s2, s6
+; GFX8-NEXT:    s_lshl_b32 s3, s3, s11
 ; GFX8-NEXT:    s_and_b32 s6, s9, s10
 ; GFX8-NEXT:    s_or_b32 s3, s8, s3
 ; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
-; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    s_or_b32 s3, s3, s6
 ; GFX8-NEXT:    s_lshr_b32 s6, s4, 8
 ; GFX8-NEXT:    s_and_b32 s6, s6, s10
@@ -1676,22 +1676,22 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v0
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v2, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    s_and_b32 s6, s9, s10
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    s_or_b32 s5, s8, s5
 ; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX8-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX8-NEXT:    s_or_b32 s5, s5, s6
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX8-NEXT:    s_mov_b32 s6, 0xffffff
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 23, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX8-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
 ; GFX8-NEXT:    s_lshr_b32 s0, s2, 1
 ; GFX8-NEXT:    v_and_b32_e32 v2, s6, v3
@@ -1707,8 +1707,8 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffffff
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 23, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_and_b32_e32 v2, v2, v4
 ; GFX8-NEXT:    s_lshr_b32 s0, s3, 1
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s1
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, v2, s0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -1719,8 +1719,8 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    v_and_b32_e32 v3, s10, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
@@ -1745,21 +1745,21 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_and_b32 s7, s9, s12
 ; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v0
 ; GFX9-NEXT:    s_lshr_b32 s11, s1, 8
-; GFX9-NEXT:    s_and_b32 s1, s1, s12
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_lshl_b32 s1, s1, s13
+; GFX9-NEXT:    s_and_b32 s1, s1, s12
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v0
 ; GFX9-NEXT:    s_or_b32 s0, s0, s7
+; GFX9-NEXT:    s_lshl_b32 s1, s1, s13
 ; GFX9-NEXT:    s_and_b32 s7, s11, s12
 ; GFX9-NEXT:    s_or_b32 s1, s10, s1
 ; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    s_or_b32 s1, s1, s7
 ; GFX9-NEXT:    s_lshr_b32 s7, s2, 8
+; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    s_and_b32 s7, s7, s12
 ; GFX9-NEXT:    s_lshr_b32 s9, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s10, s2, 24
@@ -1770,21 +1770,21 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, 24
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX9-NEXT:    s_lshr_b32 s11, s3, 8
-; GFX9-NEXT:    s_and_b32 s3, s3, s12
 ; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX9-NEXT:    s_and_b32 s3, s3, s12
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX9-NEXT:    s_or_b32 s2, s2, s7
-; GFX9-NEXT:    s_and_b32 s7, s11, s12
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, s13
+; GFX9-NEXT:    s_and_b32 s7, s11, s12
 ; GFX9-NEXT:    s_or_b32 s3, s10, s3
 ; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX9-NEXT:    s_or_b32 s3, s3, s7
 ; GFX9-NEXT:    s_lshr_b32 s7, s4, 8
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX9-NEXT:    s_and_b32 s7, s7, s12
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    s_lshr_b32 s9, s4, 16
@@ -1794,9 +1794,9 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_or_b32 s4, s4, s7
 ; GFX9-NEXT:    s_and_b32 s7, s9, s12
 ; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX9-NEXT:    s_or_b32 s4, s4, s7
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    s_lshr_b32 s11, s5, 8
@@ -1816,17 +1816,17 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s7, 0xffffff
 ; GFX9-NEXT:    v_sub_u32_e32 v3, 23, v0
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX9-NEXT:    v_and_b32_e32 v3, s7, v3
-; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, s7, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v3, v3, s2
+; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, s0, v0, v3
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
@@ -1840,11 +1840,11 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_lshr_b32 s0, s3, 1
 ; GFX9-NEXT:    v_and_b32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v2, v2, s0
-; GFX9-NEXT:    v_lshl_or_b32 v1, s1, v1, v2
 ; GFX9-NEXT:    s_mov_b32 s6, 8
+; GFX9-NEXT:    v_lshl_or_b32 v1, s1, v1, v2
+; GFX9-NEXT:    s_mov_b32 s8, 16
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_and_b32_e32 v3, s12, v1
-; GFX9-NEXT:    s_mov_b32 s8, 16
 ; GFX9-NEXT:    v_and_or_b32 v2, v0, s12, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -1875,9 +1875,9 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX10-NEXT:    s_and_b32 s8, s8, s9
 ; GFX10-NEXT:    s_and_b32 s0, s0, s9
 ; GFX10-NEXT:    s_lshl_b32 s6, s6, s11
+; GFX10-NEXT:    s_and_b32 s8, s8, s9
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    s_or_b32 s0, s0, s6
@@ -1895,49 +1895,50 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-NEXT:    s_and_b32 s8, s10, s9
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX10-NEXT:    s_bfe_u32 s8, s8, 0x100000
-; GFX10-NEXT:    s_and_b32 s5, s5, s9
 ; GFX10-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX10-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX10-NEXT:    s_lshl_b32 s5, s5, s11
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
+; GFX10-NEXT:    s_and_b32 s5, s5, s9
 ; GFX10-NEXT:    s_or_b32 s4, s4, s8
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
+; GFX10-NEXT:    s_lshl_b32 s5, s5, s11
 ; GFX10-NEXT:    s_and_b32 s8, s13, s9
 ; GFX10-NEXT:    s_or_b32 s5, s12, s5
 ; GFX10-NEXT:    s_bfe_u32 s8, s8, 0x100000
-; GFX10-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
+; GFX10-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX10-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX10-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX10-NEXT:    s_lshr_b32 s10, s2, 16
 ; GFX10-NEXT:    s_or_b32 s5, s5, s8
 ; GFX10-NEXT:    s_lshr_b32 s8, s2, 8
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX10-NEXT:    s_and_b32 s8, s8, s9
+; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX10-NEXT:    s_and_b32 s12, s2, s9
 ; GFX10-NEXT:    s_lshl_b32 s8, s8, s11
 ; GFX10-NEXT:    s_and_b32 s10, s10, s9
 ; GFX10-NEXT:    s_or_b32 s8, s12, s8
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX10-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s4, v0
 ; GFX10-NEXT:    s_bfe_u32 s4, s8, 0x100000
 ; GFX10-NEXT:    s_bfe_u32 s8, s10, 0x100000
-; GFX10-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX10-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s5, v1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX10-NEXT:    s_lshl_b32 s5, s8, 16
 ; GFX10-NEXT:    s_lshr_b32 s8, s3, 8
 ; GFX10-NEXT:    s_and_b32 s3, s3, s9
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, s11
 ; GFX10-NEXT:    s_or_b32 s4, s4, s5
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
 ; GFX10-NEXT:    s_or_b32 s2, s2, s3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX10-NEXT:    s_and_b32 s3, s8, s9
@@ -1948,7 +1949,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
 ; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 23, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffffff
@@ -1961,22 +1962,21 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v2, v2, s3
 ; GFX10-NEXT:    v_and_b32_e32 v4, v4, v3
-; GFX10-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX10-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX10-NEXT:    s_or_b32 s0, s0, s6
-; GFX10-NEXT:    v_lshrrev_b32_e64 v3, v4, s2
 ; GFX10-NEXT:    s_or_b32 s1, s1, s7
+; GFX10-NEXT:    v_lshrrev_b32_e64 v3, v4, s2
 ; GFX10-NEXT:    v_lshl_or_b32 v0, s0, v0, v2
 ; GFX10-NEXT:    s_mov_b32 s0, 8
 ; GFX10-NEXT:    v_lshl_or_b32 v1, s1, v1, v3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    s_mov_b32 s0, 16
 ; GFX10-NEXT:    v_and_b32_e32 v3, s9, v1
-; GFX10-NEXT:    v_bfe_u32 v4, v1, 8, 8
-; GFX10-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX10-NEXT:    v_and_or_b32 v2, v0, s9, v2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_bfe_u32 v4, v1, 8, 8
+; GFX10-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 8, v4
 ; GFX10-NEXT:    v_or3_b32 v0, v2, v0, v3
@@ -2023,8 +2023,8 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v7, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, 23, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, v4, v9
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v8, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
 ; GFX6-NEXT:    v_and_b32_e32 v4, v5, v9
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v6
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
@@ -2080,8 +2080,8 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_mul_lo_u32 v6, v7, v8
 ; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, 23, v4
 ; GFX8-NEXT:    v_and_b32_e32 v4, v4, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v8, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_and_b32_e32 v4, v5, v9
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v8, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v4, v5
@@ -2594,9 +2594,9 @@ define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_alignbit_b32 v2, v0, v2, 1
-; GFX10-NEXT:    v_alignbit_b32 v3, v1, v3, 1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX10-NEXT:    v_alignbit_b32 v3, v1, v3, 1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v5, -1, v5
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
@@ -2663,12 +2663,12 @@ define <3 x i32> @v_fshl_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_alignbit_b32 v3, v0, v3, 1
-; GFX10-NEXT:    v_alignbit_b32 v4, v1, v4, 1
-; GFX10-NEXT:    v_alignbit_b32 v5, v2, v5, 1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v6
+; GFX10-NEXT:    v_alignbit_b32 v4, v1, v4, 1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v7
+; GFX10-NEXT:    v_alignbit_b32 v5, v2, v5, 1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v8
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v3, v6
@@ -2748,15 +2748,15 @@ define <4 x i32> @v_fshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_alignbit_b32 v4, v0, v4, 1
-; GFX10-NEXT:    v_alignbit_b32 v5, v1, v5, 1
-; GFX10-NEXT:    v_alignbit_b32 v6, v2, v6, 1
-; GFX10-NEXT:    v_alignbit_b32 v7, v3, v7, 1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX10-NEXT:    v_alignbit_b32 v5, v1, v5, 1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v9
+; GFX10-NEXT:    v_alignbit_b32 v6, v2, v6, 1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v10
+; GFX10-NEXT:    v_alignbit_b32 v7, v3, v7, 1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
 ; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v11
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v4, v8
@@ -3090,8 +3090,8 @@ define amdgpu_ps half @v_fshl_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX10-NEXT:    s_bfe_u32 s2, 1, 0x100000
-; GFX10-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX10-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v0, s0
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v1, s1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3118,8 +3118,8 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
 ; GFX8-LABEL: v_fshl_i16_svs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_and_b32 s2, s1, 15
-; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 1, v0
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, s1, v0
@@ -3129,8 +3129,8 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
 ; GFX9-LABEL: v_fshl_i16_svs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s2, s1, 15
-; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX9-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 1, v0
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v0, s1, v0
@@ -3224,9 +3224,9 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s5, 15
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT:    s_andn2_b32 s4, 15, s5
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s3, s6
-; GFX6-NEXT:    s_andn2_b32 s4, 15, s5
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX6-NEXT:    s_bfe_u32 s3, s4, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s3
@@ -3283,8 +3283,8 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX9-NEXT:    s_lshr_b32 s3, s3, 1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX9-NEXT:    s_and_b32 s1, s1, s4
+; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX9-NEXT:    s_and_b32 s2, s2, s4
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX9-NEXT:    s_lshr_b32 s2, s3, s5
@@ -3334,15 +3334,15 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX6-NEXT:    v_bfe_u32 v6, v6, 0, 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
 ; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v6, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v5
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v5
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
+; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
 ; GFX6-NEXT:    v_bfe_u32 v3, v4, 0, 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
@@ -3352,8 +3352,8 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX8-LABEL: v_fshl_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v4, 15, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 1, v1
@@ -3413,8 +3413,8 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 12, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
 ; GFX6-NEXT:    s_bfe_u32 s5, 8, 0x100000
+; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s5, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -3442,8 +3442,8 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, 16
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, 16
-; GFX9-NEXT:    s_mov_b32 s4, 0x4f7ffffe
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX9-NEXT:    s_mov_b32 s4, 0x4f7ffffe
 ; GFX9-NEXT:    v_mul_f32_e32 v2, s4, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v3, s4, v3
@@ -3460,8 +3460,8 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX9-NEXT:    v_sub_u32_e32 v2, 4, v2
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, 16, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 16, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, 16, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 16, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v3, 8, v3
@@ -3537,8 +3537,8 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    s_mov_b32 s0, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX6-NEXT:    s_and_b32 s2, s2, s0
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_lshr_b32_e32 v0, s2, v0
@@ -3562,11 +3562,11 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ; GFX8-LABEL: v_fshl_v2i16_ssv:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    v_lshlrev_b16_e64 v2, v2, s0
-; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    s_bfe_u32 s0, s1, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s1, 1, 0x100000
 ; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
@@ -3635,9 +3635,9 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_and_b32 s0, s3, 15
+; GFX6-NEXT:    s_andn2_b32 s2, 15, s3
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    s_andn2_b32 s2, 15, s3
 ; GFX6-NEXT:    s_lshl_b32 s0, s1, s0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX6-NEXT:    s_bfe_u32 s1, s2, 0x100000
@@ -3654,17 +3654,17 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 ; GFX8-NEXT:    s_and_b32 s4, s1, 15
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v0
 ; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v0
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, s1, v1
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, s1, v1
 ; GFX8-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    s_and_b32 s0, s3, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 1
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s3
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s3
 ; GFX8-NEXT:    s_lshl_b32 s0, s2, s0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, s1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
@@ -3680,9 +3680,9 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 ; GFX9-NEXT:    s_andn2_b32 s1, s2, s1
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
-; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, s4
+; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, s1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
@@ -3722,9 +3722,9 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_and_b32 s0, s3, 15
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT:    s_andn2_b32 s2, 15, s3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    s_and_b32 s0, s1, s4
-; GFX6-NEXT:    s_andn2_b32 s2, 15, s3
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
 ; GFX6-NEXT:    s_bfe_u32 s1, s2, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
@@ -3747,9 +3747,9 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s4
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s3
 ; GFX8-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    s_and_b32 s0, s3, 15
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_lshr_b32 s0, s2, s4
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
@@ -3774,8 +3774,8 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX9-NEXT:    s_and_b32 s0, s0, s3
+; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX9-NEXT:    s_and_b32 s1, s1, s3
 ; GFX9-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX9-NEXT:    s_lshr_b32 s1, s2, s4
@@ -3838,9 +3838,9 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-NEXT:    s_or_b32 s0, s0, s4
 ; GFX6-NEXT:    s_and_b32 s4, s9, 15
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT:    s_andn2_b32 s8, 15, s9
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s4
 ; GFX6-NEXT:    s_and_b32 s4, s5, s12
-; GFX6-NEXT:    s_andn2_b32 s8, 15, s9
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
 ; GFX6-NEXT:    s_bfe_u32 s5, s8, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
@@ -3861,8 +3861,8 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-NEXT:    s_and_b32 s4, s7, s12
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
 ; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
-; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
@@ -3898,14 +3898,14 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_and_b32 s4, s5, 15
 ; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
-; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
 ; GFX8-NEXT:    s_andn2_b32 s5, 15, s5
 ; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
-; GFX8-NEXT:    s_bfe_u32 s4, s5, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, s12
+; GFX8-NEXT:    s_bfe_u32 s4, s5, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, s4
 ; GFX8-NEXT:    s_or_b32 s1, s1, s3
 ; GFX8-NEXT:    s_and_b32 s3, s11, 15
@@ -3913,9 +3913,9 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s5, s9, s12
 ; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
-; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s3, s7, s3
 ; GFX8-NEXT:    s_lshr_b32 s4, s5, s4
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX8-NEXT:    s_or_b32 s3, s3, s4
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
@@ -3940,11 +3940,11 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX9-NEXT:    s_and_b32 s2, s2, s8
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX9-NEXT:    s_lshr_b32 s7, s7, 1
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s7
 ; GFX9-NEXT:    s_andn2_b32 s4, s6, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s7
 ; GFX9-NEXT:    s_lshr_b32 s7, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s9, s4, 16
 ; GFX9-NEXT:    s_and_b32 s2, s2, s8
+; GFX9-NEXT:    s_lshr_b32 s9, s4, 16
 ; GFX9-NEXT:    s_and_b32 s4, s4, s8
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s7, s9
@@ -3963,8 +3963,8 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
 ; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
 ; GFX9-NEXT:    s_and_b32 s2, s2, s8
+; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
 ; GFX9-NEXT:    s_and_b32 s4, s4, s8
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX9-NEXT:    s_lshr_b32 s3, s3, s5
@@ -3981,15 +3981,15 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX10-NEXT:    s_and_b32 s7, s4, s6
 ; GFX10-NEXT:    s_lshr_b32 s11, s11, 1
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
-; GFX10-NEXT:    s_lshr_b32 s8, s0, 16
 ; GFX10-NEXT:    s_andn2_b32 s4, s6, s4
+; GFX10-NEXT:    s_lshr_b32 s8, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s10, s7, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s11, s2
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s7
 ; GFX10-NEXT:    s_lshl_b32 s7, s8, s10
 ; GFX10-NEXT:    s_lshr_b32 s8, s2, 16
-; GFX10-NEXT:    s_and_b32 s10, s4, s9
 ; GFX10-NEXT:    s_and_b32 s2, s2, s9
+; GFX10-NEXT:    s_and_b32 s10, s4, s9
 ; GFX10-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, s10
 ; GFX10-NEXT:    s_lshr_b32 s4, s8, s4
@@ -4007,8 +4007,8 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, s4
 ; GFX10-NEXT:    s_lshl_b32 s4, s6, s7
 ; GFX10-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX10-NEXT:    s_and_b32 s7, s5, s9
 ; GFX10-NEXT:    s_and_b32 s3, s3, s9
+; GFX10-NEXT:    s_and_b32 s7, s5, s9
 ; GFX10-NEXT:    s_lshr_b32 s5, s5, 16
 ; GFX10-NEXT:    s_lshr_b32 s3, s3, s7
 ; GFX10-NEXT:    s_lshr_b32 s5, s6, s5
@@ -4040,17 +4040,17 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v9
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v9
 ; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
+; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_and_b32_e32 v4, s4, v5
-; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_bfe_u32 v5, v8, 0, 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v10
+; GFX6-NEXT:    v_mov_b32_e32 v12, 0xffff
 ; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v10
 ; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
-; GFX6-NEXT:    v_mov_b32_e32 v12, 0xffff
 ; GFX6-NEXT:    v_and_b32_e32 v5, 15, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_and_b32_e32 v4, v6, v12
@@ -4073,8 +4073,8 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX8-LABEL: v_fshl_v4i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v8, 15, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v8, 15, v4
 ; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v9, 1, v2
@@ -4088,14 +4088,14 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX8-NEXT:    v_and_b32_e32 v6, 15, v6
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v6, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v5
 ; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 1, v3
-; GFX8-NEXT:    v_lshrrev_b16_e32 v5, v5, v6
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, v2, v1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v5, v5, v6
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX8-NEXT:    v_and_b32_e32 v5, 15, v7
 ; GFX8-NEXT:    v_xor_b32_e32 v6, -1, v7
@@ -4120,13 +4120,13 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX9-NEXT:    v_and_b32_e32 v4, s4, v4
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v4, v2
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v6, v0
-; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v5
+; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v4, v2
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, s4, v5
-; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v2, v1
+; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v5
 ; GFX9-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v2, v1
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, 1, v3 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v4, v2
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -4174,8 +4174,8 @@ define amdgpu_ps i64 @s_fshl_i64(i64 inreg %lhs, i64 inreg %rhs, i64 inreg %amt)
 define amdgpu_ps i64 @s_fshl_i64_5(i64 inreg %lhs, i64 inreg %rhs) {
 ; GCN-LABEL: s_fshl_i64_5:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_lshr_b32 s2, s3, 27
 ; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], 5
+; GCN-NEXT:    s_lshr_b32 s2, s3, 27
 ; GCN-NEXT:    s_mov_b32 s3, 0
 ; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-NEXT:    ; return to shader part epilog
@@ -4483,8 +4483,8 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[2:3], 63
 ; GFX6-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
-; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], s4
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_or_b32_e32 v1, s1, v1
@@ -4494,8 +4494,8 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_and_b64 s[4:5], s[2:3], 63
 ; GFX8-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
-; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, s1, v1
@@ -4505,8 +4505,8 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[2:3], 63
 ; GFX9-NEXT:    s_andn2_b64 s[2:3], 63, s[2:3]
-; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_or_b32_e32 v1, s1, v1
@@ -4533,12 +4533,12 @@ define amdgpu_ps <2 x i64> @s_fshl_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %
 ; GFX6-NEXT:    s_and_b64 s[12:13], s[8:9], 63
 ; GFX6-NEXT:    s_andn2_b64 s[8:9], 63, s[8:9]
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], 1
-; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s12
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[10:11], 63
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
 ; GFX6-NEXT:    s_andn2_b64 s[8:9], 63, s[10:11]
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
 ; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
@@ -4549,12 +4549,12 @@ define amdgpu_ps <2 x i64> @s_fshl_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %
 ; GFX8-NEXT:    s_and_b64 s[12:13], s[8:9], 63
 ; GFX8-NEXT:    s_andn2_b64 s[8:9], 63, s[8:9]
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[4:5], 1
-; GFX8-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s12
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX8-NEXT:    s_and_b64 s[4:5], s[10:11], 63
-; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
 ; GFX8-NEXT:    s_andn2_b64 s[8:9], 63, s[10:11]
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
 ; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
@@ -4565,12 +4565,12 @@ define amdgpu_ps <2 x i64> @s_fshl_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %
 ; GFX9-NEXT:    s_and_b64 s[12:13], s[8:9], 63
 ; GFX9-NEXT:    s_andn2_b64 s[8:9], 63, s[8:9]
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], 1
-; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s12
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[10:11], 63
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
 ; GFX9-NEXT:    s_andn2_b64 s[8:9], 63, s[10:11]
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
 ; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
@@ -4603,8 +4603,8 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[4:5], 1
 ; GFX6-NEXT:    v_and_b32_e32 v8, 63, v8
-; GFX6-NEXT:    v_lshr_b64 v[4:5], v[4:5], v8
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v9
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[4:5], v8
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v10
 ; GFX6-NEXT:    v_lshr_b64 v[6:7], v[6:7], 1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v4
@@ -4624,8 +4624,8 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v8
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
 ; GFX8-NEXT:    v_and_b32_e32 v8, 63, v8
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
 ; GFX8-NEXT:    v_xor_b32_e32 v8, -1, v10
 ; GFX8-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
@@ -4645,8 +4645,8 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v8
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
 ; GFX9-NEXT:    v_and_b32_e32 v8, 63, v8
-; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
 ; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v10
 ; GFX9-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v4
@@ -4718,8 +4718,8 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX6-NEXT:    s_lshr_b64 s[6:7], s[4:5], s8
-; GFX6-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
 ; GFX6-NEXT:    s_lshr_b64 s[8:9], s[0:1], s8
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
 ; GFX6-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
 ; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
@@ -4765,8 +4765,8 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX8-NEXT:    s_lshr_b64 s[6:7], s[4:5], s8
-; GFX8-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
 ; GFX8-NEXT:    s_lshr_b64 s[8:9], s[0:1], s8
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
 ; GFX8-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
 ; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
@@ -4812,8 +4812,8 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX9-NEXT:    s_lshr_b64 s[6:7], s[4:5], s8
-; GFX9-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
 ; GFX9-NEXT:    s_lshr_b64 s[8:9], s[0:1], s8
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
 ; GFX9-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
 ; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
@@ -4837,8 +4837,8 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s12, 0
 ; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX10-NEXT:    s_lshl_b64 s[16:17], s[2:3], s12
 ; GFX10-NEXT:    s_lshr_b64 s[14:15], s[0:1], s10
+; GFX10-NEXT:    s_lshl_b64 s[16:17], s[2:3], s12
 ; GFX10-NEXT:    s_lshl_b64 s[12:13], s[0:1], s12
 ; GFX10-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
@@ -4891,8 +4891,8 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-NEXT:    v_lshl_b64 v[12:13], v[0:1], v14
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v16
 ; GFX6-NEXT:    v_or_b32_e32 v8, v8, v10
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
 ; GFX6-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
 ; GFX6-NEXT:    v_cndmask_b32_e32 v10, 0, v12, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v11, 0, v13, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
@@ -4911,11 +4911,11 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], v15
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], v14
 ; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
 ; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
@@ -4940,8 +4940,8 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX8-NEXT:    v_lshlrev_b64 v[12:13], v14, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v16, v[0:1]
 ; GFX8-NEXT:    v_or_b32_e32 v8, v8, v10
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
 ; GFX8-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, 0, v12, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, 0, v13, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
@@ -4960,11 +4960,11 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v15, v[2:3]
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v14, v[2:3]
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
 ; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
@@ -4989,8 +4989,8 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX9-NEXT:    v_lshlrev_b64 v[12:13], v14, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v16, v[0:1]
 ; GFX9-NEXT:    v_or_b32_e32 v8, v8, v10
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
 ; GFX9-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v12, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, 0, v13, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
@@ -5009,11 +5009,11 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v15, v[2:3]
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v14, v[2:3]
 ; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
 ; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
@@ -5046,28 +5046,28 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v20, v[0:1]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v18
 ; GFX10-NEXT:    v_or_b32_e32 v10, v8, v10
-; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v16, v[6:7]
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v19
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v16, v[6:7]
 ; GFX10-NEXT:    v_or_b32_e32 v11, v9, v11
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v19
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v0, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v18
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v8, v[6:7]
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v19
 ; GFX10-NEXT:    v_or_b32_e32 v14, v14, v16
 ; GFX10-NEXT:    v_or_b32_e32 v15, v15, v17
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v1, v11, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v19, v[6:7]
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v19
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v18
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v14, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v15, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v12, 0, v12, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v13, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, v1, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s6
 ; GFX10-NEXT:    v_or_b32_e32 v0, v12, v4
 ; GFX10-NEXT:    v_or_b32_e32 v1, v7, v5
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
@@ -5093,29 +5093,29 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], s[0:1], v8
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    s_mov_b32 s8, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s2
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX6-NEXT:    s_lshl_b32 s9, s6, 31
-; GFX6-NEXT:    s_lshr_b64 s[2:3], s[6:7], 1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v0, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v7
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[6:7], 1
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v7
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], s[0:1], v7
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], s[2:3], v2
 ; GFX6-NEXT:    v_subrev_i32_e32 v11, vcc, 64, v7
 ; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], s[2:3], v11
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], s[2:3], v7
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
@@ -5146,29 +5146,29 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, s[0:1]
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    s_mov_b32 s8, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX8-NEXT:    s_lshl_b32 s9, s6, 31
-; GFX8-NEXT:    s_lshr_b64 s[2:3], s[6:7], 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v7
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX8-NEXT:    s_lshr_b64 s[2:3], s[6:7], 1
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v7
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v7, s[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, s[2:3]
 ; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, 64, v7
 ; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v11, s[2:3]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v7, s[2:3]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
@@ -5199,19 +5199,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, s[0:1]
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s8, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX9-NEXT:    s_lshl_b32 s9, s6, 31
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v2, vcc
-; GFX9-NEXT:    s_lshr_b64 s[2:3], s[6:7], 1
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[6:7], 1
 ; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v7, s[0:1]
@@ -5220,8 +5220,8 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v11, s[2:3]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v7, s[2:3]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
@@ -5315,21 +5315,21 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 31, v2
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT:    s_sub_i32 s3, 64, s4
 ; GFX6-NEXT:    s_sub_i32 s2, s4, 64
+; GFX6-NEXT:    s_sub_i32 s3, 64, s4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s4
 ; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s3
-; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], s4
 ; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], s4
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], s2
 ; GFX6-NEXT:    s_and_b32 s2, 1, s5
-; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX6-NEXT:    s_and_b32 s2, 1, s8
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
@@ -5370,21 +5370,21 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 31, v2
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT:    s_sub_i32 s3, 64, s4
 ; GFX8-NEXT:    s_sub_i32 s2, s4, 64
+; GFX8-NEXT:    s_sub_i32 s3, 64, s4
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s3, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s4, v[2:3]
 ; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s4, v[2:3]
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], s2, v[2:3]
 ; GFX8-NEXT:    s_and_b32 s2, 1, s5
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX8-NEXT:    s_and_b32 s2, 1, s8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
@@ -5425,21 +5425,21 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 31, v2
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT:    s_sub_i32 s3, 64, s4
 ; GFX9-NEXT:    s_sub_i32 s2, s4, 64
+; GFX9-NEXT:    s_sub_i32 s3, 64, s4
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s3, v[2:3]
-; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s4, v[2:3]
 ; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s4, v[2:3]
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], s2, v[2:3]
 ; GFX9-NEXT:    s_and_b32 s2, 1, s5
-; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX9-NEXT:    s_and_b32 s2, 1, s8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
@@ -5470,8 +5470,8 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
 ; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX10-NEXT:    s_lshl_b64 s[10:11], s[2:3], s8
 ; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[2:3], s8
 ; GFX10-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
 ; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
@@ -5519,8 +5519,8 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0x7f
 ; GFX6-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
 ; GFX6-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
-; GFX6-NEXT:    s_sub_i32 s6, 64, s8
 ; GFX6-NEXT:    s_sub_i32 s5, s8, 64
+; GFX6-NEXT:    s_sub_i32 s6, 64, s8
 ; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
@@ -5530,8 +5530,8 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX6-NEXT:    v_lshl_b64 v[8:9], v[0:1], s8
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], s5
 ; GFX6-NEXT:    s_and_b32 s5, 1, s9
-; GFX6-NEXT:    s_lshl_b32 s9, s2, 31
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
+; GFX6-NEXT:    s_lshl_b32 s9, s2, 31
 ; GFX6-NEXT:    s_mov_b32 s8, s7
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX6-NEXT:    s_and_b32 s5, 1, s10
@@ -5544,15 +5544,15 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX6-NEXT:    s_lshr_b64 s[6:7], s[2:3], s4
-; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[0:1], s4
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX6-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
 ; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
@@ -5574,8 +5574,8 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0x7f
 ; GFX8-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
-; GFX8-NEXT:    s_sub_i32 s6, 64, s8
 ; GFX8-NEXT:    s_sub_i32 s5, s8, 64
+; GFX8-NEXT:    s_sub_i32 s6, 64, s8
 ; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
@@ -5585,8 +5585,8 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX8-NEXT:    v_lshlrev_b64 v[8:9], s8, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
 ; GFX8-NEXT:    s_and_b32 s5, 1, s9
-; GFX8-NEXT:    s_lshl_b32 s9, s2, 31
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
+; GFX8-NEXT:    s_lshl_b32 s9, s2, 31
 ; GFX8-NEXT:    s_mov_b32 s8, s7
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX8-NEXT:    s_and_b32 s5, 1, s10
@@ -5599,15 +5599,15 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX8-NEXT:    s_lshr_b64 s[6:7], s[2:3], s4
-; GFX8-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[0:1], s4
+; GFX8-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX8-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
 ; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
@@ -5629,8 +5629,8 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX9-NEXT:    s_mov_b64 s[6:7], 0x7f
 ; GFX9-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
 ; GFX9-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
-; GFX9-NEXT:    s_sub_i32 s6, 64, s8
 ; GFX9-NEXT:    s_sub_i32 s5, s8, 64
+; GFX9-NEXT:    s_sub_i32 s6, 64, s8
 ; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
@@ -5640,8 +5640,8 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], s8, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
 ; GFX9-NEXT:    s_and_b32 s5, 1, s9
-; GFX9-NEXT:    s_lshl_b32 s9, s2, 31
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
+; GFX9-NEXT:    s_lshl_b32 s9, s2, 31
 ; GFX9-NEXT:    s_mov_b32 s8, s7
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX9-NEXT:    s_and_b32 s5, 1, s10
@@ -5654,15 +5654,15 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX9-NEXT:    s_lshr_b64 s[6:7], s[2:3], s4
-; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[0:1], s4
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
 ; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
@@ -5686,20 +5686,20 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10-NEXT:    s_andn2_b64 s[10:11], s[6:7], s[4:5]
 ; GFX10-NEXT:    s_sub_i32 s4, 64, s8
 ; GFX10-NEXT:    s_sub_i32 s5, s8, 64
-; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s8, v[2:3]
+; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s8, v[0:1]
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s8, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
 ; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX10-NEXT:    s_and_b32 s4, 1, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
 ; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s4
-; GFX10-NEXT:    s_lshl_b32 s5, s2, 31
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
+; GFX10-NEXT:    s_lshl_b32 s5, s2, 31
 ; GFX10-NEXT:    s_and_b32 s6, 1, s6
 ; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s4
@@ -5869,8 +5869,8 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_cmp_eq_u32 s16, 0
 ; GFX6-NEXT:    s_cselect_b32 s28, 1, 0
 ; GFX6-NEXT:    s_lshr_b64 s[10:11], s[8:9], s16
-; GFX6-NEXT:    s_lshl_b64 s[22:23], s[8:9], s22
 ; GFX6-NEXT:    s_lshr_b64 s[16:17], s[0:1], s16
+; GFX6-NEXT:    s_lshl_b64 s[22:23], s[8:9], s22
 ; GFX6-NEXT:    s_or_b64 s[16:17], s[16:17], s[22:23]
 ; GFX6-NEXT:    s_lshr_b64 s[8:9], s[8:9], s26
 ; GFX6-NEXT:    s_cmp_lg_u32 s27, 0
@@ -5895,8 +5895,8 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_or_b64 s[8:9], s[20:21], s[8:9]
 ; GFX6-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
 ; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX6-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX6-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX6-NEXT:    s_cmp_lg_u32 s22, 0
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
@@ -5911,8 +5911,8 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX6-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX6-NEXT:    s_lshr_b64 s[12:13], s[8:9], s10
-; GFX6-NEXT:    s_lshl_b64 s[14:15], s[8:9], s14
 ; GFX6-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
+; GFX6-NEXT:    s_lshl_b64 s[14:15], s[8:9], s14
 ; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
 ; GFX6-NEXT:    s_lshr_b64 s[8:9], s[8:9], s18
 ; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
@@ -5958,8 +5958,8 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_cmp_eq_u32 s16, 0
 ; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
 ; GFX8-NEXT:    s_lshr_b64 s[10:11], s[8:9], s16
-; GFX8-NEXT:    s_lshl_b64 s[22:23], s[8:9], s22
 ; GFX8-NEXT:    s_lshr_b64 s[16:17], s[0:1], s16
+; GFX8-NEXT:    s_lshl_b64 s[22:23], s[8:9], s22
 ; GFX8-NEXT:    s_or_b64 s[16:17], s[16:17], s[22:23]
 ; GFX8-NEXT:    s_lshr_b64 s[8:9], s[8:9], s26
 ; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
@@ -5984,8 +5984,8 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_or_b64 s[8:9], s[20:21], s[8:9]
 ; GFX8-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
 ; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX8-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX8-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX8-NEXT:    s_cmp_lg_u32 s22, 0
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
@@ -6000,8 +6000,8 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX8-NEXT:    s_lshr_b64 s[12:13], s[8:9], s10
-; GFX8-NEXT:    s_lshl_b64 s[14:15], s[8:9], s14
 ; GFX8-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
+; GFX8-NEXT:    s_lshl_b64 s[14:15], s[8:9], s14
 ; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
 ; GFX8-NEXT:    s_lshr_b64 s[8:9], s[8:9], s18
 ; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
@@ -6047,8 +6047,8 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_cmp_eq_u32 s16, 0
 ; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
 ; GFX9-NEXT:    s_lshr_b64 s[10:11], s[8:9], s16
-; GFX9-NEXT:    s_lshl_b64 s[22:23], s[8:9], s22
 ; GFX9-NEXT:    s_lshr_b64 s[16:17], s[0:1], s16
+; GFX9-NEXT:    s_lshl_b64 s[22:23], s[8:9], s22
 ; GFX9-NEXT:    s_or_b64 s[16:17], s[16:17], s[22:23]
 ; GFX9-NEXT:    s_lshr_b64 s[8:9], s[8:9], s26
 ; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
@@ -6073,8 +6073,8 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_or_b64 s[8:9], s[20:21], s[8:9]
 ; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
 ; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX9-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX9-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
 ; GFX9-NEXT:    s_cmp_lg_u32 s22, 0
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
@@ -6089,8 +6089,8 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX9-NEXT:    s_lshr_b64 s[12:13], s[8:9], s10
-; GFX9-NEXT:    s_lshl_b64 s[14:15], s[8:9], s14
 ; GFX9-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
+; GFX9-NEXT:    s_lshl_b64 s[14:15], s[8:9], s14
 ; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
 ; GFX9-NEXT:    s_lshr_b64 s[8:9], s[8:9], s18
 ; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
@@ -6204,11 +6204,11 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, 64, v23
 ; GFX6-NEXT:    v_lshr_b64 v[17:18], v[0:1], v17
 ; GFX6-NEXT:    v_lshl_b64 v[21:22], v[2:3], v23
-; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v16
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[8:9], 1
-; GFX6-NEXT:    v_and_b32_e32 v24, s6, v16
+; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v16
 ; GFX6-NEXT:    v_or_b32_e32 v21, v17, v21
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v17, 31, v10
+; GFX6-NEXT:    v_and_b32_e32 v24, s6, v16
 ; GFX6-NEXT:    v_lshr_b64 v[10:11], v[10:11], 1
 ; GFX6-NEXT:    v_or_b32_e32 v9, v9, v17
 ; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 64, v24
@@ -6233,8 +6233,8 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v18, 0, v1, vcc
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[10:11], v24
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s[4:5]
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[4:5]
@@ -6276,8 +6276,8 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_lshr_b64 v[6:7], v[6:7], v12
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v17
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v17
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
@@ -6296,11 +6296,11 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, 64, v23
 ; GFX8-NEXT:    v_lshrrev_b64 v[17:18], v17, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[21:22], v23, v[2:3]
-; GFX8-NEXT:    v_xor_b32_e32 v16, -1, v16
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], 1, v[8:9]
-; GFX8-NEXT:    v_and_b32_e32 v24, s6, v16
+; GFX8-NEXT:    v_xor_b32_e32 v16, -1, v16
 ; GFX8-NEXT:    v_or_b32_e32 v21, v17, v21
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 31, v10
+; GFX8-NEXT:    v_and_b32_e32 v24, s6, v16
 ; GFX8-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
 ; GFX8-NEXT:    v_or_b32_e32 v9, v9, v17
 ; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, 64, v24
@@ -6325,8 +6325,8 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v18, 0, v1, vcc
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v24, v[10:11]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s[4:5]
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[4:5]
@@ -6368,8 +6368,8 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v12, v[6:7]
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v17
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v17
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
@@ -6388,11 +6388,11 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_sub_u32_e32 v17, 64, v23
 ; GFX9-NEXT:    v_lshrrev_b64 v[17:18], v17, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[21:22], v23, v[2:3]
-; GFX9-NEXT:    v_xor_b32_e32 v16, -1, v16
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], 1, v[8:9]
-; GFX9-NEXT:    v_and_b32_e32 v24, s6, v16
+; GFX9-NEXT:    v_xor_b32_e32 v16, -1, v16
 ; GFX9-NEXT:    v_or_b32_e32 v21, v17, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 31, v10
+; GFX9-NEXT:    v_and_b32_e32 v24, s6, v16
 ; GFX9-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
 ; GFX9-NEXT:    v_or_b32_e32 v9, v9, v17
 ; GFX9-NEXT:    v_sub_u32_e32 v16, 64, v24
@@ -6417,8 +6417,8 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v18, 0, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v24, v[10:11]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s[4:5]
@@ -6439,9 +6439,9 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], v16, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v18, v[4:5]
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v18, 0, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v19, 0, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v16, v4, v6, vcc
@@ -6460,8 +6460,8 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_lshrrev_b64 v[6:7], v12, v[6:7]
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v17
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v17
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
@@ -6494,13 +6494,13 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v29, v[0:1]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v27
 ; GFX10-NEXT:    v_or_b32_e32 v18, v16, v18
-; GFX10-NEXT:    v_lshlrev_b64 v[25:26], v25, v[10:11]
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v16, 64, v28
+; GFX10-NEXT:    v_lshlrev_b64 v[25:26], v25, v[10:11]
 ; GFX10-NEXT:    v_or_b32_e32 v19, v17, v19
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v28
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v28
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v27
 ; GFX10-NEXT:    v_lshrrev_b64 v[16:17], v16, v[10:11]
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v27
 ; GFX10-NEXT:    v_or_b32_e32 v23, v23, v25
 ; GFX10-NEXT:    v_or_b32_e32 v24, v24, v26
 ; GFX10-NEXT:    v_cndmask_b32_e32 v19, v1, v19, vcc_lo
@@ -6508,14 +6508,14 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v28, v[10:11]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, v16, v23, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, v17, v24, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v22, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v22, v19, v3, s6
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v16, v8, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v10, v9, s5
-; GFX10-NEXT:    v_and_b32_e32 v23, s7, v20
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v10, v9, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, v0, s4
+; GFX10-NEXT:    v_and_b32_e32 v23, s7, v20
 ; GFX10-NEXT:    v_or_b32_e32 v0, v21, v3
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v20
 ; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0, v1, s4
@@ -6523,13 +6523,13 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v10, 64, v23
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v9
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], 1, v[12:13]
-; GFX10-NEXT:    v_and_b32_e32 v25, s7, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 31, v14
+; GFX10-NEXT:    v_and_b32_e32 v25, s7, v3
 ; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v10, v[4:5]
 ; GFX10-NEXT:    v_lshlrev_b64 v[12:13], v23, v[6:7]
 ; GFX10-NEXT:    v_lshrrev_b64 v[14:15], 1, v[14:15]
-; GFX10-NEXT:    v_sub_nc_u32_e32 v20, 64, v25
 ; GFX10-NEXT:    v_or_b32_e32 v9, v9, v16
+; GFX10-NEXT:    v_sub_nc_u32_e32 v20, 64, v25
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 64, v23
 ; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v23, v[4:5]
 ; GFX10-NEXT:    v_or_b32_e32 v12, v10, v12
@@ -6547,22 +6547,22 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_cndmask_b32_e32 v12, v3, v12, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b64 v[3:4], v25, v[14:15]
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v23
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v16, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v25
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v23
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v18, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0, v17, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v12, v6, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v11, v9, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s4
 ; GFX10-NEXT:    v_or_b32_e32 v3, v22, v24
 ; GFX10-NEXT:    v_or_b32_e32 v4, v13, v5
-; GFX10-NEXT:    v_or_b32_e32 v7, v7, v10
 ; GFX10-NEXT:    v_or_b32_e32 v5, v14, v8
 ; GFX10-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX10-NEXT:    v_or_b32_e32 v7, v7, v10
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
   ret <2 x i128> %result

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 8ba814a5df078..676cf95d7e11e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -265,9 +265,9 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
 ; GFX6-LABEL: s_fshr_i8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_and_b32 s3, s2, 7
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX6-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, s3
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -277,9 +277,9 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX8-NEXT:    s_and_b32 s3, s2, 7
-; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s3
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
@@ -289,9 +289,9 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    s_and_b32 s3, s2, 7
-; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX9-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, s3
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
@@ -301,11 +301,11 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX10-NEXT:    s_and_b32 s3, s2, 7
-; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX10-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s3
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 %amt)
@@ -318,9 +318,9 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v3, 7, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -356,8 +356,8 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 7, v3
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v3, v0
@@ -534,16 +534,16 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX6-NEXT:    s_lshr_b32 s4, s2, 8
 ; GFX6-NEXT:    s_and_b32 s5, s2, 7
 ; GFX6-NEXT:    s_andn2_b32 s2, 7, s2
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX6-NEXT:    s_movk_i32 s6, 0xff
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s1, s6
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s5
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, 7
-; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x80008
 ; GFX6-NEXT:    s_andn2_b32 s4, 7, s4
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x80008
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, s4
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX6-NEXT:    s_or_b32 s1, s3, s1
@@ -565,13 +565,13 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX8-NEXT:    s_lshr_b32 s4, s1, 8
 ; GFX8-NEXT:    s_and_b32 s1, s1, s2
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
-; GFX8-NEXT:    s_and_b32 s4, s4, s2
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s6
+; GFX8-NEXT:    s_and_b32 s4, s4, s2
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s5, 7
-; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX8-NEXT:    s_andn2_b32 s5, 7, s5
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, s5
 ; GFX8-NEXT:    s_lshr_b32 s1, s4, s1
 ; GFX8-NEXT:    s_or_b32 s1, s3, s1
@@ -594,13 +594,13 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX9-NEXT:    s_lshr_b32 s4, s1, 8
 ; GFX9-NEXT:    s_and_b32 s1, s1, s2
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
-; GFX9-NEXT:    s_and_b32 s4, s4, s2
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, s6
+; GFX9-NEXT:    s_and_b32 s4, s4, s2
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_and_b32 s1, s5, 7
-; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX9-NEXT:    s_andn2_b32 s5, 7, s5
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, s5
 ; GFX9-NEXT:    s_lshr_b32 s1, s4, s1
 ; GFX9-NEXT:    s_or_b32 s1, s3, s1
@@ -616,17 +616,17 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX10-NEXT:    s_lshr_b32 s4, s1, 8
 ; GFX10-NEXT:    s_movk_i32 s7, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
-; GFX10-NEXT:    s_and_b32 s4, s4, s7
 ; GFX10-NEXT:    s_lshr_b32 s5, s2, 8
 ; GFX10-NEXT:    s_and_b32 s6, s2, 7
 ; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX10-NEXT:    s_and_b32 s4, s4, s7
 ; GFX10-NEXT:    s_and_b32 s1, s1, s7
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT:    s_and_b32 s2, s5, 7
-; GFX10-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX10-NEXT:    s_andn2_b32 s5, 7, s5
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX10-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, s5
 ; GFX10-NEXT:    s_lshr_b32 s2, s4, s2
@@ -656,17 +656,17 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    s_movk_i32 s4, 0xff
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, s4, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 7, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
-; GFX6-NEXT:    v_bfe_u32 v1, v1, 8, 8
 ; GFX6-NEXT:    v_and_b32_e32 v4, 7, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 8, 8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v3, v1
@@ -687,8 +687,8 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
-; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v5
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v5
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, 7, v5
 ; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
@@ -712,8 +712,8 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
-; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v5
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v5
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 7, v5
 ; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
@@ -738,11 +738,11 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v3
 ; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX10-NEXT:    v_and_b32_e32 v3, 7, v3
-; GFX10-NEXT:    v_and_b32_e32 v5, s4, v5
 ; GFX10-NEXT:    v_lshlrev_b16 v4, 1, v4
+; GFX10-NEXT:    v_and_b32_e32 v5, s4, v5
 ; GFX10-NEXT:    v_and_b32_e32 v6, 7, v6
-; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX10-NEXT:    v_lshrrev_b16 v3, v3, v5
 ; GFX10-NEXT:    v_lshlrev_b16 v4, v6, v4
@@ -772,8 +772,8 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX6-NEXT:    s_lshr_b32 s9, s2, 24
 ; GFX6-NEXT:    s_and_b32 s10, s2, 7
 ; GFX6-NEXT:    s_andn2_b32 s2, 7, s2
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX6-NEXT:    s_movk_i32 s11, 0xff
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s1, s11
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s10
@@ -784,24 +784,24 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, s7
 ; GFX6-NEXT:    s_bfe_u32 s7, s1, 0x80008
 ; GFX6-NEXT:    s_lshr_b32 s2, s7, s2
-; GFX6-NEXT:    s_or_b32 s2, s3, s2
 ; GFX6-NEXT:    s_lshr_b32 s6, s1, 24
+; GFX6-NEXT:    s_or_b32 s2, s3, s2
 ; GFX6-NEXT:    s_and_b32 s3, s8, 7
-; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX6-NEXT:    s_andn2_b32 s7, 7, s8
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 1
-; GFX6-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, s7
+; GFX6-NEXT:    s_lshr_b32 s1, s1, s3
 ; GFX6-NEXT:    s_or_b32 s1, s4, s1
 ; GFX6-NEXT:    s_and_b32 s3, s9, 7
-; GFX6-NEXT:    s_and_b32 s2, s2, s11
 ; GFX6-NEXT:    s_andn2_b32 s4, 7, s9
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 1
-; GFX6-NEXT:    s_and_b32 s1, s1, s11
+; GFX6-NEXT:    s_and_b32 s2, s2, s11
 ; GFX6-NEXT:    s_lshl_b32 s4, s5, s4
 ; GFX6-NEXT:    s_lshr_b32 s3, s6, s3
 ; GFX6-NEXT:    s_and_b32 s0, s0, s11
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX6-NEXT:    s_and_b32 s1, s1, s11
 ; GFX6-NEXT:    s_or_b32 s3, s4, s3
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
@@ -820,15 +820,15 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX8-NEXT:    s_lshr_b32 s6, s1, 8
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s8, s1, 24
-; GFX8-NEXT:    s_and_b32 s1, s1, s13
 ; GFX8-NEXT:    s_lshr_b32 s9, s2, 8
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
 ; GFX8-NEXT:    s_lshr_b32 s11, s2, 24
 ; GFX8-NEXT:    s_and_b32 s12, s2, 7
 ; GFX8-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT:    s_and_b32 s1, s1, s13
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_andn2_b32 s2, 7, s9
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s12
@@ -853,12 +853,12 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 1
 ; GFX8-NEXT:    s_and_b32 s0, s0, s13
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s2, s13
 ; GFX8-NEXT:    s_lshl_b32 s4, s5, s4
 ; GFX8-NEXT:    s_lshr_b32 s3, s8, s3
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, s2, s13
 ; GFX8-NEXT:    s_or_b32 s3, s4, s3
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s3, s13
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
@@ -874,15 +874,15 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX9-NEXT:    s_lshr_b32 s6, s1, 8
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 24
-; GFX9-NEXT:    s_and_b32 s1, s1, s13
 ; GFX9-NEXT:    s_lshr_b32 s9, s2, 8
 ; GFX9-NEXT:    s_lshr_b32 s10, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s11, s2, 24
 ; GFX9-NEXT:    s_and_b32 s12, s2, 7
 ; GFX9-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT:    s_and_b32 s1, s1, s13
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX9-NEXT:    s_andn2_b32 s2, 7, s9
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, s12
@@ -907,12 +907,12 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 1
 ; GFX9-NEXT:    s_and_b32 s0, s0, s13
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s1, s2, s13
 ; GFX9-NEXT:    s_lshl_b32 s4, s5, s4
 ; GFX9-NEXT:    s_lshr_b32 s3, s8, s3
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s1, s2, s13
 ; GFX9-NEXT:    s_or_b32 s3, s4, s3
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_and_b32 s1, s3, s13
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
@@ -926,44 +926,44 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX10-NEXT:    s_and_b32 s6, s6, s13
 ; GFX10-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s8, s1, 24
-; GFX10-NEXT:    s_and_b32 s1, s1, s13
 ; GFX10-NEXT:    s_lshr_b32 s9, s2, 8
 ; GFX10-NEXT:    s_lshr_b32 s10, s2, 16
 ; GFX10-NEXT:    s_lshr_b32 s11, s2, 24
 ; GFX10-NEXT:    s_and_b32 s12, s2, 7
 ; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX10-NEXT:    s_and_b32 s1, s1, s13
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX10-NEXT:    s_and_b32 s6, s6, s13
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT:    s_and_b32 s2, s9, 7
-; GFX10-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX10-NEXT:    s_andn2_b32 s9, 7, s9
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX10-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, s12
+; GFX10-NEXT:    s_lshl_b32 s3, s3, s9
 ; GFX10-NEXT:    s_lshr_b32 s2, s6, s2
 ; GFX10-NEXT:    s_and_b32 s6, s7, s13
-; GFX10-NEXT:    s_lshl_b32 s3, s3, s9
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    s_or_b32 s1, s3, s2
 ; GFX10-NEXT:    s_and_b32 s2, s10, 7
-; GFX10-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX10-NEXT:    s_andn2_b32 s3, 7, s10
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 1
-; GFX10-NEXT:    s_lshr_b32 s2, s6, s2
+; GFX10-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
+; GFX10-NEXT:    s_lshr_b32 s2, s6, s2
 ; GFX10-NEXT:    s_andn2_b32 s4, 7, s11
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 1
 ; GFX10-NEXT:    s_and_b32 s6, s11, 7
-; GFX10-NEXT:    s_or_b32 s2, s3, s2
-; GFX10-NEXT:    s_and_b32 s1, s1, s13
 ; GFX10-NEXT:    s_lshl_b32 s4, s5, s4
 ; GFX10-NEXT:    s_lshr_b32 s5, s8, s6
+; GFX10-NEXT:    s_or_b32 s2, s3, s2
+; GFX10-NEXT:    s_and_b32 s1, s1, s13
+; GFX10-NEXT:    s_or_b32 s3, s4, s5
 ; GFX10-NEXT:    s_and_b32 s0, s0, s13
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX10-NEXT:    s_or_b32 s3, s4, s5
 ; GFX10-NEXT:    s_and_b32 s2, s2, s13
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    s_lshl_b32 s1, s2, 16
@@ -992,9 +992,9 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX6-NEXT:    v_and_b32_e32 v11, 0xff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_and_b32_e32 v11, 0xff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v10, v10, v11
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v10
@@ -1009,23 +1009,23 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX6-NEXT:    v_and_b32_e32 v7, 7, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
-; GFX6-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX6-NEXT:    v_and_b32_e32 v8, 7, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v7, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v7, -1, v9
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_and_b32_e32 v4, 7, v9
-; GFX6-NEXT:    v_and_b32_e32 v3, v3, v2
 ; GFX6-NEXT:    v_and_b32_e32 v7, 7, v7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, v3, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, v7, v5
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_or_b32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -1038,10 +1038,10 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX8-LABEL: v_fshr_v4i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v8, 7, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX8-NEXT:    v_and_b32_e32 v8, 7, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 1, v0
@@ -1056,13 +1056,13 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, v5, v3
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v6
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX8-NEXT:    v_and_b32_e32 v4, 7, v6
+; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v6, 1
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_and_b32_e32 v5, 7, v5
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0xff
+; GFX8-NEXT:    v_and_b32_e32 v5, 7, v5
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v5, v5, v8
 ; GFX8-NEXT:    v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v4, v8
@@ -1075,12 +1075,12 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 8
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -1089,10 +1089,10 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-LABEL: v_fshr_v4i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v8, 7, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX9-NEXT:    v_and_b32_e32 v8, 7, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v9, 1, v0
@@ -1107,13 +1107,13 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v3, v5, v3
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_xor_b32_e32 v5, -1, v6
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_and_b32_e32 v4, 7, v6
+; GFX9-NEXT:    v_xor_b32_e32 v5, -1, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 1
-; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_and_b32_e32 v5, 7, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0xff
+; GFX9-NEXT:    v_and_b32_e32 v5, 7, v5
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v5, v5, v8
 ; GFX9-NEXT:    v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v4, v4, v8
@@ -1128,9 +1128,9 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, s4, v1
 ; GFX9-NEXT:    v_and_b32_e32 v2, s4, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
@@ -1143,30 +1143,30 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v6
 ; GFX10-NEXT:    v_lshlrev_b16 v3, 1, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX10-NEXT:    v_xor_b32_e32 v14, -1, v12
-; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
 ; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v13, 0xff
+; GFX10-NEXT:    v_xor_b32_e32 v14, -1, v12
 ; GFX10-NEXT:    v_lshlrev_b16 v3, v11, v3
 ; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v10
 ; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
 ; GFX10-NEXT:    v_and_b32_e32 v8, s4, v1
-; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
 ; GFX10-NEXT:    v_and_b32_e32 v6, 7, v6
 ; GFX10-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
 ; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
 ; GFX10-NEXT:    v_lshlrev_b16 v4, 1, v4
+; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_and_b32_e32 v13, 7, v14
 ; GFX10-NEXT:    v_lshlrev_b16 v5, 1, v5
 ; GFX10-NEXT:    v_and_b32_e32 v12, 7, v12
@@ -1178,15 +1178,15 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10-NEXT:    v_lshrrev_b16 v7, v12, v9
 ; GFX10-NEXT:    v_lshrrev_b16 v2, v2, v8
 ; GFX10-NEXT:    v_or_b32_e32 v3, v3, v6
-; GFX10-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 8
+; GFX10-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX10-NEXT:    v_or_b32_e32 v4, v5, v7
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, s4, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1467,11 +1467,11 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_lshr_b32 s8, s1, 8
-; GFX6-NEXT:    s_and_b32 s1, s1, s9
 ; GFX6-NEXT:    s_lshr_b32 s7, s0, 24
+; GFX6-NEXT:    s_lshr_b32 s8, s1, 8
 ; GFX6-NEXT:    s_and_b32 s10, s0, s9
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, s11
+; GFX6-NEXT:    s_and_b32 s1, s1, s9
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
@@ -1479,22 +1479,22 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_or_b32 s1, s7, s1
 ; GFX6-NEXT:    s_and_b32 s7, s8, s9
 ; GFX6-NEXT:    s_lshr_b32 s8, s2, 16
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v0
 ; GFX6-NEXT:    s_lshr_b32 s10, s2, 24
 ; GFX6-NEXT:    s_and_b32 s13, s2, s9
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, s11
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v0
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 8
 ; GFX6-NEXT:    s_and_b32 s8, s8, s9
 ; GFX6-NEXT:    s_or_b32 s2, s13, s2
 ; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s12, s3, 8
-; GFX6-NEXT:    s_and_b32 s3, s3, s9
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX6-NEXT:    s_and_b32 s3, s3, s9
 ; GFX6-NEXT:    s_or_b32 s2, s2, s8
+; GFX6-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX6-NEXT:    s_and_b32 s8, s12, s9
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    s_or_b32 s3, s10, s3
 ; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
 ; GFX6-NEXT:    s_bfe_u32 s3, s3, 0x100000
@@ -1526,16 +1526,16 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v2, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    s_and_b32 s8, s12, s9
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    s_or_b32 s5, s10, s5
 ; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX6-NEXT:    s_or_b32 s5, s5, s8
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX6-NEXT:    s_and_b32 s6, s6, s9
@@ -1546,9 +1546,9 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 23, v0
 ; GFX6-NEXT:    s_lshl_b32 s4, s6, 17
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
 ; GFX6-NEXT:    s_or_b32 s0, s4, s0
 ; GFX6-NEXT:    v_and_b32_e32 v2, s8, v3
+; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_lshr_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
@@ -1558,23 +1558,23 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX6-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffffff
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 23, v1
 ; GFX6-NEXT:    s_lshl_b32 s0, s7, 17
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
-; GFX6-NEXT:    v_bfe_u32 v3, v0, 8, 8
+; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_lshr_b32_e32 v1, s3, v1
+; GFX6-NEXT:    v_bfe_u32 v3, v0, 8, 8
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_and_b32_e32 v2, s9, v0
-; GFX6-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
@@ -1598,17 +1598,17 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_bfe_u32 s11, 8, 0x100000
 ; GFX8-NEXT:    s_and_b32 s1, s1, s10
 ; GFX8-NEXT:    s_lshr_b32 s6, s0, 8
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 24
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s11
-; GFX8-NEXT:    s_or_b32 s1, s8, s1
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    s_and_b32 s6, s6, s10
+; GFX8-NEXT:    s_or_b32 s1, s8, s1
 ; GFX8-NEXT:    s_lshr_b32 s8, s2, 8
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_and_b32 s8, s8, s10
 ; GFX8-NEXT:    s_lshr_b32 s7, s0, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s10
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, s11
+; GFX8-NEXT:    s_and_b32 s8, s8, s10
 ; GFX8-NEXT:    s_or_b32 s0, s0, s6
 ; GFX8-NEXT:    s_and_b32 s6, s7, s10
 ; GFX8-NEXT:    s_and_b32 s7, s9, s10
@@ -1622,17 +1622,17 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_bfe_u32 s8, s8, 0x100000
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v1, v0
 ; GFX8-NEXT:    s_lshr_b32 s13, s3, 8
-; GFX8-NEXT:    s_and_b32 s3, s3, s10
 ; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX8-NEXT:    s_lshl_b32 s3, s3, s11
+; GFX8-NEXT:    s_and_b32 s3, s3, s10
 ; GFX8-NEXT:    s_or_b32 s2, s2, s8
+; GFX8-NEXT:    s_lshl_b32 s3, s3, s11
 ; GFX8-NEXT:    s_and_b32 s8, s13, s10
 ; GFX8-NEXT:    s_or_b32 s3, s12, s3
 ; GFX8-NEXT:    s_bfe_u32 s8, s8, 0x100000
-; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    s_or_b32 s3, s3, s8
 ; GFX8-NEXT:    s_lshr_b32 s8, s4, 8
 ; GFX8-NEXT:    s_and_b32 s8, s8, s10
@@ -1661,16 +1661,16 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v0
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v2, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    s_and_b32 s8, s13, s10
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    s_or_b32 s5, s12, s5
 ; GFX8-NEXT:    s_bfe_u32 s8, s8, 0x100000
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX8-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX8-NEXT:    s_or_b32 s5, s5, s8
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
@@ -1680,9 +1680,9 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 23, v0
 ; GFX8-NEXT:    s_lshl_b32 s4, s6, 17
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX8-NEXT:    v_and_b32_e32 v0, s8, v0
 ; GFX8-NEXT:    s_or_b32 s0, s4, s0
 ; GFX8-NEXT:    v_and_b32_e32 v2, s8, v3
+; GFX8-NEXT:    v_and_b32_e32 v0, s8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s2
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
@@ -1692,16 +1692,16 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s7, s7, 0x100000
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffffff
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 23, v1
 ; GFX8-NEXT:    s_lshl_b32 s0, s7, 17
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 1
-; GFX8-NEXT:    v_and_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_and_b32_e32 v2, v2, v4
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v4
+; GFX8-NEXT:    v_and_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v1, v1, s3
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -1712,8 +1712,8 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    v_and_b32_e32 v3, s10, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
@@ -1735,14 +1735,14 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v0
 ; GFX9-NEXT:    s_lshr_b32 s10, s0, 24
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, s13
-; GFX9-NEXT:    s_or_b32 s1, s10, s1
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    s_and_b32 s7, s7, s12
+; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX9-NEXT:    s_or_b32 s1, s10, s1
 ; GFX9-NEXT:    s_lshr_b32 s10, s2, 8
-; GFX9-NEXT:    s_and_b32 s10, s10, s12
 ; GFX9-NEXT:    s_lshr_b32 s9, s0, 16
 ; GFX9-NEXT:    s_and_b32 s0, s0, s12
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, s13
+; GFX9-NEXT:    s_and_b32 s10, s10, s12
 ; GFX9-NEXT:    s_or_b32 s0, s0, s7
 ; GFX9-NEXT:    s_and_b32 s7, s9, s12
 ; GFX9-NEXT:    s_and_b32 s9, s11, s12
@@ -1757,17 +1757,17 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_bfe_u32 s10, s10, 0x100000
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX9-NEXT:    s_lshr_b32 s15, s3, 8
-; GFX9-NEXT:    s_and_b32 s3, s3, s12
 ; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s10, s10, 16
-; GFX9-NEXT:    s_lshl_b32 s3, s3, s13
+; GFX9-NEXT:    s_and_b32 s3, s3, s12
 ; GFX9-NEXT:    s_or_b32 s2, s2, s10
+; GFX9-NEXT:    s_lshl_b32 s3, s3, s13
 ; GFX9-NEXT:    s_and_b32 s10, s15, s12
 ; GFX9-NEXT:    s_or_b32 s3, s14, s3
 ; GFX9-NEXT:    s_bfe_u32 s10, s10, 0x100000
-; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s10, s10, 16
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX9-NEXT:    s_or_b32 s3, s3, s10
 ; GFX9-NEXT:    s_lshr_b32 s10, s4, 8
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
@@ -1791,9 +1791,9 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_and_b32 s10, s15, s12
 ; GFX9-NEXT:    s_or_b32 s5, s14, s5
 ; GFX9-NEXT:    s_bfe_u32 s10, s10, 0x100000
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX9-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s10, s10, 16
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX9-NEXT:    s_or_b32 s5, s5, s10
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
@@ -1805,17 +1805,17 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX9-NEXT:    s_mov_b32 s10, 0xffffff
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v3, 23, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, s10, v0
 ; GFX9-NEXT:    s_lshl_b32 s4, s7, 17
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, s10, v0
 ; GFX9-NEXT:    s_or_b32 s0, s4, s0
 ; GFX9-NEXT:    v_and_b32_e32 v3, s10, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s2
+; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, s0, v3, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
@@ -1823,21 +1823,21 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x100000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    s_bfe_u32 s9, s9, 0x100000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffffff
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v3, 23, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, v1, v2
 ; GFX9-NEXT:    s_lshl_b32 s0, s9, 17
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX9-NEXT:    v_and_b32_e32 v1, v1, v2
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    v_and_b32_e32 v3, v3, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v1, v1, s3
-; GFX9-NEXT:    v_lshl_or_b32 v1, s0, v3, v1
 ; GFX9-NEXT:    s_mov_b32 s6, 8
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, v3, v1
+; GFX9-NEXT:    s_mov_b32 s8, 16
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_and_b32_e32 v3, s12, v1
-; GFX9-NEXT:    s_mov_b32 s8, 16
 ; GFX9-NEXT:    v_and_or_b32 v2, v0, s12, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -1855,10 +1855,10 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, 24
 ; GFX10-NEXT:    s_movk_i32 s9, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s12, s4, 8
-; GFX10-NEXT:    s_lshr_b32 s13, s4, 16
+; GFX10-NEXT:    s_bfe_u32 s10, 8, 0x100000
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX10-NEXT:    s_bfe_u32 s10, 8, 0x100000
+; GFX10-NEXT:    s_lshr_b32 s13, s4, 16
 ; GFX10-NEXT:    s_and_b32 s12, s12, s9
 ; GFX10-NEXT:    s_lshr_b32 s14, s4, 24
 ; GFX10-NEXT:    s_and_b32 s4, s4, s9
@@ -1869,13 +1869,13 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX10-NEXT:    s_lshr_b32 s15, s5, 8
-; GFX10-NEXT:    s_and_b32 s5, s5, s9
 ; GFX10-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX10-NEXT:    s_lshl_b32 s12, s12, 16
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    s_lshl_b32 s12, s12, 16
-; GFX10-NEXT:    s_lshl_b32 s5, s5, s10
+; GFX10-NEXT:    s_and_b32 s5, s5, s9
 ; GFX10-NEXT:    s_or_b32 s4, s4, s12
+; GFX10-NEXT:    s_lshl_b32 s5, s5, s10
 ; GFX10-NEXT:    v_mul_lo_u32 v2, 0xffffffe8, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, 0xffffffe8, v1
 ; GFX10-NEXT:    s_and_b32 s12, s15, s9
@@ -1896,18 +1896,18 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v1, v3
 ; GFX10-NEXT:    s_lshr_b32 s8, s2, 8
 ; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
-; GFX10-NEXT:    s_and_b32 s8, s8, s9
-; GFX10-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX10-NEXT:    s_and_b32 s0, s0, s9
+; GFX10-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX10-NEXT:    s_lshl_b32 s6, s6, s10
-; GFX10-NEXT:    s_lshr_b32 s13, s2, 24
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; GFX10-NEXT:    s_and_b32 s8, s8, s9
 ; GFX10-NEXT:    s_or_b32 s0, s0, s6
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX10-NEXT:    s_and_b32 s6, s7, s9
 ; GFX10-NEXT:    s_and_b32 s7, s11, s9
+; GFX10-NEXT:    s_lshr_b32 s11, s2, 16
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX10-NEXT:    s_lshr_b32 s11, s2, 16
+; GFX10-NEXT:    s_lshr_b32 s13, s2, 24
 ; GFX10-NEXT:    s_and_b32 s2, s2, s9
 ; GFX10-NEXT:    s_lshl_b32 s8, s8, s10
 ; GFX10-NEXT:    s_lshr_b32 s12, s3, 8
@@ -2018,13 +2018,13 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v7, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, 23, v4
 ; GFX6-NEXT:    v_and_b32_e32 v7, v7, v9
-; GFX6-NEXT:    v_and_b32_e32 v4, v4, v9
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v8, v6
+; GFX6-NEXT:    v_and_b32_e32 v4, v4, v9
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v7, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v5, v6
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v3, v3, v9
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v5, v6
@@ -2035,8 +2035,8 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 23, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v9
 ; GFX6-NEXT:    v_and_b32_e32 v4, v4, v9
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v9
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -2075,13 +2075,13 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_mul_lo_u32 v6, v7, v8
 ; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, 23, v4
 ; GFX8-NEXT:    v_and_b32_e32 v7, v7, v9
-; GFX8-NEXT:    v_and_b32_e32 v4, v4, v9
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v8, v6
+; GFX8-NEXT:    v_and_b32_e32 v4, v4, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v7, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v5, v6
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_and_b32_e32 v3, v3, v9
 ; GFX8-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v5, v6
@@ -2092,8 +2092,8 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 23, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, v2, v9
 ; GFX8-NEXT:    v_and_b32_e32 v4, v4, v9
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -2640,10 +2640,10 @@ define amdgpu_ps i16 @s_fshr_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
 ; GFX10-LABEL: s_fshr_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_and_b32 s3, s2, 15
-; GFX10-NEXT:    s_andn2_b32 s2, 15, s2
 ; GFX10-NEXT:    s_bfe_u32 s4, 1, 0x100000
-; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX10-NEXT:    s_andn2_b32 s2, 15, s2
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX10-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX10-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
@@ -2783,8 +2783,8 @@ define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-NEXT:    v_and_b32_e32 v3, 15, v3
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v3, v0
@@ -2966,8 +2966,8 @@ define amdgpu_ps half @v_fshr_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
 ; GFX10-LABEL: v_fshr_i16_svs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_and_b32 s2, s1, 15
-; GFX10-NEXT:    s_andn2_b32 s1, 15, s1
 ; GFX10-NEXT:    s_bfe_u32 s3, 1, 0x100000
+; GFX10-NEXT:    s_andn2_b32 s1, 15, s1
 ; GFX10-NEXT:    v_lshrrev_b16 v0, s2, v0
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x100000
@@ -3041,32 +3041,32 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX6-NEXT:    s_and_b32 s4, s4, s6
 ; GFX6-NEXT:    s_or_b32 s4, s5, s4
 ; GFX6-NEXT:    s_bfe_u32 s5, 1, 0x100000
-; GFX6-NEXT:    s_and_b32 s7, s2, s6
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s5
+; GFX6-NEXT:    s_and_b32 s7, s2, s6
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s5
 ; GFX6-NEXT:    s_and_b32 s5, s3, s6
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 1
 ; GFX6-NEXT:    s_lshr_b32 s7, s7, 15
 ; GFX6-NEXT:    s_lshr_b32 s5, s5, 15
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 1
 ; GFX6-NEXT:    s_xor_b32 s4, s4, -1
-; GFX6-NEXT:    s_and_b32 s2, s2, s6
 ; GFX6-NEXT:    s_or_b32 s0, s0, s7
-; GFX6-NEXT:    s_and_b32 s7, s4, 15
 ; GFX6-NEXT:    s_or_b32 s1, s1, s5
 ; GFX6-NEXT:    s_lshr_b32 s5, s4, 16
+; GFX6-NEXT:    s_and_b32 s7, s4, 15
 ; GFX6-NEXT:    s_andn2_b32 s4, 15, s4
+; GFX6-NEXT:    s_and_b32 s2, s2, s6
 ; GFX6-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
-; GFX6-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s7
+; GFX6-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s5, 15
-; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT:    s_andn2_b32 s4, 15, s5
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s3, s6
-; GFX6-NEXT:    s_andn2_b32 s4, 15, s5
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX6-NEXT:    s_bfe_u32 s3, s4, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s3
@@ -3086,25 +3086,25 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s5
 ; GFX8-NEXT:    s_lshr_b32 s6, s6, s7
-; GFX8-NEXT:    s_lshl_b32 s1, s1, s5
 ; GFX8-NEXT:    s_or_b32 s0, s0, s6
-; GFX8-NEXT:    s_lshr_b32 s6, s4, s7
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, s5
+; GFX8-NEXT:    s_lshr_b32 s6, s4, s7
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s5
 ; GFX8-NEXT:    s_xor_b32 s2, s2, -1
-; GFX8-NEXT:    s_and_b32 s7, s2, 15
-; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_or_b32 s3, s3, s6
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX8-NEXT:    s_and_b32 s7, s2, 15
 ; GFX8-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s5
 ; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
-; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s7
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s6, 15
-; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, s5
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX8-NEXT:    s_andn2_b32 s2, 15, s6
 ; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
 ; GFX8-NEXT:    s_bfe_u32 s3, s4, 0x100000
@@ -3181,35 +3181,35 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX6-NEXT:    v_and_b32_e32 v4, v4, v6
 ; GFX6-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX6-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, s5, v2
 ; GFX6-NEXT:    s_bfe_u32 s4, 1, 0x100000
+; GFX6-NEXT:    v_and_b32_e32 v5, s5, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 15, v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX6-NEXT:    v_and_b32_e32 v5, s5, v3
-; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 15, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
-; GFX6-NEXT:    v_and_b32_e32 v7, 15, v4
+; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX6-NEXT:    v_and_b32_e32 v7, 15, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
 ; GFX6-NEXT:    v_and_b32_e32 v2, v2, v6
 ; GFX6-NEXT:    v_bfe_u32 v7, v7, 0, 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
 ; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v7, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v5
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
+; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_and_b32_e32 v2, v3, v6
-; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
 ; GFX6-NEXT:    v_bfe_u32 v3, v4, 0, 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
@@ -3221,17 +3221,17 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 15, v1
-; GFX8-NEXT:    v_mov_b32_e32 v5, 15
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 1
-; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT:    v_mov_b32_e32 v5, 15
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v5, 1, v1
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_and_b32_e32 v6, 15, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v6, 15, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 1, v5
@@ -3323,8 +3323,8 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, 16
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, 16
-; GFX9-NEXT:    s_mov_b32 s4, 0x4f7ffffe
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX9-NEXT:    s_mov_b32 s4, 0x4f7ffffe
 ; GFX9-NEXT:    v_mul_f32_e32 v2, s4, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v3, s4, v3
@@ -3341,8 +3341,8 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX9-NEXT:    v_sub_u32_e32 v2, 4, v2
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, 16, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 16, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, 16, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 16, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v3, 8, v3
@@ -3417,32 +3417,32 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_mov_b32 s5, 0xffff
-; GFX6-NEXT:    s_and_b32 s6, s2, s5
 ; GFX6-NEXT:    s_bfe_u32 s4, 1, 0x100000
+; GFX6-NEXT:    s_and_b32 s6, s2, s5
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, 15, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s4
 ; GFX6-NEXT:    s_lshr_b32 s6, s6, 15
+; GFX6-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX6-NEXT:    s_or_b32 s0, s0, s6
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX6-NEXT:    s_or_b32 s0, s0, s6
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    s_and_b32 s0, s2, s5
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
 ; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
-; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, 15, v1
-; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s4
 ; GFX6-NEXT:    s_and_b32 s4, s3, s5
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, 15, v1
+; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX6-NEXT:    s_lshr_b32 s4, s4, 15
 ; GFX6-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX6-NEXT:    s_and_b32 s0, s3, s5
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 15
 ; GFX6-NEXT:    s_or_b32 s1, s1, s4
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
@@ -3462,27 +3462,27 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s6, 15, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
 ; GFX8-NEXT:    s_lshr_b32 s5, s5, s6
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s5
-; GFX8-NEXT:    v_and_b32_e32 v2, 15, v0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
+; GFX8-NEXT:    v_and_b32_e32 v2, 15, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e64 v2, v2, s0
 ; GFX8-NEXT:    s_bfe_u32 s0, s1, 0x100000
 ; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
 ; GFX8-NEXT:    s_lshr_b32 s5, s3, s6
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, s4
+; GFX8-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, s4
-; GFX8-NEXT:    s_bfe_u32 s0, s3, 0x100000
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    s_bfe_u32 s0, s3, 0x100000
 ; GFX8-NEXT:    s_or_b32 s2, s2, s5
 ; GFX8-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s4
@@ -3534,32 +3534,32 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_and_b32 s2, s2, s4
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v0
 ; GFX6-NEXT:    s_or_b32 s2, s3, s2
 ; GFX6-NEXT:    s_bfe_u32 s3, 1, 0x100000
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v1
+; GFX6-NEXT:    v_and_b32_e32 v2, s4, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 15, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, s4, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    s_lshl_b32 s0, s1, s3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 15, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_or_b32_e32 v3, s0, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    s_xor_b32 s0, s2, -1
-; GFX6-NEXT:    s_and_b32 s2, s0, 15
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX6-NEXT:    s_and_b32 s2, s0, 15
 ; GFX6-NEXT:    s_andn2_b32 s0, 15, s0
+; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_and_b32 s0, s1, 15
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, s2, v2
 ; GFX6-NEXT:    s_andn2_b32 s1, 15, s1
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, s0, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
@@ -3575,20 +3575,20 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 ; GFX8-LABEL: v_fshr_v2i16_svs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_bfe_u32 s3, 1, 0x100000
-; GFX8-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 15, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX8-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    s_lshl_b32 s0, s2, s3
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v2, s0, v2
-; GFX8-NEXT:    s_xor_b32 s0, s1, -1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 1
+; GFX8-NEXT:    s_xor_b32 s0, s1, -1
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    s_and_b32 s2, s0, 15
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX8-NEXT:    s_and_b32 s2, s0, 15
 ; GFX8-NEXT:    s_andn2_b32 s0, 15, s0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 1, v3
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v3, s0, v3
@@ -3653,32 +3653,32 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ; GFX6-NEXT:    s_and_b32 s2, s2, s4
 ; GFX6-NEXT:    s_or_b32 s2, s3, s2
 ; GFX6-NEXT:    s_bfe_u32 s3, 1, 0x100000
-; GFX6-NEXT:    s_and_b32 s5, s0, s4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s3, v0
+; GFX6-NEXT:    s_and_b32 s5, s0, s4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s3, v1
 ; GFX6-NEXT:    s_and_b32 s3, s1, s4
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX6-NEXT:    s_lshr_b32 s5, s5, 15
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, 15
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX6-NEXT:    s_xor_b32 s2, s2, -1
-; GFX6-NEXT:    s_and_b32 s0, s0, s4
 ; GFX6-NEXT:    v_or_b32_e32 v0, s5, v0
-; GFX6-NEXT:    s_and_b32 s5, s2, 15
 ; GFX6-NEXT:    v_or_b32_e32 v1, s3, v1
 ; GFX6-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX6-NEXT:    s_and_b32 s5, s2, 15
 ; GFX6-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX6-NEXT:    s_and_b32 s0, s0, s4
 ; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
-; GFX6-NEXT:    s_lshr_b32 s0, s0, s2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s5, v0
+; GFX6-NEXT:    s_lshr_b32 s0, s0, s2
 ; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_and_b32 s0, s3, 15
-; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT:    s_andn2_b32 s2, 15, s3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    s_and_b32 s0, s1, s4
-; GFX6-NEXT:    s_andn2_b32 s2, 15, s3
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
 ; GFX6-NEXT:    s_bfe_u32 s1, s2, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
@@ -3704,17 +3704,17 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_bfe_u32 s3, 1, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX8-NEXT:    s_xor_b32 s1, s1, -1
-; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
-; GFX8-NEXT:    s_and_b32 s5, s1, 15
 ; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX8-NEXT:    s_and_b32 s5, s1, 15
 ; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s3
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
-; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, s5, v1
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX8-NEXT:    s_lshl_b32 s2, s2, s3
 ; GFX8-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    s_and_b32 s0, s4, 15
-; GFX8-NEXT:    s_lshl_b32 s2, s2, s3
 ; GFX8-NEXT:    s_andn2_b32 s1, 15, s4
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, s0, v0
 ; GFX8-NEXT:    s_bfe_u32 s0, s2, 0x100000
@@ -3753,8 +3753,8 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ; GFX10-NEXT:    s_and_b32 s4, s1, s2
 ; GFX10-NEXT:    s_andn2_b32 s1, s2, s1
 ; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s1, v0
 ; GFX10-NEXT:    s_and_b32 s0, s0, s3
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s1, v0
 ; GFX10-NEXT:    s_and_b32 s1, s4, s3
 ; GFX10-NEXT:    s_lshr_b32 s3, s4, 16
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, s1
@@ -3790,34 +3790,34 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-NEXT:    s_or_b32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s9, s11, 16
 ; GFX6-NEXT:    s_and_b32 s10, s10, s12
-; GFX6-NEXT:    s_and_b32 s11, s4, s12
 ; GFX6-NEXT:    s_or_b32 s9, s9, s10
 ; GFX6-NEXT:    s_bfe_u32 s10, 1, 0x100000
+; GFX6-NEXT:    s_and_b32 s11, s4, s12
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s10
 ; GFX6-NEXT:    s_lshr_b32 s11, s11, 15
 ; GFX6-NEXT:    s_or_b32 s0, s0, s11
 ; GFX6-NEXT:    s_and_b32 s11, s5, s12
-; GFX6-NEXT:    s_lshl_b32 s4, s4, 1
-; GFX6-NEXT:    s_xor_b32 s8, s8, -1
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s10
 ; GFX6-NEXT:    s_lshr_b32 s11, s11, 15
-; GFX6-NEXT:    s_and_b32 s13, s8, 15
-; GFX6-NEXT:    s_and_b32 s4, s4, s12
+; GFX6-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX6-NEXT:    s_xor_b32 s8, s8, -1
 ; GFX6-NEXT:    s_or_b32 s1, s1, s11
 ; GFX6-NEXT:    s_lshr_b32 s11, s8, 16
+; GFX6-NEXT:    s_and_b32 s13, s8, 15
 ; GFX6-NEXT:    s_andn2_b32 s8, 15, s8
+; GFX6-NEXT:    s_and_b32 s4, s4, s12
 ; GFX6-NEXT:    s_bfe_u32 s13, s13, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
 ; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
-; GFX6-NEXT:    s_lshr_b32 s4, s4, s8
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s13
+; GFX6-NEXT:    s_lshr_b32 s4, s4, s8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s4
 ; GFX6-NEXT:    s_and_b32 s4, s11, 15
-; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT:    s_andn2_b32 s8, 15, s11
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s4
 ; GFX6-NEXT:    s_and_b32 s4, s5, s12
-; GFX6-NEXT:    s_andn2_b32 s8, 15, s11
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
 ; GFX6-NEXT:    s_bfe_u32 s5, s8, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
@@ -3836,22 +3836,22 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s6, 1
 ; GFX6-NEXT:    s_xor_b32 s5, s9, -1
-; GFX6-NEXT:    s_and_b32 s3, s3, s12
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 1
-; GFX6-NEXT:    s_and_b32 s7, s5, 15
 ; GFX6-NEXT:    s_lshr_b32 s6, s5, 16
+; GFX6-NEXT:    s_and_b32 s7, s5, 15
 ; GFX6-NEXT:    s_andn2_b32 s5, 15, s5
+; GFX6-NEXT:    s_and_b32 s3, s3, s12
 ; GFX6-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, 1
 ; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
-; GFX6-NEXT:    s_lshr_b32 s3, s3, s5
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s7
+; GFX6-NEXT:    s_lshr_b32 s3, s3, s5
 ; GFX6-NEXT:    s_or_b32 s1, s1, s3
 ; GFX6-NEXT:    s_and_b32 s3, s6, 15
 ; GFX6-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX6-NEXT:    s_andn2_b32 s5, 15, s6
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, s3
 ; GFX6-NEXT:    s_and_b32 s3, s4, s12
-; GFX6-NEXT:    s_andn2_b32 s5, 15, s6
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, 1
 ; GFX6-NEXT:    s_bfe_u32 s4, s5, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, s4
@@ -3871,25 +3871,25 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s8
 ; GFX8-NEXT:    s_lshr_b32 s9, s9, s10
-; GFX8-NEXT:    s_lshl_b32 s2, s2, s8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s9
-; GFX8-NEXT:    s_lshr_b32 s9, s7, s10
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, s8
+; GFX8-NEXT:    s_lshr_b32 s9, s7, s10
+; GFX8-NEXT:    s_lshl_b32 s2, s2, s8
 ; GFX8-NEXT:    s_xor_b32 s4, s4, -1
-; GFX8-NEXT:    s_and_b32 s11, s4, 15
-; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX8-NEXT:    s_or_b32 s6, s6, s9
 ; GFX8-NEXT:    s_lshr_b32 s9, s4, 16
+; GFX8-NEXT:    s_and_b32 s11, s4, 15
 ; GFX8-NEXT:    s_andn2_b32 s4, 15, s4
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s11, s11, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, s8
 ; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
-; GFX8-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s11
+; GFX8-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s9, 15
-; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s7, s7, s8
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX8-NEXT:    s_andn2_b32 s4, 15, s9
 ; GFX8-NEXT:    s_lshl_b32 s2, s6, s2
 ; GFX8-NEXT:    s_bfe_u32 s6, s7, 0x100000
@@ -3898,36 +3898,36 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX8-NEXT:    s_lshr_b32 s4, s6, s4
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
-; GFX8-NEXT:    s_bfe_u32 s6, s3, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_bfe_u32 s6, s3, 0x100000
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_lshr_b32 s2, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s8
 ; GFX8-NEXT:    s_lshr_b32 s6, s6, s10
-; GFX8-NEXT:    s_lshl_b32 s3, s3, s8
 ; GFX8-NEXT:    s_or_b32 s1, s1, s6
-; GFX8-NEXT:    s_lshr_b32 s6, s4, s10
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, s8
+; GFX8-NEXT:    s_lshr_b32 s6, s4, s10
+; GFX8-NEXT:    s_lshl_b32 s3, s3, s8
 ; GFX8-NEXT:    s_xor_b32 s5, s5, -1
-; GFX8-NEXT:    s_and_b32 s7, s5, 15
-; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX8-NEXT:    s_or_b32 s2, s2, s6
 ; GFX8-NEXT:    s_lshr_b32 s6, s5, 16
+; GFX8-NEXT:    s_and_b32 s7, s5, 15
 ; GFX8-NEXT:    s_andn2_b32 s5, 15, s5
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX8-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, s8
 ; GFX8-NEXT:    s_bfe_u32 s5, s5, 0x100000
-; GFX8-NEXT:    s_lshr_b32 s3, s3, s5
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s7
+; GFX8-NEXT:    s_lshr_b32 s3, s3, s5
 ; GFX8-NEXT:    s_or_b32 s1, s1, s3
 ; GFX8-NEXT:    s_and_b32 s3, s6, 15
-; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, s8
+; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX8-NEXT:    s_andn2_b32 s5, 15, s6
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, s3
 ; GFX8-NEXT:    s_bfe_u32 s3, s4, 0x100000
-; GFX8-NEXT:    s_andn2_b32 s5, 15, s6
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, s8
 ; GFX8-NEXT:    s_bfe_u32 s4, s5, 0x100000
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, s4
@@ -3940,14 +3940,14 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ;
 ; GFX9-LABEL: s_fshr_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s9, s0, 16
 ; GFX9-NEXT:    s_mov_b32 s8, 0x10001
+; GFX9-NEXT:    s_lshr_b32 s9, s0, 16
 ; GFX9-NEXT:    s_mov_b32 s6, 0xf000f
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s8
 ; GFX9-NEXT:    s_lshl_b32 s9, s9, 1
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s9
 ; GFX9-NEXT:    s_and_b32 s7, s4, s6
 ; GFX9-NEXT:    s_andn2_b32 s4, s6, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s9
 ; GFX9-NEXT:    s_lshr_b32 s9, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
@@ -3955,8 +3955,8 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX9-NEXT:    s_mov_b32 s9, 0xffff
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s10, s7, 16
 ; GFX9-NEXT:    s_and_b32 s2, s2, s9
+; GFX9-NEXT:    s_lshr_b32 s10, s7, 16
 ; GFX9-NEXT:    s_and_b32 s7, s7, s9
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, s7
 ; GFX9-NEXT:    s_lshr_b32 s4, s4, s10
@@ -3974,8 +3974,8 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX9-NEXT:    s_lshl_b32 s4, s5, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
-; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX9-NEXT:    s_and_b32 s3, s3, s9
+; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX9-NEXT:    s_and_b32 s2, s2, s9
 ; GFX9-NEXT:    s_lshr_b32 s2, s3, s2
 ; GFX9-NEXT:    s_lshr_b32 s3, s4, s5
@@ -3985,8 +3985,8 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ;
 ; GFX10-LABEL: s_fshr_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s8, s0, 16
 ; GFX10-NEXT:    s_mov_b32 s7, 0x10001
+; GFX10-NEXT:    s_lshr_b32 s8, s0, 16
 ; GFX10-NEXT:    s_mov_b32 s6, 0xf000f
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s7
 ; GFX10-NEXT:    s_lshl_b32 s8, s8, 1
@@ -4008,14 +4008,14 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX10-NEXT:    s_lshr_b32 s5, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
 ; GFX10-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX10-NEXT:    s_and_b32 s11, s9, s8
 ; GFX10-NEXT:    s_and_b32 s2, s2, s8
+; GFX10-NEXT:    s_and_b32 s11, s9, s8
 ; GFX10-NEXT:    s_lshr_b32 s9, s9, 16
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, s4
 ; GFX10-NEXT:    s_lshl_b32 s4, s5, s6
 ; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX10-NEXT:    s_and_b32 s6, s7, s8
 ; GFX10-NEXT:    s_and_b32 s3, s3, s8
+; GFX10-NEXT:    s_and_b32 s6, s7, s8
 ; GFX10-NEXT:    s_lshr_b32 s7, s7, 16
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, s11
 ; GFX10-NEXT:    s_lshr_b32 s9, s10, s9
@@ -4043,36 +4043,36 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
 ; GFX6-NEXT:    v_and_b32_e32 v10, v10, v12
 ; GFX6-NEXT:    s_mov_b32 s5, 0xffff
-; GFX6-NEXT:    s_bfe_u32 s4, 1, 0x100000
 ; GFX6-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX6-NEXT:    s_bfe_u32 s4, 1, 0x100000
 ; GFX6-NEXT:    v_and_b32_e32 v10, s5, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 15, v10
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v10
 ; GFX6-NEXT:    v_and_b32_e32 v10, s5, v5
-; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 15, v10
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
-; GFX6-NEXT:    v_and_b32_e32 v11, 15, v8
+; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX6-NEXT:    v_and_b32_e32 v11, 15, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
 ; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
 ; GFX6-NEXT:    v_and_b32_e32 v4, v4, v12
 ; GFX6-NEXT:    v_bfe_u32 v11, v11, 0, 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_bfe_u32 v8, v8, 0, 16
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v11, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v10
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v10
 ; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
+; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_and_b32_e32 v4, v5, v12
-; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_bfe_u32 v5, v8, 0, 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
@@ -4087,24 +4087,24 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v6
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v9
-; GFX6-NEXT:    v_and_b32_e32 v8, 15, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 1, v7
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX6-NEXT:    v_and_b32_e32 v8, 15, v6
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v6
 ; GFX6-NEXT:    v_and_b32_e32 v6, 15, v6
 ; GFX6-NEXT:    v_and_b32_e32 v4, v4, v12
 ; GFX6-NEXT:    v_bfe_u32 v8, v8, 0, 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_bfe_u32 v6, v6, 0, 16
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v8, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v7
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v7
 ; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
+; GFX6-NEXT:    v_and_b32_e32 v6, 15, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_and_b32_e32 v4, v5, v12
-; GFX6-NEXT:    v_and_b32_e32 v6, 15, v6
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_bfe_u32 v5, v6, 0, 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
@@ -4119,14 +4119,14 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX8-NEXT:    v_or_b32_e32 v6, v6, v7
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 15
-; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v9, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 1, v2
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_and_b32_e32 v10, 15, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v10, 15, v4
 ; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v9, 1, v9
@@ -4153,8 +4153,8 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 1, v3
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_and_b32_e32 v8, 15, v5
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    v_and_b32_e32 v8, 15, v5
 ; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v5
 ; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 1, v7
@@ -4181,8 +4181,8 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX9-NEXT:    v_and_b32_e32 v4, s4, v4
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v4, v0
-; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v5
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v6, v2
+; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v5
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, s4, v5
 ; GFX9-NEXT:    v_and_b32_e32 v4, s4, v4
@@ -4289,8 +4289,8 @@ define amdgpu_ps i64 @s_fshr_i64_32(i64 inreg %lhs, i64 inreg %rhs) {
 define amdgpu_ps i64 @s_fshr_i64_48(i64 inreg %lhs, i64 inreg %rhs) {
 ; GCN-LABEL: s_fshr_i64_48:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_lshr_b32 s2, s3, 16
 ; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
+; GCN-NEXT:    s_lshr_b32 s2, s3, 16
 ; GCN-NEXT:    s_mov_b32 s3, 0
 ; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-NEXT:    ; return to shader part epilog
@@ -4998,8 +4998,8 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[6:7], v15
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[6:7], v14
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
@@ -5047,8 +5047,8 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v15, v[6:7]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v14, v[6:7]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
@@ -5096,8 +5096,8 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v15, v[6:7]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v14, v[6:7]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
@@ -5143,18 +5143,18 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v0, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v1, v11, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v19, v[6:7]
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v18
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v19
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v18
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v13, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0, v14, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v15, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, v1, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s6
 ; GFX10-NEXT:    v_or_b32_e32 v0, v14, v4
 ; GFX10-NEXT:    v_or_b32_e32 v1, v7, v5
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
@@ -5170,13 +5170,13 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX6-NEXT:    s_movk_i32 s8, 0x7f
 ; GFX6-NEXT:    v_and_b32_e32 v6, s8, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX6-NEXT:    v_and_b32_e32 v7, s8, v0
 ; GFX6-NEXT:    s_mov_b32 s9, 0
+; GFX6-NEXT:    v_and_b32_e32 v7, s8, v0
 ; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
 ; GFX6-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX6-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v7
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v7
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], s[10:11], v0
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], s[0:1], v7
 ; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 64, v7
@@ -5185,13 +5185,13 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], s[10:11], v8
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s1
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v0, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v6
@@ -5201,13 +5201,13 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], s[6:7], v11
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], s[6:7], v6
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
@@ -5223,13 +5223,13 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX8-NEXT:    s_movk_i32 s8, 0x7f
 ; GFX8-NEXT:    v_and_b32_e32 v6, s8, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX8-NEXT:    v_and_b32_e32 v7, s8, v0
 ; GFX8-NEXT:    s_mov_b32 s9, 0
+; GFX8-NEXT:    v_and_b32_e32 v7, s8, v0
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
 ; GFX8-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX8-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v7
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v7
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, s[10:11]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v7, s[0:1]
 ; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 64, v7
@@ -5238,13 +5238,13 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, s[10:11]
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v6
@@ -5254,13 +5254,13 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v11, s[6:7]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v6, s[6:7]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
@@ -5276,13 +5276,13 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX9-NEXT:    s_movk_i32 s8, 0x7f
 ; GFX9-NEXT:    v_and_b32_e32 v6, s8, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX9-NEXT:    v_and_b32_e32 v7, s8, v0
 ; GFX9-NEXT:    s_mov_b32 s9, 0
+; GFX9-NEXT:    v_and_b32_e32 v7, s8, v0
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX9-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
-; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v7
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v7
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, s[10:11]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v7, s[0:1]
 ; GFX9-NEXT:    v_subrev_u32_e32 v8, 64, v7
@@ -5291,10 +5291,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, s[10:11]
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -5307,13 +5307,13 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v11, s[6:7]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v6, s[6:7]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
@@ -5328,17 +5328,17 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v0
 ; GFX10-NEXT:    s_movk_i32 s10, 0x7f
-; GFX10-NEXT:    s_lshr_b32 s8, s1, 31
-; GFX10-NEXT:    v_and_b32_e32 v13, s10, v0
 ; GFX10-NEXT:    s_mov_b32 s9, 0
-; GFX10-NEXT:    v_and_b32_e32 v12, s10, v1
+; GFX10-NEXT:    v_and_b32_e32 v13, s10, v0
 ; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
+; GFX10-NEXT:    v_and_b32_e32 v12, s10, v1
+; GFX10-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 64, v13
 ; GFX10-NEXT:    s_or_b64 s[8:9], s[2:3], s[8:9]
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v12
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v12, s[8:9]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v14, 64, v13
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v13, s[4:5]
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, s[0:1]
@@ -5357,15 +5357,15 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v13, s[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v8, s8, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, s9, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
 ; GFX10-NEXT:    v_or_b32_e32 v0, v6, v0
 ; GFX10-NEXT:    v_or_b32_e32 v1, v4, v1
@@ -5404,20 +5404,20 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
 ; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX6-NEXT:    s_sub_i32 s5, 64, s8
 ; GFX6-NEXT:    s_sub_i32 s4, s8, 64
+; GFX6-NEXT:    s_sub_i32 s5, 64, s8
 ; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s8
 ; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s5
-; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], s8
 ; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], s8
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], s4
 ; GFX6-NEXT:    s_and_b32 s4, 1, s6
-; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX6-NEXT:    s_and_b32 s4, 1, s7
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
@@ -5460,20 +5460,20 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
 ; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-NEXT:    s_sub_i32 s5, 64, s8
 ; GFX8-NEXT:    s_sub_i32 s4, s8, 64
+; GFX8-NEXT:    s_sub_i32 s5, 64, s8
 ; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s8, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s5, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s8, v[2:3]
 ; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s8, v[2:3]
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], s4, v[2:3]
 ; GFX8-NEXT:    s_and_b32 s4, 1, s6
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX8-NEXT:    s_and_b32 s4, 1, s7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
@@ -5516,20 +5516,20 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
 ; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX9-NEXT:    s_sub_i32 s5, 64, s8
 ; GFX9-NEXT:    s_sub_i32 s4, s8, 64
+; GFX9-NEXT:    s_sub_i32 s5, 64, s8
 ; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s8, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s5, v[2:3]
-; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s8, v[2:3]
 ; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s8, v[2:3]
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], s4, v[2:3]
 ; GFX9-NEXT:    s_and_b32 s4, 1, s6
-; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX9-NEXT:    s_and_b32 s4, 1, s7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
@@ -5628,18 +5628,18 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX6-NEXT:    s_and_b32 s4, 1, s9
 ; GFX6-NEXT:    s_sub_i32 s10, s8, 64
 ; GFX6-NEXT:    s_sub_i32 s9, 64, s8
-; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX6-NEXT:    v_or_b32_e32 v6, v0, v6
 ; GFX6-NEXT:    v_or_b32_e32 v7, v1, v7
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[4:5], s5
+; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[2:3], s8
 ; GFX6-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
 ; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s9
@@ -5682,18 +5682,18 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX8-NEXT:    s_and_b32 s4, 1, s9
 ; GFX8-NEXT:    s_sub_i32 s10, s8, 64
 ; GFX8-NEXT:    s_sub_i32 s9, 64, s8
-; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX8-NEXT:    v_or_b32_e32 v6, v0, v6
 ; GFX8-NEXT:    v_or_b32_e32 v7, v1, v7
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s5, v[4:5]
+; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[2:3], s8
 ; GFX8-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
 ; GFX8-NEXT:    s_lshl_b64 s[8:9], s[2:3], s9
@@ -5736,18 +5736,18 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX9-NEXT:    s_and_b32 s4, 1, s9
 ; GFX9-NEXT:    s_sub_i32 s10, s8, 64
 ; GFX9-NEXT:    s_sub_i32 s9, 64, s8
-; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX9-NEXT:    v_or_b32_e32 v6, v0, v6
 ; GFX9-NEXT:    v_or_b32_e32 v7, v1, v7
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s5, v[4:5]
+; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[2:3], s8
 ; GFX9-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
 ; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s9
@@ -5778,8 +5778,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_sub_i32 s4, 64, s8
 ; GFX10-NEXT:    s_sub_i32 s5, s8, 64
-; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
+; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s8, v[2:3]
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
@@ -5865,13 +5865,13 @@ define amdgpu_ps i128 @s_fshr_i128_65(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-LABEL: s_fshr_i128_65:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    s_lshl_b32 s3, s2, 31
 ; GFX10-NEXT:    s_lshl_b32 s5, s0, 31
+; GFX10-NEXT:    s_lshl_b32 s3, s2, 31
 ; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    s_lshr_b64 s[8:9], s[0:1], 1
 ; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], 1
-; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX10-NEXT:    s_lshr_b64 s[8:9], s[0:1], 1
 ; GFX10-NEXT:    s_or_b64 s[0:1], s[4:5], s[6:7]
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 65)
   ret i128 %result
@@ -5881,9 +5881,9 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
 ; GFX6-LABEL: v_fshr_i128_65:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 31, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 31, v2
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[0:1], 1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 31, v0
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[6:7], 1
 ; GFX6-NEXT:    v_or_b32_e32 v3, v5, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v4, v1
@@ -5892,9 +5892,9 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
 ; GFX8-LABEL: v_fshr_i128_65:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 31, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 31, v2
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 1, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 31, v0
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 1, v[6:7]
 ; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
 ; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
@@ -5903,9 +5903,9 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
 ; GFX9-LABEL: v_fshr_i128_65:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 31, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 31, v2
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 1, v[0:1]
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 31, v0
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 1, v[6:7]
 ; GFX9-NEXT:    v_or_b32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_or_b32_e32 v1, v4, v1
@@ -5936,8 +5936,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
 ; GFX6-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
 ; GFX6-NEXT:    s_lshl_b64 s[24:25], s[0:1], 1
-; GFX6-NEXT:    s_lshr_b32 s0, s1, 31
 ; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
+; GFX6-NEXT:    s_lshr_b32 s0, s1, 31
 ; GFX6-NEXT:    s_mov_b32 s1, s19
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX6-NEXT:    s_sub_i32 s23, s16, 64
@@ -5963,8 +5963,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_cmp_eq_u32 s22, 0
 ; GFX6-NEXT:    s_cselect_b32 s28, 1, 0
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[10:11], s22
-; GFX6-NEXT:    s_lshl_b64 s[24:25], s[10:11], s24
 ; GFX6-NEXT:    s_lshr_b64 s[22:23], s[8:9], s22
+; GFX6-NEXT:    s_lshl_b64 s[24:25], s[10:11], s24
 ; GFX6-NEXT:    s_or_b64 s[22:23], s[22:23], s[24:25]
 ; GFX6-NEXT:    s_lshr_b64 s[10:11], s[10:11], s26
 ; GFX6-NEXT:    s_cmp_lg_u32 s27, 0
@@ -5975,8 +5975,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
 ; GFX6-NEXT:    s_or_b64 s[2:3], s[16:17], s[10:11]
-; GFX6-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
 ; GFX6-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
+; GFX6-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
 ; GFX6-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
 ; GFX6-NEXT:    s_lshr_b32 s18, s5, 31
 ; GFX6-NEXT:    s_lshl_b64 s[16:17], s[4:5], 1
@@ -6004,8 +6004,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX6-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[14:15], s8
-; GFX6-NEXT:    s_lshl_b64 s[16:17], s[14:15], s16
 ; GFX6-NEXT:    s_lshr_b64 s[8:9], s[12:13], s8
+; GFX6-NEXT:    s_lshl_b64 s[16:17], s[14:15], s16
 ; GFX6-NEXT:    s_or_b64 s[8:9], s[8:9], s[16:17]
 ; GFX6-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
 ; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
@@ -6025,8 +6025,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
 ; GFX8-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
 ; GFX8-NEXT:    s_lshl_b64 s[24:25], s[0:1], 1
-; GFX8-NEXT:    s_lshr_b32 s0, s1, 31
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
+; GFX8-NEXT:    s_lshr_b32 s0, s1, 31
 ; GFX8-NEXT:    s_mov_b32 s1, s19
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-NEXT:    s_sub_i32 s23, s16, 64
@@ -6052,8 +6052,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_cmp_eq_u32 s22, 0
 ; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[10:11], s22
-; GFX8-NEXT:    s_lshl_b64 s[24:25], s[10:11], s24
 ; GFX8-NEXT:    s_lshr_b64 s[22:23], s[8:9], s22
+; GFX8-NEXT:    s_lshl_b64 s[24:25], s[10:11], s24
 ; GFX8-NEXT:    s_or_b64 s[22:23], s[22:23], s[24:25]
 ; GFX8-NEXT:    s_lshr_b64 s[10:11], s[10:11], s26
 ; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
@@ -6064,8 +6064,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
 ; GFX8-NEXT:    s_or_b64 s[2:3], s[16:17], s[10:11]
-; GFX8-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
 ; GFX8-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
+; GFX8-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
 ; GFX8-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
 ; GFX8-NEXT:    s_lshr_b32 s18, s5, 31
 ; GFX8-NEXT:    s_lshl_b64 s[16:17], s[4:5], 1
@@ -6093,8 +6093,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[14:15], s8
-; GFX8-NEXT:    s_lshl_b64 s[16:17], s[14:15], s16
 ; GFX8-NEXT:    s_lshr_b64 s[8:9], s[12:13], s8
+; GFX8-NEXT:    s_lshl_b64 s[16:17], s[14:15], s16
 ; GFX8-NEXT:    s_or_b64 s[8:9], s[8:9], s[16:17]
 ; GFX8-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
 ; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
@@ -6114,8 +6114,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
 ; GFX9-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
 ; GFX9-NEXT:    s_lshl_b64 s[24:25], s[0:1], 1
-; GFX9-NEXT:    s_lshr_b32 s0, s1, 31
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
+; GFX9-NEXT:    s_lshr_b32 s0, s1, 31
 ; GFX9-NEXT:    s_mov_b32 s1, s19
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX9-NEXT:    s_sub_i32 s23, s16, 64
@@ -6141,8 +6141,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_cmp_eq_u32 s22, 0
 ; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[10:11], s22
-; GFX9-NEXT:    s_lshl_b64 s[24:25], s[10:11], s24
 ; GFX9-NEXT:    s_lshr_b64 s[22:23], s[8:9], s22
+; GFX9-NEXT:    s_lshl_b64 s[24:25], s[10:11], s24
 ; GFX9-NEXT:    s_or_b64 s[22:23], s[22:23], s[24:25]
 ; GFX9-NEXT:    s_lshr_b64 s[10:11], s[10:11], s26
 ; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
@@ -6153,8 +6153,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
 ; GFX9-NEXT:    s_or_b64 s[2:3], s[16:17], s[10:11]
-; GFX9-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
 ; GFX9-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
+; GFX9-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
 ; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
 ; GFX9-NEXT:    s_lshr_b32 s18, s5, 31
 ; GFX9-NEXT:    s_lshl_b64 s[16:17], s[4:5], 1
@@ -6182,8 +6182,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[14:15], s8
-; GFX9-NEXT:    s_lshl_b64 s[16:17], s[14:15], s16
 ; GFX9-NEXT:    s_lshr_b64 s[8:9], s[12:13], s8
+; GFX9-NEXT:    s_lshl_b64 s[16:17], s[14:15], s16
 ; GFX9-NEXT:    s_or_b64 s[8:9], s[8:9], s[16:17]
 ; GFX9-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
 ; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
@@ -6198,12 +6198,12 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ;
 ; GFX10-LABEL: s_fshr_v2i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s19, 0
 ; GFX10-NEXT:    s_movk_i32 s18, 0x7f
-; GFX10-NEXT:    s_lshr_b32 s24, s1, 31
+; GFX10-NEXT:    s_mov_b32 s19, 0
+; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
 ; GFX10-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
 ; GFX10-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
-; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
+; GFX10-NEXT:    s_lshr_b32 s24, s1, 31
 ; GFX10-NEXT:    s_mov_b32 s25, s19
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[24:25]
@@ -6243,8 +6243,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX10-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
-; GFX10-NEXT:    s_lshr_b32 s18, s5, 31
 ; GFX10-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
+; GFX10-NEXT:    s_lshr_b32 s18, s5, 31
 ; GFX10-NEXT:    s_or_b64 s[0:1], s[16:17], s[0:1]
 ; GFX10-NEXT:    s_lshl_b64 s[4:5], s[4:5], 1
 ; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[18:19]
@@ -6300,9 +6300,9 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v17
 ; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, 64, v23
-; GFX6-NEXT:    v_and_b32_e32 v24, s6, v16
 ; GFX6-NEXT:    v_lshr_b64 v[17:18], v[0:1], v17
 ; GFX6-NEXT:    v_lshl_b64 v[21:22], v[2:3], v23
+; GFX6-NEXT:    v_and_b32_e32 v24, s6, v16
 ; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 64, v24
 ; GFX6-NEXT:    v_or_b32_e32 v21, v17, v21
 ; GFX6-NEXT:    v_or_b32_e32 v22, v18, v22
@@ -6366,14 +6366,14 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_or_b32_e32 v11, v4, v6
 ; GFX6-NEXT:    v_or_b32_e32 v17, v5, v7
 ; GFX6-NEXT:    v_lshr_b64 v[6:7], v[14:15], v10
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[14:15], v16
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v11, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v17, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v10, 0, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v10, 0, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v11, 0, v5, vcc
 ; GFX6-NEXT:    v_or_b32_e32 v4, v18, v6
 ; GFX6-NEXT:    v_or_b32_e32 v5, v19, v7
@@ -6392,9 +6392,9 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v17
 ; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, 64, v23
-; GFX8-NEXT:    v_and_b32_e32 v24, s6, v16
 ; GFX8-NEXT:    v_lshrrev_b64 v[17:18], v17, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[21:22], v23, v[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v24, s6, v16
 ; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, 64, v24
 ; GFX8-NEXT:    v_or_b32_e32 v21, v17, v21
 ; GFX8-NEXT:    v_or_b32_e32 v22, v18, v22
@@ -6458,14 +6458,14 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_or_b32_e32 v11, v4, v6
 ; GFX8-NEXT:    v_or_b32_e32 v17, v5, v7
 ; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v10, v[14:15]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v16, v[14:15]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v17, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, 0, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, 0, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, 0, v5, vcc
 ; GFX8-NEXT:    v_or_b32_e32 v4, v18, v6
 ; GFX8-NEXT:    v_or_b32_e32 v5, v19, v7
@@ -6484,9 +6484,9 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v17
 ; GFX9-NEXT:    v_sub_u32_e32 v17, 64, v23
-; GFX9-NEXT:    v_and_b32_e32 v24, s6, v16
 ; GFX9-NEXT:    v_lshrrev_b64 v[17:18], v17, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[21:22], v23, v[2:3]
+; GFX9-NEXT:    v_and_b32_e32 v24, s6, v16
 ; GFX9-NEXT:    v_sub_u32_e32 v16, 64, v24
 ; GFX9-NEXT:    v_or_b32_e32 v21, v17, v21
 ; GFX9-NEXT:    v_or_b32_e32 v22, v18, v22
@@ -6521,8 +6521,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v20
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
 ; GFX9-NEXT:    v_or_b32_e32 v1, v18, v3
-; GFX9-NEXT:    v_and_b32_e32 v17, s6, v8
 ; GFX9-NEXT:    v_or_b32_e32 v3, v16, v9
+; GFX9-NEXT:    v_and_b32_e32 v17, s6, v8
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], 1, v[4:5]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
 ; GFX9-NEXT:    v_or_b32_e32 v6, v6, v4
@@ -6550,14 +6550,14 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_or_b32_e32 v11, v4, v6
 ; GFX9-NEXT:    v_or_b32_e32 v17, v5, v7
 ; GFX9-NEXT:    v_lshrrev_b64 v[6:7], v10, v[14:15]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v16, v[14:15]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v11, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v17, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, 0, v5, vcc
 ; GFX9-NEXT:    v_or_b32_e32 v4, v18, v6
 ; GFX9-NEXT:    v_or_b32_e32 v5, v19, v7
@@ -6608,18 +6608,18 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s4
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v26, v[10:11]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 31, v5
 ; GFX10-NEXT:    v_and_b32_e32 v25, s5, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 31, v5
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
 ; GFX10-NEXT:    v_or_b32_e32 v0, v23, v0
-; GFX10-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v9, 64, v25
+; GFX10-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX10-NEXT:    v_and_b32_e32 v23, s5, v20
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, v3, s4
-; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v25, v[6:7]
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v9, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v25, v[6:7]
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v20, 64, v23
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 64, v25
 ; GFX10-NEXT:    v_or_b32_e32 v2, v18, v2
@@ -6639,20 +6639,20 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v3, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b64 v[3:4], v23, v[14:15]
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v25
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v16, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v23
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v25
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v18, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0, v17, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v1, v24, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v8, v12, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v9, v13, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, v3, s4
-; GFX10-NEXT:    v_or_b32_e32 v1, v24, v1
-; GFX10-NEXT:    v_or_b32_e32 v4, v11, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s4
 ; GFX10-NEXT:    v_or_b32_e32 v3, v22, v26
+; GFX10-NEXT:    v_or_b32_e32 v4, v11, v5
 ; GFX10-NEXT:    v_or_b32_e32 v5, v14, v8
 ; GFX10-NEXT:    v_or_b32_e32 v6, v6, v9
 ; GFX10-NEXT:    v_or_b32_e32 v7, v7, v10

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index de3e08b9a2db2..c55fff82f117a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -143,9 +143,9 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    v_mov_b32_e32 v0, s48
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:496
 ; GCN-NEXT:    v_mov_b32_e32 v0, s49
-; GCN-NEXT:    s_and_b32 s4, s25, 63
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:500
 ; GCN-NEXT:    v_mov_b32_e32 v0, s50
+; GCN-NEXT:    s_and_b32 s4, s25, 63
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:504
 ; GCN-NEXT:    v_mov_b32_e32 v0, s51
 ; GCN-NEXT:    s_lshl_b32 s4, s4, 2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index 7ab87e4275cb9..4c04371e1dc87 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -92,9 +92,9 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    s_and_b32 s2, s2, s1
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, s0
 ; GFX9-NEXT:    s_lshl_b32 s0, s1, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_not_b32 s0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_or_b32 v2, v2, s0, v3
@@ -164,8 +164,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    s_and_b32 s1, s4, 1
-; GFX9-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
+; GFX9-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX9-NEXT:    v_and_b32_e32 v2, s2, v0
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -288,8 +288,8 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg %
 ;
 ; GFX10-LABEL: insertelement_s_v2i16_s_v:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, s1
@@ -362,8 +362,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(<2 x i16> addrspace(4)* inreg %
 ;
 ; GFX10-LABEL: insertelement_s_v2i16_v_v:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
@@ -402,8 +402,8 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i
 ; GFX8-LABEL: insertelement_v_v2i16_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
 ; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
 ; GFX8-NEXT:    s_and_b32 s1, s2, s0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
@@ -420,8 +420,8 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i
 ; GFX7-LABEL: insertelement_v_v2i16_s_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    flat_load_dword v0, v[0:1]
-; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
 ; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
 ; GFX7-NEXT:    s_and_b32 s1, s2, s0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, s1, v1
@@ -478,8 +478,8 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_and_b32 s1, s2, 1
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
 ; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    s_not_b32 s0, s0
@@ -552,8 +552,8 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(<2 x i16> addrspace(1)* %ptr, i
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
 ; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
@@ -568,8 +568,8 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(<2 x i16> addrspace(1)* %ptr, i
 ; GFX7-LABEL: insertelement_v_v2i16_v_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    flat_load_dword v0, v[0:1]
-; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
 ; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
 ; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
@@ -665,13 +665,13 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX9-LABEL: insertelement_v_v4i16_s_s:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s1, s3, 1
 ; GFX9-NEXT:    s_and_b32 s3, s3, 1
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    s_and_b32 s2, s2, s0
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 4
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, s3
+; GFX9-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
 ; GFX9-NEXT:    s_not_b32 s0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s2
@@ -780,9 +780,9 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v0, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
@@ -806,9 +806,9 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_andn2_b32 s3, s3, s4
 ; GFX8-NEXT:    v_or_b32_e32 v4, s3, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
@@ -832,9 +832,9 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_or_b32_e32 v4, s3, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
 ; GFX7-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
@@ -994,12 +994,12 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(<4 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX9-NEXT:    s_mov_b32 s2, 0xffff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX9-NEXT:    v_and_or_b32 v4, v3, v1, v0
@@ -1020,12 +1020,12 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(<4 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX8-NEXT:    s_mov_b32 s2, 0xffff
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, v3, v1
@@ -1048,12 +1048,12 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(<4 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s2, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s2, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s2, v1
 ; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
@@ -1102,9 +1102,9 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(<4 x i16> addrspace(1)* %ptr, i
 ; GFX9-LABEL: insertelement_v_v4i16_s_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    s_and_b32 s1, s2, s0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v6, v2, s1
@@ -1125,9 +1125,9 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(<4 x i16> addrspace(1)* %ptr, i
 ; GFX8-LABEL: insertelement_v_v4i16_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    s_and_b32 s1, s2, s0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v6, v2, s1
@@ -1149,9 +1149,9 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(<4 x i16> addrspace(1)* %ptr, i
 ; GFX7-LABEL: insertelement_v_v4i16_s_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    s_and_b32 s1, s2, s0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v6, s1, v2
@@ -1204,8 +1204,8 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_lshr_b32 s1, s2, 1
 ; GFX9-NEXT:    s_and_b32 s2, s2, 1
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 4
 ; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 4
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -1226,10 +1226,10 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_lshr_b32 s1, s2, 1
 ; GFX8-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 4
 ; GFX8-NEXT:    s_mov_b32 s0, 0xffff
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 4
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s2
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    s_not_b32 s0, s0
@@ -1302,8 +1302,8 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(<4 x i16> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
 ; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
@@ -1324,8 +1324,8 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(<4 x i16> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
 ; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
@@ -1345,9 +1345,9 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(<4 x i16> addrspace(1)* %ptr, i
 ; GFX7-LABEL: insertelement_v_v4i16_v_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
@@ -1693,15 +1693,15 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_lshl_or_b32 v6, v0, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
@@ -1729,15 +1729,15 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_andn2_b32 s4, s6, s4
 ; GFX8-NEXT:    v_or_b32_e32 v6, s4, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
@@ -1765,11 +1765,11 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_or_b32_e32 v4, s4, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -1826,9 +1826,9 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_mov_b32 s5, 0xffff
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s9
@@ -1837,11 +1837,11 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX9-NEXT:    s_and_b32 s4, s4, s5
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
-; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
+; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX9-NEXT:    v_and_or_b32 v6, v1, v0, v2
@@ -1863,9 +1863,9 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    s_mov_b32 s5, 0xffff
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s9
@@ -1874,11 +1874,11 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX8-NEXT:    s_and_b32 s4, s4, s5
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
-; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
+; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
@@ -1901,9 +1901,9 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    s_mov_b32 s5, 0xffff
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s9
@@ -1912,11 +1912,11 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX7-NEXT:    s_and_b32 s4, s4, s5
-; GFX7-NEXT:    v_lshl_b32_e32 v2, s4, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v0, s5, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s4, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, s5, v0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
@@ -1926,10 +1926,10 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
@@ -1980,21 +1980,21 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX9-NEXT:    s_mov_b32 s8, 0xffff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX9-NEXT:    v_and_or_b32 v6, v2, v1, v0
@@ -2016,21 +2016,21 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX8-NEXT:    s_mov_b32 s8, 0xffff
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, v2, v1
@@ -2053,9 +2053,9 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX7-NEXT:    s_mov_b32 s8, 0xffff
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
@@ -2064,11 +2064,11 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s8, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s8, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s8, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
@@ -2078,10 +2078,10 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
@@ -2091,11 +2091,11 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %
 ; GFX10-LABEL: insertelement_s_v8i16_v_v:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 1, v1
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 1, v1
 ; GFX10-NEXT:    s_mov_b32 s0, 0xffff
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v6
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v2, s0
@@ -2130,9 +2130,9 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX9-LABEL: insertelement_v_v8i16_s_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    s_and_b32 s1, s2, s0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -2159,9 +2159,9 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX8-LABEL: insertelement_v_v8i16_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    s_and_b32 s1, s2, s0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -2192,9 +2192,9 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    s_and_b32 s1, s2, s0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -2257,9 +2257,9 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX9-NEXT:    s_and_b32 s1, s2, 1
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s4, s2, 1
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -2285,9 +2285,9 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
 ; GFX8-NEXT:    s_and_b32 s1, s2, 1
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s4, s2, 1
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s1
@@ -2318,8 +2318,8 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_and_b32 s1, s2, 1
-; GFX7-NEXT:    s_lshr_b32 s4, s2, 1
 ; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_lshr_b32 s4, s2, 1
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v2
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
@@ -2382,9 +2382,9 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
@@ -2410,9 +2410,9 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
@@ -2440,12 +2440,12 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
@@ -2469,11 +2469,11 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX10-LABEL: insertelement_v_v8i16_v_v:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
 ; GFX10-NEXT:    s_mov_b32 s0, 0xffff
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v8, v0, s0
@@ -2539,19 +2539,19 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 4
 ; GFX9-NEXT:    s_cselect_b32 s4, s16, s12
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_cselect_b32 s5, s16, s13
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_cselect_b32 s6, s16, s14
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 7
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_cselect_b32 s7, s16, s15
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_mov_b64 s[0:1], 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
@@ -2598,19 +2598,19 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    s_cmp_eq_u32 s7, 4
 ; GFX8-NEXT:    s_cselect_b32 s4, s16, s12
 ; GFX8-NEXT:    s_cmp_eq_u32 s7, 5
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_cselect_b32 s5, s16, s13
 ; GFX8-NEXT:    s_cmp_eq_u32 s7, 6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_cselect_b32 s6, s16, s14
 ; GFX8-NEXT:    s_cmp_eq_u32 s7, 7
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_cselect_b32 s7, s16, s15
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s6
@@ -2656,9 +2656,9 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX7-NEXT:    s_cmp_eq_u32 s7, 4
 ; GFX7-NEXT:    s_cselect_b32 s4, s16, s12
 ; GFX7-NEXT:    s_cmp_eq_u32 s7, 5
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    s_cselect_b32 s5, s16, s13
 ; GFX7-NEXT:    s_cmp_eq_u32 s7, 6
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    s_cselect_b32 s6, s16, s14
 ; GFX7-NEXT:    s_cmp_eq_u32 s7, 7
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
@@ -2667,10 +2667,10 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
-; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX7-NEXT:    s_cselect_b32 s7, s16, s15
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 16
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s7
@@ -2747,8 +2747,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
 ; GFX9-NEXT:    s_and_b32 s1, s3, 1
-; GFX9-NEXT:    s_lshr_b32 s12, s3, 1
 ; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_lshr_b32 s12, s3, 1
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
 ; GFX9-NEXT:    s_and_b32 s2, s2, s0
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
@@ -2774,8 +2774,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX9-NEXT:    v_and_or_b32 v10, v1, s13, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v10, s[12:13]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v10, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v10, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v10, s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v10, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v10, s[6:7]
@@ -2796,8 +2796,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; GFX8-NEXT:    s_and_b32 s1, s3, 1
-; GFX8-NEXT:    s_lshr_b32 s12, s3, 1
 ; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s12, s3, 1
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
 ; GFX8-NEXT:    s_and_b32 s2, s2, s0
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
@@ -2846,8 +2846,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX7-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[16:19], 0 addr64
 ; GFX7-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[16:19], 0 addr64 offset:16
 ; GFX7-NEXT:    s_and_b32 s1, s3, 1
-; GFX7-NEXT:    s_lshr_b32 s12, s3, 1
 ; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_lshr_b32 s12, s3, 1
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
 ; GFX7-NEXT:    s_and_b32 s2, s2, s0
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
@@ -2879,9 +2879,9 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v5, v10, s[2:3]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, v6, v10, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v7, v10, s[6:7]
-; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v6, v8, v10, s[8:9]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v7, v9, v10, s[10:11]
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GFX7-NEXT:    s_mov_b64 s[16:17], 16
 ; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0
 ; GFX7-NEXT:    s_endpgm
@@ -2968,25 +2968,25 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    v_lshl_or_b32 v8, v0, s1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s13
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 5
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s12
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, 4
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s15
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, 4
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
@@ -3027,31 +3027,31 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX8-NEXT:    v_or_b32_e32 v8, s0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s8
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s10
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s13
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s14
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s12
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, 4
 ; GFX8-NEXT:    v_mov_b32_e32 v7, s15
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, 4
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 0
-; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
@@ -3086,32 +3086,32 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX7-NEXT:    v_or_b32_e32 v8, s0, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s10
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 2
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s11
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 3
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s13
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 5
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s12
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, 4
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s14
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, 4
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 6
+; GFX7-NEXT:    v_mov_b32_e32 v7, s15
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 7
-; GFX7-NEXT:    v_mov_b32_e32 v7, s15
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 16
 ; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
@@ -3197,21 +3197,21 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s20
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[14:15], 4, v8
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s21
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[14:15]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s22
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX9-NEXT:    s_and_b32 s4, s4, s5
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
-; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s23
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
+; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX9-NEXT:    v_and_or_b32 v9, v1, v0, v2
@@ -3224,9 +3224,9 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s22
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s23
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[14:15]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
@@ -3257,21 +3257,21 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s20
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[14:15], 4, v8
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s21
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[14:15]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX8-NEXT:    v_mov_b32_e32 v7, s22
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX8-NEXT:    s_and_b32 s4, s4, s5
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
-; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s23
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
+; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
 ; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
@@ -3294,8 +3294,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 0
-; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
@@ -3318,21 +3318,21 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s20
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[14:15], 4, v8
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s21
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[14:15]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX7-NEXT:    v_mov_b32_e32 v7, s22
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX7-NEXT:    s_and_b32 s4, s4, s5
-; GFX7-NEXT:    v_lshl_b32_e32 v2, s4, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v0, s5, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v9, s23
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s4, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, s5, v0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
 ; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
@@ -3346,18 +3346,18 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s21
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s22
 ; GFX7-NEXT:    v_mov_b32_e32 v7, s23
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[14:15]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 16
 ; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
@@ -3438,20 +3438,20 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s16
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v8
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s17
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s18
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[6:7]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX9-NEXT:    s_mov_b32 s20, 0xffff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s20
 ; GFX9-NEXT:    v_mov_b32_e32 v10, s19
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s20
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
 ; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX9-NEXT:    v_and_or_b32 v9, v2, v1, v0
@@ -3464,9 +3464,9 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s18
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s19
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
@@ -3497,20 +3497,20 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s16
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[2:3]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v8
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v7, s17
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s18
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[6:7]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX8-NEXT:    s_mov_b32 s20, 0xffff
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s20
 ; GFX8-NEXT:    v_mov_b32_e32 v10, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s20
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
 ; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, v2, v1
@@ -3533,8 +3533,8 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 0
-; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
@@ -3557,21 +3557,21 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s16
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[2:3]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v8
-; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT:    s_mov_b32 s20, 0xffff
 ; GFX7-NEXT:    v_mov_b32_e32 v7, s17
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT:    s_mov_b32 s20, 0xffff
 ; GFX7-NEXT:    v_mov_b32_e32 v9, s18
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[6:7]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s20, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s20, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v10, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s20, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
@@ -3585,18 +3585,18 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s18
 ; GFX7-NEXT:    v_mov_b32_e32 v7, s19
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 16
 ; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
@@ -3619,9 +3619,9 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v10
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s9
-; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, s8, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s10, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s11, s1
@@ -3663,9 +3663,9 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    s_and_b32 s1, s2, s0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -3689,8 +3689,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v9, s[8:9]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v10, s[10:11]
 ; GFX9-NEXT:    v_and_or_b32 v11, v11, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
@@ -3711,9 +3711,9 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx4 v[7:10], v[7:8]
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    s_and_b32 s1, s2, s0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -3747,8 +3747,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
-; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
@@ -3761,9 +3761,9 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX7-NEXT:    s_mov_b64 s[16:17], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64
 ; GFX7-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    s_and_b32 s1, s2, s0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -3794,10 +3794,10 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
-; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GFX7-NEXT:    s_mov_b64 s[16:17], 16
 ; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0
 ; GFX7-NEXT:    s_endpgm
@@ -3860,9 +3860,9 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
 ; GFX9-NEXT:    s_and_b32 s1, s2, 1
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s12, s2, 1
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -3884,8 +3884,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[10:11]
 ; GFX9-NEXT:    v_and_or_b32 v11, v1, s13, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
@@ -3907,9 +3907,9 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx4 v[7:10], v[7:8]
 ; GFX8-NEXT:    s_and_b32 s1, s2, 1
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s12, s2, 1
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s1
@@ -3942,8 +3942,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
-; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
@@ -3957,8 +3957,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64
 ; GFX7-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16
 ; GFX7-NEXT:    s_and_b32 s1, s2, 1
-; GFX7-NEXT:    s_lshr_b32 s12, s2, 1
 ; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_lshr_b32 s12, s2, 1
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v2
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
@@ -3989,10 +3989,10 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
-; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GFX7-NEXT:    s_mov_b64 s[16:17], 16
 ; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0
 ; GFX7-NEXT:    s_endpgm
@@ -4055,9 +4055,9 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
@@ -4102,9 +4102,9 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
@@ -4135,8 +4135,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v12, s[2:3]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v10, v12, s[8:9]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v11, v12, s[10:11]
-; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
@@ -4149,12 +4149,12 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX7-NEXT:    s_mov_b64 s[16:17], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[16:19], 0 addr64
 ; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[16:19], 0 addr64 offset:16
+; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
@@ -4181,11 +4181,11 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v12, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v12, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v12, s[2:3]
-; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, v8, v12, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v9, v12, s[6:7]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v6, v10, v12, s[8:9]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v7, v11, v12, s[10:11]
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GFX7-NEXT:    s_mov_b64 s[16:17], 16
 ; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0
 ; GFX7-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index 9a658af63641f..18d82efb02251 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -12,8 +12,8 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -32,8 +32,8 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -97,8 +97,8 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -115,8 +115,8 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -178,8 +178,8 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(<2 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    global_load_ushort v1, v1, s[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -197,8 +197,8 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(<2 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    flat_load_ushort v1, v[1:2]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -262,8 +262,8 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v2, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -282,8 +282,8 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v2, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -430,8 +430,8 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -448,8 +448,8 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -510,8 +510,8 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v2, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -527,8 +527,8 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v2, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -588,8 +588,8 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -605,8 +605,8 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -721,9 +721,9 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX9-LABEL: insertelement_v_v4i8_s_s:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_and_b32 s3, s3, 3
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff
+; GFX9-NEXT:    s_and_b32 s3, s3, 3
 ; GFX9-NEXT:    s_mov_b32 s1, 16
 ; GFX9-NEXT:    s_and_b32 s2, s2, s4
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 3
@@ -850,11 +850,11 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    s_not_b32 s2, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s2, s0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX10-NEXT:    v_and_or_b32 v4, v0, s1, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_or3_b32 v2, v4, v2, v3
@@ -878,8 +878,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s3, s0, 24
 ; GFX9-NEXT:    s_and_b32 s6, s0, s5
-; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s6, s0
@@ -891,8 +891,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_andn2_b32 s0, s0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, s3, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_and_or_b32 v2, v0, s5, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v1
@@ -912,8 +912,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s2, s1, 24
 ; GFX8-NEXT:    s_and_b32 s3, s1, s0
-; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX8-NEXT:    s_or_b32 s3, s3, s5
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s3, s1
@@ -921,8 +921,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    s_and_b32 s2, s4, 3
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 3
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    s_andn2_b32 s0, s1, s0
 ; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
@@ -947,8 +947,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_bfe_u32 s3, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
 ; GFX7-NEXT:    s_and_b32 s2, s0, s5
-; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX7-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX7-NEXT:    s_or_b32 s2, s2, s3
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX7-NEXT:    s_or_b32 s0, s2, s0
@@ -963,8 +963,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_bfe_u32 v3, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s5, v0
-; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
@@ -1027,8 +1027,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s3, s0, 24
 ; GFX9-NEXT:    s_and_b32 s6, s0, s5
-; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s6, s0
@@ -1039,9 +1039,9 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
 ; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX9-NEXT:    v_and_or_b32 v0, s0, v0, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    s_mov_b32 s2, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_and_or_b32 v2, v0, s5, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v1
@@ -1062,8 +1062,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s2, s1, 24
 ; GFX8-NEXT:    s_and_b32 s3, s1, s0
-; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX8-NEXT:    s_or_b32 s3, s3, s5
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s3, s1
@@ -1098,8 +1098,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_bfe_u32 s3, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
 ; GFX7-NEXT:    s_and_b32 s2, s0, s5
-; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX7-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX7-NEXT:    s_or_b32 s2, s2, s3
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX7-NEXT:    s_or_b32 s0, s2, s0
@@ -1114,8 +1114,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_bfe_u32 v3, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s5, v0
-; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
@@ -1179,20 +1179,20 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x80008
 ; GFX9-NEXT:    s_lshr_b32 s3, s0, 24
 ; GFX9-NEXT:    s_and_b32 s5, s0, s4
-; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX9-NEXT:    s_or_b32 s5, s5, s6
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s4
 ; GFX9-NEXT:    s_or_b32 s0, s5, s0
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 24
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s4
 ; GFX9-NEXT:    s_or_b32 s0, s0, s3
 ; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
-; GFX9-NEXT:    v_and_or_b32 v0, s0, v1, v0
 ; GFX9-NEXT:    s_mov_b32 s1, 8
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_and_or_b32 v0, s0, v1, v0
 ; GFX9-NEXT:    s_mov_b32 s2, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_and_or_b32 v2, v0, s4, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v1
@@ -1213,13 +1213,13 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x80008
 ; GFX8-NEXT:    s_lshr_b32 s2, s1, 24
 ; GFX8-NEXT:    s_and_b32 s3, s1, s0
-; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80010
 ; GFX8-NEXT:    s_or_b32 s3, s3, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
 ; GFX8-NEXT:    s_or_b32 s1, s3, s1
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, s1, v1
@@ -1249,8 +1249,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_bfe_u32 s3, s0, 0x80008
 ; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
 ; GFX7-NEXT:    s_and_b32 s2, s0, s4
-; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX7-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80010
 ; GFX7-NEXT:    s_or_b32 s2, s2, s3
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX7-NEXT:    s_or_b32 s0, s2, s0
@@ -1264,8 +1264,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_bfe_u32 v3, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v0
-; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
@@ -1320,9 +1320,9 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX9-LABEL: insertelement_v_v4i8_s_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    s_movk_i32 s3, 0xff
+; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GFX9-NEXT:    s_mov_b32 s1, 16
 ; GFX9-NEXT:    s_and_b32 s2, s2, s3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
@@ -1354,9 +1354,9 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    s_and_b32 s1, s2, s0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v6, v2, s1
@@ -1468,11 +1468,11 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX9-LABEL: insertelement_v_v4i8_v_s:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_and_b32 s2, s2, 3
 ; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    s_and_b32 s2, s2, 3
 ; GFX9-NEXT:    s_mov_b32 s1, 16
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 3
 ; GFX9-NEXT:    s_movk_i32 s3, 0xff
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    s_lshl_b32 s2, s3, s2
 ; GFX9-NEXT:    s_not_b32 s2, s2
@@ -1594,11 +1594,11 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX10-NEXT:    v_and_or_b32 v4, v0, s0, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_or3_b32 v2, v4, v2, v3
@@ -1614,11 +1614,11 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX9-LABEL: insertelement_v_v4i8_v_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    v_and_b32_e32 v3, 3, v3
 ; GFX9-NEXT:    s_mov_b32 s0, 8
+; GFX9-NEXT:    v_and_b32_e32 v3, 3, v3
 ; GFX9-NEXT:    s_mov_b32 s1, 16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX9-NEXT:    s_movk_i32 s2, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v3, s2
 ; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v3
@@ -1633,8 +1633,8 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v8, v6
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v3, v0, v1, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
@@ -1649,11 +1649,11 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 16
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v6, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 16
@@ -1775,13 +1775,13 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, s9
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s7, s0
-; GFX9-NEXT:    s_bfe_u32 s7, s1, s8
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_bfe_u32 s7, s1, s8
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
 ; GFX9-NEXT:    s_and_b32 s2, s1, s6
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s9
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX9-NEXT:    s_bfe_u32 s1, s1, s9
 ; GFX9-NEXT:    s_or_b32 s2, s2, s7
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s1, s2, s1
@@ -1804,18 +1804,18 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_bfe_u32 s5, s0, s8
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 24
 ; GFX9-NEXT:    s_and_b32 s4, s0, s6
-; GFX9-NEXT:    s_bfe_u32 s0, s0, s9
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX9-NEXT:    s_bfe_u32 s0, s0, s9
 ; GFX9-NEXT:    s_or_b32 s4, s4, s5
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s4, s0
-; GFX9-NEXT:    s_bfe_u32 s4, s1, s8
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_bfe_u32 s4, s1, s8
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
 ; GFX9-NEXT:    s_and_b32 s2, s1, s6
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s9
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_bfe_u32 s1, s1, s9
 ; GFX9-NEXT:    s_or_b32 s2, s2, s4
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s1, s2, s1
@@ -1843,13 +1843,13 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, s9
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_or_b32 s0, s7, s0
-; GFX8-NEXT:    s_bfe_u32 s7, s1, s8
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_bfe_u32 s7, s1, s8
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s1, s6
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s9
 ; GFX8-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, s9
 ; GFX8-NEXT:    s_or_b32 s2, s2, s7
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s2, s1
@@ -1872,18 +1872,18 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_bfe_u32 s5, s0, s8
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 24
 ; GFX8-NEXT:    s_and_b32 s4, s0, s6
-; GFX8-NEXT:    s_bfe_u32 s0, s0, s9
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX8-NEXT:    s_bfe_u32 s0, s0, s9
 ; GFX8-NEXT:    s_or_b32 s4, s4, s5
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_or_b32 s0, s4, s0
-; GFX8-NEXT:    s_bfe_u32 s4, s1, s8
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_bfe_u32 s4, s1, s8
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s1, s6
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s9
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, s9
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s2, s1
@@ -1909,13 +1909,13 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, s9
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX7-NEXT:    s_or_b32 s0, s7, s0
-; GFX7-NEXT:    s_bfe_u32 s7, s1, s8
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX7-NEXT:    s_or_b32 s0, s0, s2
+; GFX7-NEXT:    s_bfe_u32 s7, s1, s8
 ; GFX7-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
 ; GFX7-NEXT:    s_and_b32 s2, s1, s6
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s9
 ; GFX7-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX7-NEXT:    s_bfe_u32 s1, s1, s9
 ; GFX7-NEXT:    s_or_b32 s2, s2, s7
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s1, s2, s1
@@ -1938,8 +1938,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_bfe_u32 s10, s4, s8
 ; GFX7-NEXT:    s_lshr_b32 s2, s4, 24
 ; GFX7-NEXT:    s_and_b32 s7, s4, s6
-; GFX7-NEXT:    s_bfe_u32 s4, s4, s9
 ; GFX7-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX7-NEXT:    s_bfe_u32 s4, s4, s9
 ; GFX7-NEXT:    s_or_b32 s7, s7, s10
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX7-NEXT:    s_or_b32 s4, s7, s4
@@ -1948,16 +1948,16 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_and_b32 s4, s3, s6
 ; GFX7-NEXT:    s_bfe_u32 s6, s3, s8
 ; GFX7-NEXT:    s_lshr_b32 s5, s3, 24
-; GFX7-NEXT:    s_bfe_u32 s3, s3, s9
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX7-NEXT:    s_bfe_u32 s3, s3, s9
 ; GFX7-NEXT:    s_or_b32 s4, s4, s6
 ; GFX7-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX7-NEXT:    s_or_b32 s3, s4, s3
 ; GFX7-NEXT:    s_lshl_b32 s4, s5, 24
 ; GFX7-NEXT:    s_or_b32 s3, s3, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1983,13 +1983,13 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, s6
 ; GFX10-NEXT:    s_lshl_b32 s11, s11, 8
 ; GFX10-NEXT:    s_lshl_b32 s13, s13, 8
-; GFX10-NEXT:    s_or_b32 s10, s10, s11
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX10-NEXT:    s_or_b32 s10, s10, s11
 ; GFX10-NEXT:    s_or_b32 s11, s12, s13
 ; GFX10-NEXT:    s_lshl_b32 s8, s8, 24
-; GFX10-NEXT:    s_or_b32 s0, s10, s0
 ; GFX10-NEXT:    s_lshl_b32 s9, s9, 24
+; GFX10-NEXT:    s_or_b32 s0, s10, s0
 ; GFX10-NEXT:    s_or_b32 s1, s11, s1
 ; GFX10-NEXT:    s_or_b32 s0, s0, s8
 ; GFX10-NEXT:    s_or_b32 s1, s1, s9
@@ -2011,8 +2011,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    s_and_b32 s5, s0, s2
 ; GFX10-NEXT:    s_lshr_b32 s8, s1, 24
 ; GFX10-NEXT:    s_and_b32 s2, s1, s2
-; GFX10-NEXT:    s_bfe_u32 s1, s1, s6
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX10-NEXT:    s_bfe_u32 s1, s1, s6
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX10-NEXT:    s_bfe_u32 s0, s0, s6
 ; GFX10-NEXT:    s_lshl_b32 s7, s7, 8
@@ -2042,9 +2042,9 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    s_mov_b32 s1, 16
+; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s5, s3, 2
 ; GFX9-NEXT:    s_and_b32 s3, s3, 3
-; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    s_and_b32 s2, s2, s4
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 3
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, s3
@@ -2075,15 +2075,15 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v9, v6
 ; GFX9-NEXT:    v_or3_b32 v1, v1, v5, v4
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -2111,14 +2111,14 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
@@ -2133,12 +2133,12 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
@@ -2171,13 +2171,13 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_or_b32_e32 v5, v6, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -2185,8 +2185,8 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_and_b32_e32 v2, s1, v2
 ; GFX7-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
@@ -2197,13 +2197,13 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_or_b32_e32 v5, v6, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -2221,8 +2221,8 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v3
@@ -2231,8 +2231,8 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX10-NEXT:    s_lshr_b32 s0, s3, 2
 ; GFX10-NEXT:    s_and_b32 s1, s3, 3
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v6, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
 ; GFX10-NEXT:    v_or3_b32 v1, v1, v7, v3
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 3
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s0, 0
@@ -2247,8 +2247,8 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, v5
@@ -2285,13 +2285,13 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, s10
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s8, s0
-; GFX9-NEXT:    s_bfe_u32 s8, s1, s9
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 24
-; GFX9-NEXT:    s_or_b32 s0, s0, s5
+; GFX9-NEXT:    s_bfe_u32 s8, s1, s9
 ; GFX9-NEXT:    s_lshr_b32 s6, s1, 24
+; GFX9-NEXT:    s_or_b32 s0, s0, s5
 ; GFX9-NEXT:    s_and_b32 s5, s1, s7
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s10
 ; GFX9-NEXT:    s_lshl_b32 s8, s8, 8
+; GFX9-NEXT:    s_bfe_u32 s1, s1, s10
 ; GFX9-NEXT:    s_or_b32 s5, s5, s8
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s1, s5, s1
@@ -2308,13 +2308,13 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v0, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
+; GFX9-NEXT:    s_mov_b32 s3, 16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    s_mov_b32 s3, 16
 ; GFX9-NEXT:    v_and_or_b32 v4, v0, s7, v4
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
@@ -2322,8 +2322,8 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_or3_b32 v0, v4, v0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_and_or_b32 v4, v1, s7, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_or3_b32 v1, v4, v1, v5
@@ -2347,13 +2347,13 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, s8
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_or_b32 s0, s6, s0
-; GFX8-NEXT:    s_bfe_u32 s6, s1, s7
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_bfe_u32 s6, s1, s7
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s1, s5
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s8
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, s8
 ; GFX8-NEXT:    s_or_b32 s2, s2, s6
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s2, s1
@@ -2372,9 +2372,9 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -2410,13 +2410,13 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, s8
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX7-NEXT:    s_or_b32 s0, s6, s0
-; GFX7-NEXT:    s_bfe_u32 s6, s1, s7
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX7-NEXT:    s_or_b32 s0, s0, s2
+; GFX7-NEXT:    s_bfe_u32 s6, s1, s7
 ; GFX7-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
 ; GFX7-NEXT:    s_and_b32 s2, s1, s5
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s8
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX7-NEXT:    s_bfe_u32 s1, s1, s8
 ; GFX7-NEXT:    s_or_b32 s2, s2, s6
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s1, s2, s1
@@ -2434,24 +2434,24 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v4, s5, v0
-; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX7-NEXT:    v_bfe_u32 v4, v1, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_bfe_u32 v4, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, s5, v1
-; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -2484,12 +2484,12 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    s_lshl_b32 s5, s10, 8
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX10-NEXT:    s_or_b32 s5, s9, s5
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX10-NEXT:    s_or_b32 s5, s9, s5
 ; GFX10-NEXT:    s_or_b32 s3, s11, s3
 ; GFX10-NEXT:    s_lshl_b32 s7, s7, 24
-; GFX10-NEXT:    s_or_b32 s0, s5, s0
 ; GFX10-NEXT:    s_lshl_b32 s8, s8, 24
+; GFX10-NEXT:    s_or_b32 s0, s5, s0
 ; GFX10-NEXT:    s_or_b32 s1, s3, s1
 ; GFX10-NEXT:    s_or_b32 s0, s0, s7
 ; GFX10-NEXT:    s_or_b32 s1, s1, s8
@@ -2513,13 +2513,13 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    v_and_or_b32 v3, v0, s2, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX10-NEXT:    v_and_or_b32 v5, v1, s2, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_or3_b32 v2, v3, v6, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or3_b32 v2, v3, v6, v2
 ; GFX10-NEXT:    v_or3_b32 v3, v5, v7, v4
 ; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX10-NEXT:    s_endpgm
@@ -2547,13 +2547,13 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, s10
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s8, s0
-; GFX9-NEXT:    s_bfe_u32 s8, s1, s9
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 24
-; GFX9-NEXT:    s_or_b32 s0, s0, s5
+; GFX9-NEXT:    s_bfe_u32 s8, s1, s9
 ; GFX9-NEXT:    s_lshr_b32 s6, s1, 24
+; GFX9-NEXT:    s_or_b32 s0, s0, s5
 ; GFX9-NEXT:    s_and_b32 s5, s1, s7
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s10
 ; GFX9-NEXT:    s_lshl_b32 s8, s8, 8
+; GFX9-NEXT:    s_bfe_u32 s1, s1, s10
 ; GFX9-NEXT:    s_or_b32 s5, s5, s8
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s1, s5, s1
@@ -2572,11 +2572,11 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
 ; GFX9-NEXT:    s_mov_b32 s2, 8
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s3, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    s_mov_b32 s3, 16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_and_or_b32 v4, v0, s7, v4
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -2585,8 +2585,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_or3_b32 v0, v4, v0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_and_or_b32 v4, v1, s7, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_or3_b32 v1, v4, v1, v5
@@ -2610,13 +2610,13 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, s8
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_or_b32 s0, s6, s0
-; GFX8-NEXT:    s_bfe_u32 s6, s1, s7
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_bfe_u32 s6, s1, s7
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s1, s5
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s8
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, s8
 ; GFX8-NEXT:    s_or_b32 s2, s2, s6
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s2, s1
@@ -2676,13 +2676,13 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, s8
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX7-NEXT:    s_or_b32 s0, s6, s0
-; GFX7-NEXT:    s_bfe_u32 s6, s1, s7
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX7-NEXT:    s_or_b32 s0, s0, s2
+; GFX7-NEXT:    s_bfe_u32 s6, s1, s7
 ; GFX7-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
 ; GFX7-NEXT:    s_and_b32 s2, s1, s5
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s8
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX7-NEXT:    s_bfe_u32 s1, s1, s8
 ; GFX7-NEXT:    s_or_b32 s2, s2, s6
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s1, s2, s1
@@ -2706,19 +2706,19 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v4, s5, v0
-; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX7-NEXT:    v_bfe_u32 v4, v1, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_bfe_u32 v4, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, s5, v1
-; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -2737,12 +2737,12 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    s_mov_b32 s3, 0x80008
 ; GFX10-NEXT:    s_movk_i32 s2, 0xff
 ; GFX10-NEXT:    s_mov_b32 s5, 0x80010
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX10-NEXT:    s_and_b32 s4, s4, s2
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, s4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v1, s2
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s8, s0, s3
@@ -2760,8 +2760,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    s_bfe_u32 s0, s0, s5
 ; GFX10-NEXT:    s_lshl_b32 s5, s8, 8
 ; GFX10-NEXT:    s_or_b32 s1, s1, s6
-; GFX10-NEXT:    s_lshl_b32 s3, s4, 24
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX10-NEXT:    s_lshl_b32 s3, s4, 24
 ; GFX10-NEXT:    s_or_b32 s4, s7, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    s_or_b32 s0, s4, s0
@@ -2815,35 +2815,35 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, s9
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s7, s0
-; GFX9-NEXT:    s_bfe_u32 s7, s1, s8
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 24
-; GFX9-NEXT:    s_or_b32 s0, s0, s4
+; GFX9-NEXT:    s_bfe_u32 s7, s1, s8
 ; GFX9-NEXT:    s_lshr_b32 s5, s1, 24
+; GFX9-NEXT:    s_or_b32 s0, s0, s4
 ; GFX9-NEXT:    s_and_b32 s4, s1, s6
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s9
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX9-NEXT:    s_bfe_u32 s1, s1, s9
 ; GFX9-NEXT:    s_or_b32 s4, s4, s7
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s1, s4, s1
 ; GFX9-NEXT:    s_lshl_b32 s4, s5, 24
 ; GFX9-NEXT:    s_or_b32 s1, s1, s4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX9-NEXT:    v_and_or_b32 v3, v3, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
 ; GFX9-NEXT:    s_mov_b32 s2, 8
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s3, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    s_mov_b32 s3, 16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_and_or_b32 v4, v0, s6, v4
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -2852,8 +2852,8 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_or3_b32 v0, v4, v0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_and_or_b32 v4, v1, s6, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_or3_b32 v1, v4, v1, v5
@@ -2877,24 +2877,24 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, s7
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_or_b32 s0, s5, s0
-; GFX8-NEXT:    s_bfe_u32 s5, s1, s6
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_bfe_u32 s5, s1, s6
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s1, s4
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s7
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, s7
 ; GFX8-NEXT:    s_or_b32 s2, s2, s5
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s2, s1
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, 24
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, v3, v1
@@ -2942,13 +2942,13 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, s7
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX7-NEXT:    s_or_b32 s0, s5, s0
-; GFX7-NEXT:    s_bfe_u32 s5, s1, s6
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
-; GFX7-NEXT:    s_or_b32 s0, s0, s2
+; GFX7-NEXT:    s_bfe_u32 s5, s1, s6
 ; GFX7-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX7-NEXT:    s_or_b32 s0, s0, s2
 ; GFX7-NEXT:    s_and_b32 s2, s1, s4
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s7
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX7-NEXT:    s_bfe_u32 s1, s1, s7
 ; GFX7-NEXT:    s_or_b32 s2, s2, s5
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s1, s2, s1
@@ -2956,11 +2956,11 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s4, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s4, v1
 ; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
@@ -2972,19 +2972,19 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v4, s4, v0
-; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX7-NEXT:    v_bfe_u32 v4, v1, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_bfe_u32 v4, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v1
-; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -3068,9 +3068,9 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    s_mov_b32 s1, 16
+; GFX9-NEXT:    s_movk_i32 s3, 0xff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 2, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX9-NEXT:    s_movk_i32 s3, 0xff
 ; GFX9-NEXT:    s_and_b32 s2, s2, s3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v8, v2, s2
@@ -3100,15 +3100,15 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v9, v2
 ; GFX9-NEXT:    v_or3_b32 v1, v1, v6, v5
 ; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
@@ -3119,9 +3119,9 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v6, 16
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 2, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    s_and_b32 s1, s2, s0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v10, v2, s1
@@ -3137,14 +3137,14 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v14
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v12
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v14
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v6
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
@@ -3158,12 +3158,12 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
@@ -3198,21 +3198,21 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
-; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
 ; GFX7-NEXT:    v_or_b32_e32 v8, v9, v10
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NEXT:    v_or_b32_e32 v0, v7, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v7, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v8, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
 ; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
 ; GFX7-NEXT:    v_and_b32_e32 v2, v5, v2
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
@@ -3223,13 +3223,13 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_or_b32_e32 v5, v6, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -3249,14 +3249,14 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s3, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX10-NEXT:    v_and_or_b32 v1, v1, s3, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v6, v3, s3
 ; GFX10-NEXT:    s_and_b32 s0, s2, s3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v8, v4
@@ -3270,8 +3270,8 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 16
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -3298,9 +3298,9 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    s_mov_b32 s1, 16
+; GFX9-NEXT:    s_movk_i32 s3, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s4, s2, 2
 ; GFX9-NEXT:    s_and_b32 s2, s2, 3
-; GFX9-NEXT:    s_movk_i32 s3, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    s_lshl_b32 s2, s3, s2
@@ -3329,15 +3329,15 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v9, v2
 ; GFX9-NEXT:    v_or3_b32 v1, v1, v6, v5
 ; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
@@ -3365,14 +3365,14 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v10
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v6
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
@@ -3387,12 +3387,12 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
@@ -3426,13 +3426,13 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX7-NEXT:    v_or_b32_e32 v6, v7, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v6, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
@@ -3440,8 +3440,8 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_and_b32_e32 v3, s1, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
@@ -3452,13 +3452,13 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_or_b32_e32 v5, v6, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -3474,8 +3474,8 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s3, v4
@@ -3526,11 +3526,11 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    s_mov_b32 s1, 16
+; GFX9-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX9-NEXT:    s_movk_i32 s2, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
@@ -3575,12 +3575,12 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-LABEL: insertelement_v_v8i8_v_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v7, 8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 2, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT:    v_mov_b32_e32 v7, 8
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 16
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX8-NEXT:    v_mov_b32_e32 v6, 0xff
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v11
@@ -3594,14 +3594,14 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v14
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 24, v12
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v14
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
@@ -3615,12 +3615,12 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v9
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -3634,9 +3634,9 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 2, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX7-NEXT:    v_and_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
@@ -3655,13 +3655,13 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
@@ -3682,12 +3682,12 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -3701,22 +3701,22 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_and_b32_e32 v4, 3, v3
 ; GFX10-NEXT:    s_mov_b32 s1, 16
 ; GFX10-NEXT:    s_movk_i32 s2, 0xff
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0xff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s2, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v8
 ; GFX10-NEXT:    v_and_or_b32 v1, v1, s2, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, v4, v5
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v10, v6
@@ -3730,13 +3730,13 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v2, v0, v5, v6
-; GFX10-NEXT:    v_and_or_b32 v3, v1, v5, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX10-NEXT:    v_and_or_b32 v3, v1, v5, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3768,33 +3768,33 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, s13
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s11, s0
-; GFX9-NEXT:    s_bfe_u32 s11, s1, s12
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
-; GFX9-NEXT:    s_or_b32 s0, s0, s6
+; GFX9-NEXT:    s_bfe_u32 s11, s1, s12
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 24
+; GFX9-NEXT:    s_or_b32 s0, s0, s6
 ; GFX9-NEXT:    s_and_b32 s6, s1, s10
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s13
 ; GFX9-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX9-NEXT:    s_bfe_u32 s1, s1, s13
 ; GFX9-NEXT:    s_or_b32 s6, s6, s11
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s1, s6, s1
 ; GFX9-NEXT:    s_lshl_b32 s6, s7, 24
 ; GFX9-NEXT:    s_bfe_u32 s7, s2, s12
-; GFX9-NEXT:    s_or_b32 s1, s1, s6
 ; GFX9-NEXT:    s_lshr_b32 s8, s2, 24
+; GFX9-NEXT:    s_or_b32 s1, s1, s6
 ; GFX9-NEXT:    s_and_b32 s6, s2, s10
-; GFX9-NEXT:    s_bfe_u32 s2, s2, s13
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX9-NEXT:    s_bfe_u32 s2, s2, s13
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX9-NEXT:    s_bfe_u32 s7, s3, s12
 ; GFX9-NEXT:    s_or_b32 s2, s6, s2
 ; GFX9-NEXT:    s_lshl_b32 s6, s8, 24
-; GFX9-NEXT:    s_or_b32 s2, s2, s6
+; GFX9-NEXT:    s_bfe_u32 s7, s3, s12
 ; GFX9-NEXT:    s_lshr_b32 s9, s3, 24
+; GFX9-NEXT:    s_or_b32 s2, s2, s6
 ; GFX9-NEXT:    s_and_b32 s6, s3, s10
-; GFX9-NEXT:    s_bfe_u32 s3, s3, s13
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX9-NEXT:    s_bfe_u32 s3, s3, s13
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX9-NEXT:    s_or_b32 s3, s6, s3
@@ -3825,38 +3825,38 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_bfe_u32 s9, s0, s12
 ; GFX9-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX9-NEXT:    s_and_b32 s8, s0, s10
-; GFX9-NEXT:    s_bfe_u32 s0, s0, s13
 ; GFX9-NEXT:    s_lshl_b32 s9, s9, 8
+; GFX9-NEXT:    s_bfe_u32 s0, s0, s13
 ; GFX9-NEXT:    s_or_b32 s8, s8, s9
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s8, s0
-; GFX9-NEXT:    s_bfe_u32 s8, s1, s12
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 24
-; GFX9-NEXT:    s_or_b32 s0, s0, s4
+; GFX9-NEXT:    s_bfe_u32 s8, s1, s12
 ; GFX9-NEXT:    s_lshr_b32 s5, s1, 24
+; GFX9-NEXT:    s_or_b32 s0, s0, s4
 ; GFX9-NEXT:    s_and_b32 s4, s1, s10
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s13
 ; GFX9-NEXT:    s_lshl_b32 s8, s8, 8
+; GFX9-NEXT:    s_bfe_u32 s1, s1, s13
 ; GFX9-NEXT:    s_or_b32 s4, s4, s8
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s1, s4, s1
 ; GFX9-NEXT:    s_lshl_b32 s4, s5, 24
 ; GFX9-NEXT:    s_bfe_u32 s5, s2, s12
-; GFX9-NEXT:    s_or_b32 s1, s1, s4
 ; GFX9-NEXT:    s_lshr_b32 s6, s2, 24
+; GFX9-NEXT:    s_or_b32 s1, s1, s4
 ; GFX9-NEXT:    s_and_b32 s4, s2, s10
-; GFX9-NEXT:    s_bfe_u32 s2, s2, s13
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX9-NEXT:    s_bfe_u32 s2, s2, s13
 ; GFX9-NEXT:    s_or_b32 s4, s4, s5
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX9-NEXT:    s_bfe_u32 s5, s3, s12
 ; GFX9-NEXT:    s_or_b32 s2, s4, s2
 ; GFX9-NEXT:    s_lshl_b32 s4, s6, 24
-; GFX9-NEXT:    s_or_b32 s2, s2, s4
+; GFX9-NEXT:    s_bfe_u32 s5, s3, s12
 ; GFX9-NEXT:    s_lshr_b32 s7, s3, 24
+; GFX9-NEXT:    s_or_b32 s2, s2, s4
 ; GFX9-NEXT:    s_and_b32 s4, s3, s10
-; GFX9-NEXT:    s_bfe_u32 s3, s3, s13
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX9-NEXT:    s_bfe_u32 s3, s3, s13
 ; GFX9-NEXT:    s_or_b32 s4, s4, s5
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX9-NEXT:    s_or_b32 s3, s4, s3
@@ -3886,33 +3886,33 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, s13
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_or_b32 s0, s11, s0
-; GFX8-NEXT:    s_bfe_u32 s11, s1, s12
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 24
-; GFX8-NEXT:    s_or_b32 s0, s0, s6
+; GFX8-NEXT:    s_bfe_u32 s11, s1, s12
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s6
 ; GFX8-NEXT:    s_and_b32 s6, s1, s10
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s13
 ; GFX8-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, s13
 ; GFX8-NEXT:    s_or_b32 s6, s6, s11
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s6, s1
 ; GFX8-NEXT:    s_lshl_b32 s6, s7, 24
 ; GFX8-NEXT:    s_bfe_u32 s7, s2, s12
-; GFX8-NEXT:    s_or_b32 s1, s1, s6
 ; GFX8-NEXT:    s_lshr_b32 s8, s2, 24
+; GFX8-NEXT:    s_or_b32 s1, s1, s6
 ; GFX8-NEXT:    s_and_b32 s6, s2, s10
-; GFX8-NEXT:    s_bfe_u32 s2, s2, s13
 ; GFX8-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX8-NEXT:    s_bfe_u32 s2, s2, s13
 ; GFX8-NEXT:    s_or_b32 s6, s6, s7
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    s_bfe_u32 s7, s3, s12
 ; GFX8-NEXT:    s_or_b32 s2, s6, s2
 ; GFX8-NEXT:    s_lshl_b32 s6, s8, 24
-; GFX8-NEXT:    s_or_b32 s2, s2, s6
+; GFX8-NEXT:    s_bfe_u32 s7, s3, s12
 ; GFX8-NEXT:    s_lshr_b32 s9, s3, 24
+; GFX8-NEXT:    s_or_b32 s2, s2, s6
 ; GFX8-NEXT:    s_and_b32 s6, s3, s10
-; GFX8-NEXT:    s_bfe_u32 s3, s3, s13
 ; GFX8-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX8-NEXT:    s_bfe_u32 s3, s3, s13
 ; GFX8-NEXT:    s_or_b32 s6, s6, s7
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX8-NEXT:    s_or_b32 s3, s6, s3
@@ -3943,38 +3943,38 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_bfe_u32 s9, s0, s12
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX8-NEXT:    s_and_b32 s8, s0, s10
-; GFX8-NEXT:    s_bfe_u32 s0, s0, s13
 ; GFX8-NEXT:    s_lshl_b32 s9, s9, 8
+; GFX8-NEXT:    s_bfe_u32 s0, s0, s13
 ; GFX8-NEXT:    s_or_b32 s8, s8, s9
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_or_b32 s0, s8, s0
-; GFX8-NEXT:    s_bfe_u32 s8, s1, s12
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 24
-; GFX8-NEXT:    s_or_b32 s0, s0, s4
+; GFX8-NEXT:    s_bfe_u32 s8, s1, s12
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s4
 ; GFX8-NEXT:    s_and_b32 s4, s1, s10
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s13
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, s13
 ; GFX8-NEXT:    s_or_b32 s4, s4, s8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s4, s1
 ; GFX8-NEXT:    s_lshl_b32 s4, s5, 24
 ; GFX8-NEXT:    s_bfe_u32 s5, s2, s12
-; GFX8-NEXT:    s_or_b32 s1, s1, s4
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 24
+; GFX8-NEXT:    s_or_b32 s1, s1, s4
 ; GFX8-NEXT:    s_and_b32 s4, s2, s10
-; GFX8-NEXT:    s_bfe_u32 s2, s2, s13
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX8-NEXT:    s_bfe_u32 s2, s2, s13
 ; GFX8-NEXT:    s_or_b32 s4, s4, s5
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    s_bfe_u32 s5, s3, s12
 ; GFX8-NEXT:    s_or_b32 s2, s4, s2
 ; GFX8-NEXT:    s_lshl_b32 s4, s6, 24
-; GFX8-NEXT:    s_or_b32 s2, s2, s4
+; GFX8-NEXT:    s_bfe_u32 s5, s3, s12
 ; GFX8-NEXT:    s_lshr_b32 s7, s3, 24
+; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_and_b32 s4, s3, s10
-; GFX8-NEXT:    s_bfe_u32 s3, s3, s13
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX8-NEXT:    s_bfe_u32 s3, s3, s13
 ; GFX8-NEXT:    s_or_b32 s4, s4, s5
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX8-NEXT:    s_or_b32 s3, s4, s3
@@ -4002,33 +4002,33 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, s13
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX7-NEXT:    s_or_b32 s0, s11, s0
-; GFX7-NEXT:    s_bfe_u32 s11, s1, s12
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 24
-; GFX7-NEXT:    s_or_b32 s0, s0, s6
+; GFX7-NEXT:    s_bfe_u32 s11, s1, s12
 ; GFX7-NEXT:    s_lshr_b32 s7, s1, 24
+; GFX7-NEXT:    s_or_b32 s0, s0, s6
 ; GFX7-NEXT:    s_and_b32 s6, s1, s10
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s13
 ; GFX7-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX7-NEXT:    s_bfe_u32 s1, s1, s13
 ; GFX7-NEXT:    s_or_b32 s6, s6, s11
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s1, s6, s1
 ; GFX7-NEXT:    s_lshl_b32 s6, s7, 24
 ; GFX7-NEXT:    s_bfe_u32 s7, s2, s12
-; GFX7-NEXT:    s_or_b32 s1, s1, s6
 ; GFX7-NEXT:    s_lshr_b32 s8, s2, 24
+; GFX7-NEXT:    s_or_b32 s1, s1, s6
 ; GFX7-NEXT:    s_and_b32 s6, s2, s10
-; GFX7-NEXT:    s_bfe_u32 s2, s2, s13
 ; GFX7-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX7-NEXT:    s_bfe_u32 s2, s2, s13
 ; GFX7-NEXT:    s_or_b32 s6, s6, s7
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX7-NEXT:    s_bfe_u32 s7, s3, s12
 ; GFX7-NEXT:    s_or_b32 s2, s6, s2
 ; GFX7-NEXT:    s_lshl_b32 s6, s8, 24
-; GFX7-NEXT:    s_or_b32 s2, s2, s6
+; GFX7-NEXT:    s_bfe_u32 s7, s3, s12
 ; GFX7-NEXT:    s_lshr_b32 s9, s3, 24
+; GFX7-NEXT:    s_or_b32 s2, s2, s6
 ; GFX7-NEXT:    s_and_b32 s6, s3, s10
-; GFX7-NEXT:    s_bfe_u32 s3, s3, s13
 ; GFX7-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX7-NEXT:    s_bfe_u32 s3, s3, s13
 ; GFX7-NEXT:    s_or_b32 s6, s6, s7
 ; GFX7-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX7-NEXT:    s_or_b32 s3, s6, s3
@@ -4059,8 +4059,8 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_bfe_u32 s14, s5, s12
 ; GFX7-NEXT:    s_lshr_b32 s4, s5, 24
 ; GFX7-NEXT:    s_and_b32 s11, s5, s10
-; GFX7-NEXT:    s_bfe_u32 s5, s5, s13
 ; GFX7-NEXT:    s_lshl_b32 s14, s14, 8
+; GFX7-NEXT:    s_bfe_u32 s5, s5, s13
 ; GFX7-NEXT:    s_or_b32 s11, s11, s14
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX7-NEXT:    s_or_b32 s5, s11, s5
@@ -4069,28 +4069,28 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_lshr_b32 s6, s7, 24
 ; GFX7-NEXT:    s_or_b32 s4, s5, s4
 ; GFX7-NEXT:    s_and_b32 s5, s7, s10
-; GFX7-NEXT:    s_bfe_u32 s7, s7, s13
 ; GFX7-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX7-NEXT:    s_bfe_u32 s7, s7, s13
 ; GFX7-NEXT:    s_or_b32 s5, s5, s11
 ; GFX7-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX7-NEXT:    s_or_b32 s5, s5, s7
-; GFX7-NEXT:    s_bfe_u32 s7, s2, s12
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 24
-; GFX7-NEXT:    s_or_b32 s5, s5, s6
+; GFX7-NEXT:    s_bfe_u32 s7, s2, s12
 ; GFX7-NEXT:    s_lshr_b32 s8, s2, 24
+; GFX7-NEXT:    s_or_b32 s5, s5, s6
 ; GFX7-NEXT:    s_and_b32 s6, s2, s10
-; GFX7-NEXT:    s_bfe_u32 s2, s2, s13
 ; GFX7-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX7-NEXT:    s_bfe_u32 s2, s2, s13
 ; GFX7-NEXT:    s_or_b32 s6, s6, s7
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX7-NEXT:    s_bfe_u32 s7, s3, s12
 ; GFX7-NEXT:    s_or_b32 s2, s6, s2
 ; GFX7-NEXT:    s_lshl_b32 s6, s8, 24
-; GFX7-NEXT:    s_or_b32 s6, s2, s6
+; GFX7-NEXT:    s_bfe_u32 s7, s3, s12
 ; GFX7-NEXT:    s_lshr_b32 s9, s3, 24
+; GFX7-NEXT:    s_or_b32 s6, s2, s6
 ; GFX7-NEXT:    s_and_b32 s2, s3, s10
-; GFX7-NEXT:    s_bfe_u32 s3, s3, s13
 ; GFX7-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX7-NEXT:    s_bfe_u32 s3, s3, s13
 ; GFX7-NEXT:    s_or_b32 s2, s2, s7
 ; GFX7-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX7-NEXT:    s_or_b32 s2, s2, s3
@@ -4117,35 +4117,35 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s14, s0, s7
 ; GFX10-NEXT:    s_lshr_b32 s9, s0, 24
-; GFX10-NEXT:    s_bfe_u32 s16, s1, s7
 ; GFX10-NEXT:    s_and_b32 s13, s0, s6
 ; GFX10-NEXT:    s_bfe_u32 s0, s0, s8
+; GFX10-NEXT:    s_bfe_u32 s16, s1, s7
 ; GFX10-NEXT:    s_lshl_b32 s14, s14, 8
 ; GFX10-NEXT:    s_lshr_b32 s10, s1, 24
 ; GFX10-NEXT:    s_and_b32 s15, s1, s6
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, s8
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX10-NEXT:    s_or_b32 s13, s13, s14
 ; GFX10-NEXT:    s_lshl_b32 s16, s16, 8
+; GFX10-NEXT:    s_or_b32 s13, s13, s14
 ; GFX10-NEXT:    s_bfe_u32 s18, s2, s7
 ; GFX10-NEXT:    s_lshl_b32 s9, s9, 24
-; GFX10-NEXT:    s_or_b32 s0, s13, s0
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX10-NEXT:    s_or_b32 s14, s15, s16
-; GFX10-NEXT:    s_or_b32 s0, s0, s9
+; GFX10-NEXT:    s_or_b32 s0, s13, s0
 ; GFX10-NEXT:    s_lshr_b32 s11, s2, 24
 ; GFX10-NEXT:    s_and_b32 s17, s2, s6
-; GFX10-NEXT:    s_lshl_b32 s9, s18, 8
-; GFX10-NEXT:    s_bfe_u32 s2, s2, s8
 ; GFX10-NEXT:    s_lshl_b32 s10, s10, 24
 ; GFX10-NEXT:    s_or_b32 s1, s14, s1
+; GFX10-NEXT:    s_or_b32 s0, s0, s9
+; GFX10-NEXT:    s_lshl_b32 s9, s18, 8
+; GFX10-NEXT:    s_bfe_u32 s2, s2, s8
 ; GFX10-NEXT:    s_or_b32 s9, s17, s9
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX10-NEXT:    s_or_b32 s1, s1, s10
 ; GFX10-NEXT:    s_bfe_u32 s10, s3, s7
+; GFX10-NEXT:    s_lshr_b32 s12, s3, 24
 ; GFX10-NEXT:    s_or_b32 s2, s9, s2
 ; GFX10-NEXT:    s_lshl_b32 s9, s11, 24
-; GFX10-NEXT:    s_lshr_b32 s12, s3, 24
 ; GFX10-NEXT:    s_and_b32 s11, s3, s6
 ; GFX10-NEXT:    s_lshl_b32 s10, s10, 8
 ; GFX10-NEXT:    s_bfe_u32 s3, s3, s8
@@ -4191,20 +4191,20 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    s_and_b32 s12, s1, s6
 ; GFX10-NEXT:    s_lshl_b32 s10, s10, 8
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, s8
+; GFX10-NEXT:    s_or_b32 s10, s12, s10
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX10-NEXT:    s_or_b32 s0, s0, s4
 ; GFX10-NEXT:    s_lshl_b32 s4, s5, 24
 ; GFX10-NEXT:    s_bfe_u32 s5, s2, s7
-; GFX10-NEXT:    s_or_b32 s10, s12, s10
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s9, s2, 24
 ; GFX10-NEXT:    s_or_b32 s1, s10, s1
 ; GFX10-NEXT:    s_and_b32 s10, s2, s6
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX10-NEXT:    s_bfe_u32 s2, s2, s8
-; GFX10-NEXT:    s_or_b32 s1, s1, s4
-; GFX10-NEXT:    s_bfe_u32 s4, s3, s7
 ; GFX10-NEXT:    s_or_b32 s5, s10, s5
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX10-NEXT:    s_or_b32 s1, s1, s4
+; GFX10-NEXT:    s_bfe_u32 s4, s3, s7
 ; GFX10-NEXT:    s_lshr_b32 s11, s3, 24
 ; GFX10-NEXT:    s_or_b32 s2, s5, s2
 ; GFX10-NEXT:    s_and_b32 s5, s3, s6
@@ -4235,11 +4235,11 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    s_mov_b32 s1, 16
-; GFX9-NEXT:    v_mov_b32_e32 v6, 8
 ; GFX9-NEXT:    s_movk_i32 s6, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v6, 8
+; GFX9-NEXT:    v_mov_b32_e32 v7, 16
 ; GFX9-NEXT:    s_lshr_b32 s4, s3, 2
 ; GFX9-NEXT:    s_and_b32 s3, s3, 3
-; GFX9-NEXT:    v_mov_b32_e32 v7, 16
 ; GFX9-NEXT:    s_and_b32 s2, s2, s6
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 3
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
@@ -4260,8 +4260,8 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s6, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX9-NEXT:    v_and_or_b32 v1, v1, s6, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -4281,9 +4281,9 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v3, s[2:3]
 ; GFX9-NEXT:    v_and_or_b32 v8, v9, s5, v8
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[2:3]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
@@ -4297,14 +4297,14 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v11
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s6, v12
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX9-NEXT:    v_and_or_b32 v1, v1, s6, v14
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX9-NEXT:    v_and_or_b32 v2, v2, s6, v16
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v11
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v13, v8
 ; GFX9-NEXT:    v_or3_b32 v1, v1, v15, v9
 ; GFX9-NEXT:    v_or3_b32 v2, v2, v17, v10
@@ -4316,8 +4316,8 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v6, 8
-; GFX8-NEXT:    v_mov_b32_e32 v8, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 16
+; GFX8-NEXT:    v_mov_b32_e32 v8, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 16
 ; GFX8-NEXT:    s_and_b32 s1, s3, 3
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
@@ -4336,24 +4336,24 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v11
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v18, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v11
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v15
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v19, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v17
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v10
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v6
@@ -4367,9 +4367,9 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_and_b32_e32 v6, s6, v6
 ; GFX8-NEXT:    v_or_b32_e32 v6, s5, v6
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -4378,22 +4378,22 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v13
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v15
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v17
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 24, v11
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v13
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v17
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v9
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
@@ -4425,36 +4425,36 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_bfe_u32 v11, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX7-NEXT:    v_bfe_u32 v13, v2, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v8, s6, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_and_b32_e32 v10, s6, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v13, v2, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
-; GFX7-NEXT:    v_bfe_u32 v15, v3, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v12, s6, v2
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT:    v_bfe_u32 v15, v3, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX7-NEXT:    v_and_b32_e32 v14, s6, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
 ; GFX7-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX7-NEXT:    v_or_b32_e32 v2, v10, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v11, v14, v15
+; GFX7-NEXT:    v_or_b32_e32 v2, v10, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
@@ -4467,45 +4467,45 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_and_b32_e32 v4, s7, v4
 ; GFX7-NEXT:    v_or_b32_e32 v4, s5, v4
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
 ; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v11, v1, 8, 8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX7-NEXT:    v_bfe_u32 v13, v2, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v8, s6, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_and_b32_e32 v10, s6, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v13, v2, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
 ; GFX7-NEXT:    v_and_b32_e32 v12, s6, v2
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v10, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s6, v3
-; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
@@ -4533,23 +4533,23 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_and_or_b32 v1, v1, s4, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX10-NEXT:    v_or3_b32 v0, v0, v11, v6
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_or3_b32 v1, v1, v13, v7
 ; GFX10-NEXT:    v_and_or_b32 v2, v2, s4, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX10-NEXT:    v_or3_b32 v0, v0, v11, v6
+; GFX10-NEXT:    v_or3_b32 v1, v1, v13, v7
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v3, v3, s4, v16
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_or3_b32 v2, v2, v15, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s5, 2
 ; GFX10-NEXT:    s_and_b32 s1, s3, 3
 ; GFX10-NEXT:    v_or3_b32 v3, v3, v10, v6
@@ -4563,12 +4563,12 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_and_or_b32 v6, v6, s3, s2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s5, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v12, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -4578,14 +4578,14 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v3, v3, s4, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX10-NEXT:    v_and_or_b32 v1, v1, s4, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX10-NEXT:    v_and_or_b32 v2, v2, s4, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX10-NEXT:    v_and_or_b32 v3, v3, s4, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v11, v6
@@ -4618,33 +4618,33 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, s14
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_or_b32 s0, s12, s0
-; GFX9-NEXT:    s_bfe_u32 s12, s1, s13
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 24
-; GFX9-NEXT:    s_or_b32 s0, s0, s7
+; GFX9-NEXT:    s_bfe_u32 s12, s1, s13
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 24
+; GFX9-NEXT:    s_or_b32 s0, s0, s7
 ; GFX9-NEXT:    s_and_b32 s7, s1, s11
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s14
 ; GFX9-NEXT:    s_lshl_b32 s12, s12, 8
+; GFX9-NEXT:    s_bfe_u32 s1, s1, s14
 ; GFX9-NEXT:    s_or_b32 s7, s7, s12
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s1, s7, s1
 ; GFX9-NEXT:    s_lshl_b32 s7, s8, 24
 ; GFX9-NEXT:    s_bfe_u32 s8, s2, s13
-; GFX9-NEXT:    s_or_b32 s1, s1, s7
 ; GFX9-NEXT:    s_lshr_b32 s9, s2, 24
+; GFX9-NEXT:    s_or_b32 s1, s1, s7
 ; GFX9-NEXT:    s_and_b32 s7, s2, s11
-; GFX9-NEXT:    s_bfe_u32 s2, s2, s14
 ; GFX9-NEXT:    s_lshl_b32 s8, s8, 8
+; GFX9-NEXT:    s_bfe_u32 s2, s2, s14
 ; GFX9-NEXT:    s_or_b32 s7, s7, s8
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX9-NEXT:    s_bfe_u32 s8, s3, s13
 ; GFX9-NEXT:    s_or_b32 s2, s7, s2
 ; GFX9-NEXT:    s_lshl_b32 s7, s9, 24
-; GFX9-NEXT:    s_or_b32 s2, s2, s7
+; GFX9-NEXT:    s_bfe_u32 s8, s3, s13
 ; GFX9-NEXT:    s_lshr_b32 s10, s3, 24
+; GFX9-NEXT:    s_or_b32 s2, s2, s7
 ; GFX9-NEXT:    s_and_b32 s7, s3, s11
-; GFX9-NEXT:    s_bfe_u32 s3, s3, s14
 ; GFX9-NEXT:    s_lshl_b32 s8, s8, 8
+; GFX9-NEXT:    s_bfe_u32 s3, s3, s14
 ; GFX9-NEXT:    s_or_b32 s7, s7, s8
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX9-NEXT:    s_or_b32 s3, s7, s3
@@ -4665,19 +4665,19 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v0, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 3
+; GFX9-NEXT:    s_mov_b32 s6, 16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    s_mov_b32 s6, 16
 ; GFX9-NEXT:    v_and_or_b32 v8, v0, s11, v8
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
@@ -4689,8 +4689,8 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX9-NEXT:    v_or3_b32 v1, v4, v1, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 8
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX9-NEXT:    v_and_or_b32 v5, v2, s11, v5
@@ -4699,9 +4699,9 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_or3_b32 v2, v5, v2, v6
 ; GFX9-NEXT:    v_and_or_b32 v6, v3, s11, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_or3_b32 v3, v6, v3, v7
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
@@ -4724,33 +4724,33 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, s12
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_or_b32 s0, s10, s0
-; GFX8-NEXT:    s_bfe_u32 s10, s1, s11
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 24
-; GFX8-NEXT:    s_or_b32 s0, s0, s5
+; GFX8-NEXT:    s_bfe_u32 s10, s1, s11
 ; GFX8-NEXT:    s_lshr_b32 s6, s1, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s5
 ; GFX8-NEXT:    s_and_b32 s5, s1, s9
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s12
 ; GFX8-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, s12
 ; GFX8-NEXT:    s_or_b32 s5, s5, s10
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s1, s5, s1
 ; GFX8-NEXT:    s_lshl_b32 s5, s6, 24
 ; GFX8-NEXT:    s_bfe_u32 s6, s2, s11
-; GFX8-NEXT:    s_or_b32 s1, s1, s5
 ; GFX8-NEXT:    s_lshr_b32 s7, s2, 24
+; GFX8-NEXT:    s_or_b32 s1, s1, s5
 ; GFX8-NEXT:    s_and_b32 s5, s2, s9
-; GFX8-NEXT:    s_bfe_u32 s2, s2, s12
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX8-NEXT:    s_bfe_u32 s2, s2, s12
 ; GFX8-NEXT:    s_or_b32 s5, s5, s6
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    s_bfe_u32 s6, s3, s11
 ; GFX8-NEXT:    s_or_b32 s2, s5, s2
 ; GFX8-NEXT:    s_lshl_b32 s5, s7, 24
-; GFX8-NEXT:    s_or_b32 s2, s2, s5
+; GFX8-NEXT:    s_bfe_u32 s6, s3, s11
 ; GFX8-NEXT:    s_lshr_b32 s8, s3, 24
+; GFX8-NEXT:    s_or_b32 s2, s2, s5
 ; GFX8-NEXT:    s_and_b32 s5, s3, s9
-; GFX8-NEXT:    s_bfe_u32 s3, s3, s12
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX8-NEXT:    s_bfe_u32 s3, s3, s12
 ; GFX8-NEXT:    s_or_b32 s5, s5, s6
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX8-NEXT:    s_or_b32 s3, s5, s3
@@ -4772,16 +4772,16 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_or_b32_e32 v4, s4, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -4799,13 +4799,13 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
@@ -4833,33 +4833,33 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, s12
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX7-NEXT:    s_or_b32 s0, s10, s0
-; GFX7-NEXT:    s_bfe_u32 s10, s1, s11
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 24
-; GFX7-NEXT:    s_or_b32 s0, s0, s5
+; GFX7-NEXT:    s_bfe_u32 s10, s1, s11
 ; GFX7-NEXT:    s_lshr_b32 s6, s1, 24
+; GFX7-NEXT:    s_or_b32 s0, s0, s5
 ; GFX7-NEXT:    s_and_b32 s5, s1, s9
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s12
 ; GFX7-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX7-NEXT:    s_bfe_u32 s1, s1, s12
 ; GFX7-NEXT:    s_or_b32 s5, s5, s10
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s1, s5, s1
 ; GFX7-NEXT:    s_lshl_b32 s5, s6, 24
 ; GFX7-NEXT:    s_bfe_u32 s6, s2, s11
-; GFX7-NEXT:    s_or_b32 s1, s1, s5
 ; GFX7-NEXT:    s_lshr_b32 s7, s2, 24
+; GFX7-NEXT:    s_or_b32 s1, s1, s5
 ; GFX7-NEXT:    s_and_b32 s5, s2, s9
-; GFX7-NEXT:    s_bfe_u32 s2, s2, s12
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX7-NEXT:    s_bfe_u32 s2, s2, s12
 ; GFX7-NEXT:    s_or_b32 s5, s5, s6
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX7-NEXT:    s_bfe_u32 s6, s3, s11
 ; GFX7-NEXT:    s_or_b32 s2, s5, s2
 ; GFX7-NEXT:    s_lshl_b32 s5, s7, 24
-; GFX7-NEXT:    s_or_b32 s2, s2, s5
+; GFX7-NEXT:    s_bfe_u32 s6, s3, s11
 ; GFX7-NEXT:    s_lshr_b32 s8, s3, 24
+; GFX7-NEXT:    s_or_b32 s2, s2, s5
 ; GFX7-NEXT:    s_and_b32 s5, s3, s9
-; GFX7-NEXT:    s_bfe_u32 s3, s3, s12
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX7-NEXT:    s_bfe_u32 s3, s3, s12
 ; GFX7-NEXT:    s_or_b32 s5, s5, s6
 ; GFX7-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX7-NEXT:    s_or_b32 s3, s5, s3
@@ -4880,51 +4880,51 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_or_b32_e32 v4, s4, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
+; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
 ; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v8, s9, v0
-; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX7-NEXT:    v_bfe_u32 v8, v1, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_bfe_u32 v8, v1, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s9, v1
-; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
 ; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s9, v2
-; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s9, v3
-; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
@@ -4956,26 +4956,26 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, s7
 ; GFX10-NEXT:    s_lshl_b32 s13, s13, 8
 ; GFX10-NEXT:    s_lshl_b32 s15, s15, 8
-; GFX10-NEXT:    s_or_b32 s12, s12, s13
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX10-NEXT:    s_or_b32 s12, s12, s13
 ; GFX10-NEXT:    s_or_b32 s13, s14, s15
 ; GFX10-NEXT:    s_bfe_u32 s17, s2, s6
-; GFX10-NEXT:    s_bfe_u32 s6, s3, s6
 ; GFX10-NEXT:    s_lshl_b32 s8, s8, 24
-; GFX10-NEXT:    s_or_b32 s0, s12, s0
 ; GFX10-NEXT:    s_lshl_b32 s9, s9, 24
+; GFX10-NEXT:    s_or_b32 s0, s12, s0
 ; GFX10-NEXT:    s_or_b32 s1, s13, s1
-; GFX10-NEXT:    s_or_b32 s0, s0, s8
-; GFX10-NEXT:    s_or_b32 s1, s1, s9
+; GFX10-NEXT:    s_bfe_u32 s6, s3, s6
 ; GFX10-NEXT:    s_lshr_b32 s10, s2, 24
+; GFX10-NEXT:    s_lshr_b32 s11, s3, 24
 ; GFX10-NEXT:    s_and_b32 s16, s2, s5
+; GFX10-NEXT:    s_or_b32 s0, s0, s8
 ; GFX10-NEXT:    s_lshl_b32 s8, s17, 8
 ; GFX10-NEXT:    s_bfe_u32 s2, s2, s7
-; GFX10-NEXT:    s_lshr_b32 s11, s3, 24
+; GFX10-NEXT:    s_or_b32 s1, s1, s9
 ; GFX10-NEXT:    s_and_b32 s9, s3, s5
-; GFX10-NEXT:    s_bfe_u32 s3, s3, s7
 ; GFX10-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX10-NEXT:    s_bfe_u32 s3, s3, s7
 ; GFX10-NEXT:    s_or_b32 s8, s16, s8
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX10-NEXT:    s_or_b32 s6, s9, s6
@@ -5020,12 +5020,12 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v9, v1, s5, v9
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v11, v2, s5, v11
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
@@ -5053,7 +5053,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_mov_b32 s13, 0x80008
 ; GFX9-NEXT:    s_movk_i32 s12, 0xff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 2, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bfe_u32 s14, s0, s13
 ; GFX9-NEXT:    s_and_b32 s8, s0, s12
@@ -5069,8 +5069,8 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_bfe_u32 s5, s1, s13
 ; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
 ; GFX9-NEXT:    s_and_b32 s0, s1, s12
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s14
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX9-NEXT:    s_bfe_u32 s1, s1, s14
 ; GFX9-NEXT:    s_or_b32 s0, s0, s5
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
@@ -5081,8 +5081,8 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_bfe_u32 s1, s2, s14
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s10, s2, 24
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_lshl_b32 s1, s10, 24
 ; GFX9-NEXT:    s_or_b32 s10, s0, s1
@@ -5097,47 +5097,47 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_lshl_b32 s1, s11, 24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    s_or_b32 s11, s0, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    s_and_b32 s4, s4, s12
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
-; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s12
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
+; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX9-NEXT:    v_and_or_b32 v5, v1, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s6, 8
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX9-NEXT:    s_mov_b32 s7, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    s_mov_b32 s7, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    v_and_or_b32 v8, v0, s12, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_and_or_b32 v8, v0, s12, v8
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX9-NEXT:    v_or3_b32 v0, v8, v0, v4
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX9-NEXT:    v_or3_b32 v0, v8, v0, v4
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_and_or_b32 v4, v1, s12, v4
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX9-NEXT:    v_or3_b32 v1, v4, v1, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 8
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX9-NEXT:    v_and_or_b32 v5, v2, s12, v5
@@ -5146,9 +5146,9 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_or3_b32 v2, v5, v2, v6
 ; GFX9-NEXT:    v_and_or_b32 v6, v3, s12, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_or3_b32 v3, v6, v3, v7
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
@@ -5165,8 +5165,8 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_bfe_u32 s9, s0, s13
 ; GFX8-NEXT:    s_lshr_b32 s5, s0, 24
 ; GFX8-NEXT:    s_and_b32 s8, s0, s12
-; GFX8-NEXT:    s_bfe_u32 s0, s0, s14
 ; GFX8-NEXT:    s_lshl_b32 s9, s9, 8
+; GFX8-NEXT:    s_bfe_u32 s0, s0, s14
 ; GFX8-NEXT:    s_or_b32 s8, s8, s9
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NEXT:    s_or_b32 s0, s8, s0
@@ -5175,8 +5175,8 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_bfe_u32 s5, s1, s13
 ; GFX8-NEXT:    s_lshr_b32 s6, s1, 24
 ; GFX8-NEXT:    s_and_b32 s0, s1, s12
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s14
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, s14
 ; GFX8-NEXT:    s_or_b32 s0, s0, s5
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
@@ -5187,8 +5187,8 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_bfe_u32 s1, s2, s14
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s7, s2, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s7, 24
 ; GFX8-NEXT:    s_or_b32 s10, s0, s1
@@ -5199,23 +5199,23 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_bfe_u32 s1, s3, s14
 ; GFX8-NEXT:    s_lshr_b32 s11, s3, 24
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s11, 24
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX8-NEXT:    s_or_b32 s11, s0, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX8-NEXT:    s_and_b32 s4, s4, s12
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
-; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s12
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
+; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
@@ -5248,13 +5248,13 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
@@ -5277,8 +5277,8 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_bfe_u32 s9, s0, s13
 ; GFX7-NEXT:    s_lshr_b32 s5, s0, 24
 ; GFX7-NEXT:    s_and_b32 s8, s0, s12
-; GFX7-NEXT:    s_bfe_u32 s0, s0, s14
 ; GFX7-NEXT:    s_lshl_b32 s9, s9, 8
+; GFX7-NEXT:    s_bfe_u32 s0, s0, s14
 ; GFX7-NEXT:    s_or_b32 s8, s8, s9
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX7-NEXT:    s_or_b32 s0, s8, s0
@@ -5287,8 +5287,8 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_bfe_u32 s5, s1, s13
 ; GFX7-NEXT:    s_lshr_b32 s6, s1, 24
 ; GFX7-NEXT:    s_and_b32 s0, s1, s12
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s14
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX7-NEXT:    s_bfe_u32 s1, s1, s14
 ; GFX7-NEXT:    s_or_b32 s0, s0, s5
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
@@ -5299,8 +5299,8 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_bfe_u32 s1, s2, s14
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_lshr_b32 s7, s2, 24
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_lshl_b32 s1, s7, 24
 ; GFX7-NEXT:    s_or_b32 s10, s0, s1
@@ -5311,23 +5311,23 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_bfe_u32 s1, s3, s14
 ; GFX7-NEXT:    s_lshr_b32 s11, s3, 24
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX7-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_lshl_b32 s1, s11, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX7-NEXT:    s_or_b32 s11, s0, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX7-NEXT:    s_and_b32 s4, s4, s12
-; GFX7-NEXT:    v_lshl_b32_e32 v2, s4, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v0, s12, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s4, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, s12, v0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
@@ -5339,43 +5339,43 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v8, s12, v0
-; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX7-NEXT:    v_bfe_u32 v8, v1, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX7-NEXT:    v_bfe_u32 v8, v1, 8, 8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s12, v1
-; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
 ; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s12, v2
-; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s12, v3
-; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
@@ -5400,6 +5400,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s5
+; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s13, s0, s6
 ; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
@@ -5418,9 +5419,9 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    s_or_b32 s0, s12, s0
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX10-NEXT:    s_or_b32 s13, s14, s15
-; GFX10-NEXT:    s_or_b32 s8, s0, s8
 ; GFX10-NEXT:    s_lshr_b32 s10, s2, 24
 ; GFX10-NEXT:    s_and_b32 s16, s2, s5
+; GFX10-NEXT:    s_or_b32 s8, s0, s8
 ; GFX10-NEXT:    s_lshl_b32 s0, s17, 8
 ; GFX10-NEXT:    s_bfe_u32 s2, s2, s7
 ; GFX10-NEXT:    s_lshl_b32 s9, s9, 24
@@ -5429,24 +5430,23 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX10-NEXT:    s_or_b32 s9, s1, s9
 ; GFX10-NEXT:    s_or_b32 s0, s0, s2
-; GFX10-NEXT:    s_bfe_u32 s2, s3, s6
 ; GFX10-NEXT:    s_lshl_b32 s1, s10, 24
+; GFX10-NEXT:    s_bfe_u32 s2, s3, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10-NEXT:    s_or_b32 s10, s0, s1
-; GFX10-NEXT:    s_bfe_u32 s1, s3, s7
 ; GFX10-NEXT:    s_and_b32 s6, s3, s5
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX10-NEXT:    s_or_b32 s10, s0, s1
+; GFX10-NEXT:    s_bfe_u32 s1, s3, s7
 ; GFX10-NEXT:    s_or_b32 s0, s6, s2
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, s8, v1, vcc_lo
 ; GFX10-NEXT:    s_or_b32 s1, s0, s1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v4
 ; GFX10-NEXT:    s_lshr_b32 s11, s3, 24
-; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX10-NEXT:    s_lshl_b32 s2, s11, 24
 ; GFX10-NEXT:    s_mov_b32 s3, 8
-; GFX10-NEXT:    s_or_b32 s11, s1, s2
+; GFX10-NEXT:    s_lshl_b32 s2, s11, 24
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s10, s0
+; GFX10-NEXT:    s_or_b32 s11, s1, s2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v4
 ; GFX10-NEXT:    s_and_b32 s2, s4, s5
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
@@ -5466,20 +5466,20 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_and_or_b32 v6, v0, s5, v6
-; GFX10-NEXT:    v_and_or_b32 v9, v1, s5, v9
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX10-NEXT:    v_and_or_b32 v9, v1, s5, v9
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX10-NEXT:    v_and_or_b32 v11, v2, s5, v11
-; GFX10-NEXT:    v_and_or_b32 v10, v3, s5, v10
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX10-NEXT:    v_and_or_b32 v10, v3, s5, v10
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX10-NEXT:    v_or3_b32 v0, v6, v0, v4
@@ -5503,7 +5503,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_mov_b32 s12, 0x80008
 ; GFX9-NEXT:    s_movk_i32 s10, 0xff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 2, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bfe_u32 s13, s0, s12
 ; GFX9-NEXT:    s_and_b32 s11, s0, s10
@@ -5516,11 +5516,11 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_or_b32 s0, s11, s0
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 24
 ; GFX9-NEXT:    s_bfe_u32 s11, s1, s12
-; GFX9-NEXT:    s_or_b32 s4, s0, s4
 ; GFX9-NEXT:    s_lshr_b32 s5, s1, 24
+; GFX9-NEXT:    s_or_b32 s4, s0, s4
 ; GFX9-NEXT:    s_and_b32 s0, s1, s10
-; GFX9-NEXT:    s_bfe_u32 s1, s1, s13
 ; GFX9-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX9-NEXT:    s_bfe_u32 s1, s1, s13
 ; GFX9-NEXT:    s_or_b32 s0, s0, s11
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
@@ -5531,8 +5531,8 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_bfe_u32 s1, s2, s13
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s6, s2, 24
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_lshl_b32 s1, s6, 24
 ; GFX9-NEXT:    s_or_b32 s6, s0, s1
@@ -5547,17 +5547,17 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_lshl_b32 s1, s7, 24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT:    v_and_b32_e32 v1, 3, v1
 ; GFX9-NEXT:    s_or_b32 s7, s0, s1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s10
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX9-NEXT:    v_and_or_b32 v5, v2, v1, v0
@@ -5566,27 +5566,27 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s8, 8
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX9-NEXT:    s_mov_b32 s9, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    s_mov_b32 s9, 16
-; GFX9-NEXT:    v_and_or_b32 v8, v0, s10, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_and_or_b32 v8, v0, s10, v8
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX9-NEXT:    v_or3_b32 v0, v8, v0, v4
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX9-NEXT:    v_or3_b32 v0, v8, v0, v4
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_and_or_b32 v4, v1, s10, v4
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX9-NEXT:    v_or3_b32 v1, v4, v1, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 8
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX9-NEXT:    v_and_or_b32 v5, v2, s10, v5
@@ -5595,9 +5595,9 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_or3_b32 v2, v5, v2, v6
 ; GFX9-NEXT:    v_and_or_b32 v6, v3, s10, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_or3_b32 v3, v6, v3, v7
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
@@ -5609,7 +5609,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_mov_b32 s10, 0x80008
 ; GFX8-NEXT:    s_movk_i32 s8, 0xff
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 2, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s11, s0, s10
 ; GFX8-NEXT:    s_and_b32 s9, s0, s8
@@ -5622,11 +5622,11 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_or_b32 s0, s9, s0
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 24
 ; GFX8-NEXT:    s_bfe_u32 s9, s1, s10
-; GFX8-NEXT:    s_or_b32 s4, s0, s4
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 24
+; GFX8-NEXT:    s_or_b32 s4, s0, s4
 ; GFX8-NEXT:    s_and_b32 s0, s1, s8
-; GFX8-NEXT:    s_bfe_u32 s1, s1, s11
 ; GFX8-NEXT:    s_lshl_b32 s9, s9, 8
+; GFX8-NEXT:    s_bfe_u32 s1, s1, s11
 ; GFX8-NEXT:    s_or_b32 s0, s0, s9
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
@@ -5637,8 +5637,8 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_bfe_u32 s1, s2, s11
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s6, 24
 ; GFX8-NEXT:    s_or_b32 s6, s0, s1
@@ -5653,17 +5653,17 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_lshl_b32 s1, s7, 24
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX8-NEXT:    v_and_b32_e32 v1, 3, v1
 ; GFX8-NEXT:    s_or_b32 s7, s0, s1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, v2, v1
@@ -5696,13 +5696,13 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
@@ -5720,7 +5720,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_mov_b32 s10, 0x80008
 ; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 2, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bfe_u32 s11, s0, s10
 ; GFX7-NEXT:    s_and_b32 s9, s0, s8
@@ -5733,11 +5733,11 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_or_b32 s0, s9, s0
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, 24
 ; GFX7-NEXT:    s_bfe_u32 s9, s1, s10
-; GFX7-NEXT:    s_or_b32 s4, s0, s4
 ; GFX7-NEXT:    s_lshr_b32 s5, s1, 24
+; GFX7-NEXT:    s_or_b32 s4, s0, s4
 ; GFX7-NEXT:    s_and_b32 s0, s1, s8
-; GFX7-NEXT:    s_bfe_u32 s1, s1, s11
 ; GFX7-NEXT:    s_lshl_b32 s9, s9, 8
+; GFX7-NEXT:    s_bfe_u32 s1, s1, s11
 ; GFX7-NEXT:    s_or_b32 s0, s0, s9
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
@@ -5748,8 +5748,8 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_bfe_u32 s1, s2, s11
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_lshr_b32 s6, s2, 24
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_lshl_b32 s1, s6, 24
 ; GFX7-NEXT:    s_or_b32 s6, s0, s1
@@ -5764,18 +5764,18 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_lshl_b32 s1, s7, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 3, v1
 ; GFX7-NEXT:    s_or_b32 s7, s0, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s8, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s8, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s8, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
@@ -5789,41 +5789,41 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v8, s8, v0
-; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX7-NEXT:    v_bfe_u32 v8, v1, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX7-NEXT:    v_bfe_u32 v8, v1, 8, 8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s8, v1
-; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
 ; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s8, v2
-; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_bfe_u32 v5, v3, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s8, v3
-; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
@@ -5849,6 +5849,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, s8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s12, s0, s7
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
@@ -5861,10 +5862,10 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 24
 ; GFX10-NEXT:    s_or_b32 s0, s11, s0
 ; GFX10-NEXT:    s_bfe_u32 s14, s1, s7
-; GFX10-NEXT:    s_or_b32 s4, s0, s4
-; GFX10-NEXT:    s_bfe_u32 s0, s2, s9
 ; GFX10-NEXT:    s_and_b32 s15, s2, s8
 ; GFX10-NEXT:    s_lshl_b32 s16, s16, 8
+; GFX10-NEXT:    s_or_b32 s4, s0, s4
+; GFX10-NEXT:    s_bfe_u32 s0, s2, s9
 ; GFX10-NEXT:    s_lshr_b32 s5, s1, 24
 ; GFX10-NEXT:    s_and_b32 s13, s1, s8
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, s9
@@ -5892,12 +5893,11 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    s_or_b32 s1, s0, s1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v4
 ; GFX10-NEXT:    s_lshl_b32 s2, s10, 24
-; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v3
+; GFX10-NEXT:    s_mov_b32 s3, 8
 ; GFX10-NEXT:    s_or_b32 s7, s1, s2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s6, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v4
-; GFX10-NEXT:    s_mov_b32 s3, 8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s7, s1
 ; GFX10-NEXT:    v_and_or_b32 v5, v2, v1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
@@ -5913,20 +5913,20 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_and_or_b32 v6, v0, s8, v6
-; GFX10-NEXT:    v_and_or_b32 v9, v1, s8, v9
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX10-NEXT:    v_and_or_b32 v9, v1, s8, v9
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX10-NEXT:    v_and_or_b32 v11, v2, s8, v11
-; GFX10-NEXT:    v_and_or_b32 v10, v3, s8, v10
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX10-NEXT:    v_and_or_b32 v10, v3, s8, v10
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX10-NEXT:    v_or3_b32 v0, v6, v0, v4
@@ -5949,8 +5949,8 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    s_mov_b32 s1, 16
-; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    s_movk_i32 s6, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
@@ -5962,26 +5962,26 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v5
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v13
 ; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v15
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 2, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v13
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v6
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    s_and_b32 s0, s2, s6
 ; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v17
+; GFX9-NEXT:    s_and_b32 s0, s2, s6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
 ; GFX9-NEXT:    v_or3_b32 v3, v3, v14, v9
 ; GFX9-NEXT:    v_or3_b32 v4, v4, v16, v10
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
-; GFX9-NEXT:    v_lshlrev_b32_e64 v17, v2, s0
 ; GFX9-NEXT:    v_and_or_b32 v13, v6, s6, v19
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_e64 v17, v2, s0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
 ; GFX9-NEXT:    v_or3_b32 v5, v5, v18, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v3, v4, vcc
@@ -5994,29 +5994,29 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[2:3]
 ; GFX9-NEXT:    v_and_or_b32 v2, v9, v2, v17
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v1, v3, s6, v12
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v14
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v9
+; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v16
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v10
 ; GFX9-NEXT:    v_and_or_b32 v10, v2, s6, v0
-; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v16
-; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v14
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
 ; GFX9-NEXT:    v_or3_b32 v0, v1, v13, v3
 ; GFX9-NEXT:    v_or3_b32 v1, v4, v15, v6
@@ -6029,8 +6029,8 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 8
-; GFX8-NEXT:    v_mov_b32_e32 v9, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 16
+; GFX8-NEXT:    v_mov_b32_e32 v9, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    s_and_b32 s1, s2, s0
@@ -6042,33 +6042,33 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v4
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 2, v2
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v5
-; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v18, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 2, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v16
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 24, v6
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_e32 v1, v4, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
+; GFX8-NEXT:    v_or_b32_e32 v1, v4, v18
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v11
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v17, v2, s1
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
-; GFX8-NEXT:    v_or_b32_e32 v4, v5, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
+; GFX8-NEXT:    v_or_b32_e32 v4, v5, v6
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v0, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v15
@@ -6080,9 +6080,9 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_and_b32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v17
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[2:3]
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -6091,19 +6091,19 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v11
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v11
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v13
 ; GFX8-NEXT:    v_or_b32_e32 v11, v0, v15
 ; GFX8-NEXT:    v_or_b32_e32 v12, v1, v17
@@ -6139,86 +6139,86 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_bfe_u32 v12, v4, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v4
-; GFX7-NEXT:    v_bfe_u32 v14, v5, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v9, s6, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_and_b32_e32 v11, s6, v4
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v14, v5, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v5
-; GFX7-NEXT:    v_bfe_u32 v16, v6, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v13, s6, v5
 ; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT:    v_bfe_u32 v16, v6, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v6
 ; GFX7-NEXT:    v_and_b32_e32 v15, s6, v6
 ; GFX7-NEXT:    v_bfe_u32 v6, v6, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_or_b32_e32 v3, v9, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v10, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
 ; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
+; GFX7-NEXT:    v_or_b32_e32 v3, v9, v3
+; GFX7-NEXT:    v_or_b32_e32 v4, v10, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX7-NEXT:    v_or_b32_e32 v12, v15, v16
 ; GFX7-NEXT:    v_or_b32_e32 v5, v11, v5
 ; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX7-NEXT:    v_or_b32_e32 v3, v5, v7
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX7-NEXT:    v_or_b32_e32 v6, v12, v6
+; GFX7-NEXT:    v_or_b32_e32 v3, v5, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
 ; GFX7-NEXT:    v_or_b32_e32 v4, v6, v8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[2:3]
 ; GFX7-NEXT:    v_and_b32_e32 v2, v5, v2
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v18
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[0:1]
 ; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v11, v1, 8, 8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[2:3]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX7-NEXT:    v_bfe_u32 v13, v3, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v8, s6, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_and_b32_e32 v10, s6, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v13, v3, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
 ; GFX7-NEXT:    v_and_b32_e32 v12, s6, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX7-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v9, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT:    v_bfe_u32 v5, v4, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v10, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_bfe_u32 v5, v4, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, s6, v4
-; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
@@ -6247,23 +6247,23 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_and_or_b32 v3, v3, s3, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_and_or_b32 v4, v4, s3, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 24, v6
-; GFX10-NEXT:    v_or3_b32 v3, v3, v13, v8
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_or3_b32 v4, v4, v15, v9
 ; GFX10-NEXT:    v_and_or_b32 v5, v5, s3, v16
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX10-NEXT:    v_or3_b32 v3, v3, v13, v8
+; GFX10-NEXT:    v_or3_b32 v4, v4, v15, v9
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v12, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v6, v6, s3, v18
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v3, v4, vcc_lo
 ; GFX10-NEXT:    v_or3_b32 v5, v5, v17, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v3, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v2
 ; GFX10-NEXT:    s_and_b32 s1, s2, s3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v10, v0, s3
@@ -6290,19 +6290,19 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v2, v2, s3, v10
-; GFX10-NEXT:    v_and_or_b32 v3, v3, s3, v12
-; GFX10-NEXT:    v_and_or_b32 v12, v4, s3, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v5
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v14, v0, s3, v1
+; GFX10-NEXT:    v_and_or_b32 v3, v3, s3, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    v_and_or_b32 v12, v4, s3, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX10-NEXT:    v_and_or_b32 v14, v0, s3, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    v_or3_b32 v0, v2, v11, v10
 ; GFX10-NEXT:    v_or3_b32 v1, v3, v13, v6
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    v_or3_b32 v2, v12, v15, v8
 ; GFX10-NEXT:    v_or3_b32 v3, v14, v7, v9
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
@@ -6319,11 +6319,11 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    s_mov_b32 s1, 16
-; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    s_movk_i32 s6, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v0, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX9-NEXT:    s_lshr_b32 s4, s2, 2
 ; GFX9-NEXT:    s_and_b32 s2, s2, 3
-; GFX9-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 3
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -6342,8 +6342,8 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v6
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
@@ -6363,29 +6363,29 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[2:3]
 ; GFX9-NEXT:    v_and_or_b32 v2, v9, s5, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v1, v3, s6, v12
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v14
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v9
+; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v16
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v10
 ; GFX9-NEXT:    v_and_or_b32 v10, v2, s6, v0
-; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v16
-; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v14
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
 ; GFX9-NEXT:    v_or3_b32 v0, v1, v13, v3
 ; GFX9-NEXT:    v_or3_b32 v1, v4, v15, v6
@@ -6400,13 +6400,13 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    s_and_b32 s1, s2, 3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 3
-; GFX8-NEXT:    v_mov_b32_e32 v9, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 16
+; GFX8-NEXT:    v_mov_b32_e32 v9, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v11, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    s_lshr_b32 s4, s2, 2
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    s_lshr_b32 s4, s2, 2
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX8-NEXT:    s_not_b32 s5, s0
@@ -6420,27 +6420,27 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v4
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v5
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v18, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v16
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 24, v6
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_e32 v1, v4, v18
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
+; GFX8-NEXT:    v_or_b32_e32 v1, v4, v18
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v11
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
-; GFX8-NEXT:    v_or_b32_e32 v4, v5, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
+; GFX8-NEXT:    v_or_b32_e32 v4, v5, v6
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v0, vcc
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v14
@@ -6449,9 +6449,9 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_and_b32_e32 v5, s5, v5
 ; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[2:3]
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -6460,19 +6460,19 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v11
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v11
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v13
 ; GFX8-NEXT:    v_or_b32_e32 v11, v0, v15
 ; GFX8-NEXT:    v_or_b32_e32 v12, v1, v17
@@ -6507,87 +6507,87 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_bfe_u32 v12, v4, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v4
-; GFX7-NEXT:    v_bfe_u32 v14, v5, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v9, s6, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_and_b32_e32 v11, s6, v4
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v14, v5, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v5
-; GFX7-NEXT:    v_bfe_u32 v16, v6, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v13, s6, v5
 ; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT:    v_bfe_u32 v16, v6, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v6
 ; GFX7-NEXT:    v_and_b32_e32 v15, s6, v6
 ; GFX7-NEXT:    v_bfe_u32 v6, v6, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX7-NEXT:    v_or_b32_e32 v3, v9, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_or_b32_e32 v4, v10, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
 ; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
+; GFX7-NEXT:    v_or_b32_e32 v3, v9, v3
+; GFX7-NEXT:    v_or_b32_e32 v4, v10, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX7-NEXT:    v_or_b32_e32 v12, v15, v16
 ; GFX7-NEXT:    v_or_b32_e32 v5, v11, v5
 ; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT:    v_or_b32_e32 v3, v5, v7
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v1, v2, vcc
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX7-NEXT:    v_or_b32_e32 v6, v12, v6
+; GFX7-NEXT:    v_or_b32_e32 v3, v5, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v1, v2, vcc
 ; GFX7-NEXT:    v_or_b32_e32 v4, v6, v8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[2:3]
 ; GFX7-NEXT:    v_and_b32_e32 v5, s5, v5
 ; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s[0:1]
 ; GFX7-NEXT:    v_bfe_u32 v9, v1, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v11, v2, 8, 8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v0, s[2:3]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
-; GFX7-NEXT:    v_bfe_u32 v13, v3, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v8, s6, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
 ; GFX7-NEXT:    v_and_b32_e32 v10, s6, v2
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v13, v3, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
 ; GFX7-NEXT:    v_and_b32_e32 v12, s6, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v8, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX7-NEXT:    v_or_b32_e32 v1, v8, v1
 ; GFX7-NEXT:    v_or_b32_e32 v2, v9, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v5
-; GFX7-NEXT:    v_bfe_u32 v5, v4, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v10, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_bfe_u32 v5, v4, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, s6, v4
-; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
@@ -6614,23 +6614,23 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v12, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_and_or_b32 v3, v3, s3, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_and_or_b32 v4, v4, s3, v13
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v6
-; GFX10-NEXT:    v_or3_b32 v3, v3, v12, v7
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v17, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_or3_b32 v4, v4, v14, v8
 ; GFX10-NEXT:    v_and_or_b32 v5, v5, s3, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX10-NEXT:    v_or3_b32 v3, v3, v12, v7
+; GFX10-NEXT:    v_or3_b32 v4, v4, v14, v8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v6, v6, s3, v17
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v3, v4, vcc_lo
 ; GFX10-NEXT:    v_or3_b32 v5, v5, v16, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v3, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s4, 2
 ; GFX10-NEXT:    s_and_b32 s1, s2, 3
 ; GFX10-NEXT:    v_or3_b32 v6, v6, v11, v7
@@ -6644,12 +6644,12 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_and_or_b32 v2, v7, s2, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s4, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v12, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -6662,14 +6662,14 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_and_or_b32 v1, v3, s3, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
 ; GFX10-NEXT:    v_and_or_b32 v6, v4, s3, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX10-NEXT:    v_and_or_b32 v10, v5, s3, v14
-; GFX10-NEXT:    v_and_or_b32 v12, v2, s3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX10-NEXT:    v_and_or_b32 v12, v2, s3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    v_or3_b32 v0, v1, v11, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
+; GFX10-NEXT:    v_or3_b32 v0, v1, v11, v3
 ; GFX10-NEXT:    v_or3_b32 v1, v6, v13, v7
 ; GFX10-NEXT:    v_or3_b32 v2, v10, v15, v8
 ; GFX10-NEXT:    v_or3_b32 v3, v12, v16, v9
@@ -6688,9 +6688,9 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    s_mov_b32 s1, 16
-; GFX9-NEXT:    v_mov_b32_e32 v8, 16
 ; GFX9-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v8, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
@@ -6699,14 +6699,14 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v19, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v6
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v6, v6, v0, v17
-; GFX9-NEXT:    v_and_or_b32 v17, v7, v0, v19
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 2, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_or_b32 v13, v4, s2, v13
 ; GFX9-NEXT:    v_and_or_b32 v15, v5, s2, v15
+; GFX9-NEXT:    v_and_or_b32 v6, v6, v0, v17
+; GFX9-NEXT:    v_and_or_b32 v17, v7, v0, v19
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 2, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v7
@@ -6715,10 +6715,10 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_or3_b32 v9, v13, v14, v9
 ; GFX9-NEXT:    v_or3_b32 v10, v15, v16, v10
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v19
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX9-NEXT:    v_or3_b32 v6, v6, v18, v11
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX9-NEXT:    v_or3_b32 v6, v6, v18, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v9, v10, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v19
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -6746,17 +6746,17 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v3, v3, v0, v13
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v3, v3, v0, v13
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX9-NEXT:    v_and_or_b32 v9, v9, v0, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-NEXT:    v_and_or_b32 v13, v2, v0, v1
 ; GFX9-NEXT:    v_and_or_b32 v6, v6, v0, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT:    v_and_or_b32 v13, v2, v0, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX9-NEXT:    v_or3_b32 v0, v3, v14, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_or3_b32 v0, v3, v14, v7
 ; GFX9-NEXT:    v_or3_b32 v1, v9, v16, v10
 ; GFX9-NEXT:    v_or3_b32 v2, v6, v18, v11
 ; GFX9-NEXT:    v_or3_b32 v3, v13, v8, v12
@@ -6774,30 +6774,30 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v19, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v6
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v6
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v18, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v17, v7, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 2, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 24, v7
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 24, v7
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v15, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v11
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 24, v14
 ; GFX8-NEXT:    v_or_b32_e32 v14, v15, v16
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v6, v6, v18
 ; GFX8-NEXT:    v_or_b32_e32 v3, v14, v3
@@ -6816,8 +6816,8 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v3, v0, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v0, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v0, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v0, s[2:3]
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -6826,28 +6826,28 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v15
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v11
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v13
-; GFX8-NEXT:    v_or_b32_e32 v10, v0, v10
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v15
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v17
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v6
-; GFX8-NEXT:    v_or_b32_e32 v2, v3, v8
+; GFX8-NEXT:    v_or_b32_e32 v10, v0, v10
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v6
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v8
 ; GFX8-NEXT:    v_or_b32_e32 v3, v10, v9
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
@@ -6876,41 +6876,41 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_bfe_u32 v14, v5, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v5
-; GFX7-NEXT:    v_bfe_u32 v16, v6, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v11, s0, v4
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
 ; GFX7-NEXT:    v_and_b32_e32 v13, s0, v5
 ; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v16, v6, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
-; GFX7-NEXT:    v_bfe_u32 v18, v7, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v15, v6, v8
 ; GFX7-NEXT:    v_bfe_u32 v6, v6, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX7-NEXT:    v_bfe_u32 v18, v7, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_or_b32_e32 v12, v13, v14
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
+; GFX7-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX7-NEXT:    v_or_b32_e32 v12, v13, v14
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 24, v7
 ; GFX7-NEXT:    v_and_b32_e32 v17, v7, v8
 ; GFX7-NEXT:    v_bfe_u32 v7, v7, 16, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_or_b32_e32 v4, v11, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX7-NEXT:    v_or_b32_e32 v5, v12, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
 ; GFX7-NEXT:    v_or_b32_e32 v13, v15, v16
+; GFX7-NEXT:    v_or_b32_e32 v4, v11, v4
+; GFX7-NEXT:    v_or_b32_e32 v5, v12, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX7-NEXT:    v_or_b32_e32 v14, v17, v18
 ; GFX7-NEXT:    v_or_b32_e32 v6, v13, v6
 ; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v6, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX7-NEXT:    v_or_b32_e32 v7, v14, v7
+; GFX7-NEXT:    v_or_b32_e32 v4, v6, v9
 ; GFX7-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v19
 ; GFX7-NEXT:    v_or_b32_e32 v5, v7, v10
@@ -6924,39 +6924,39 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v4, v2, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, v5, v2, s[2:3]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT:    v_bfe_u32 v12, v1, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v9, v0, v8
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v12, v1, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX7-NEXT:    v_bfe_u32 v14, v3, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v11, v1, v8
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v14, v3, 8, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
+; GFX7-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
 ; GFX7-NEXT:    v_and_b32_e32 v13, v3, v8
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
+; GFX7-NEXT:    v_or_b32_e32 v10, v11, v12
+; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX7-NEXT:    v_or_b32_e32 v11, v13, v14
 ; GFX7-NEXT:    v_or_b32_e32 v1, v10, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT:    v_bfe_u32 v5, v4, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v11, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_bfe_u32 v5, v4, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, v4, v8
-; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
@@ -6973,9 +6973,9 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    s_mov_b32 s1, 16
 ; GFX10-NEXT:    s_movk_i32 s2, 0xff
 ; GFX10-NEXT:    v_and_b32_e32 v0, 3, v3
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -6986,23 +6986,23 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v6
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v17, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_and_or_b32 v4, v4, s2, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX10-NEXT:    v_and_or_b32 v5, v5, s2, v16
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 24, v7
-; GFX10-NEXT:    v_or3_b32 v4, v4, v15, v10
-; GFX10-NEXT:    v_or3_b32 v5, v5, v17, v11
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v20, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_and_or_b32 v6, v6, v1, v18
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX10-NEXT:    v_or3_b32 v4, v4, v15, v10
+; GFX10-NEXT:    v_or3_b32 v5, v5, v17, v11
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v13
 ; GFX10-NEXT:    v_and_or_b32 v7, v7, v1, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v4, v5, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v13
 ; GFX10-NEXT:    v_or3_b32 v6, v6, v19, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v4, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v12, v0, v1
 ; GFX10-NEXT:    v_or3_b32 v7, v7, v14, v10
@@ -7028,19 +7028,19 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v12, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v1, v11
-; GFX10-NEXT:    v_and_or_b32 v3, v3, v1, v13
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v2, v2, v1, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v5
-; GFX10-NEXT:    v_and_or_b32 v13, v4, v1, v15
-; GFX10-NEXT:    v_and_or_b32 v8, v0, v1, v8
+; GFX10-NEXT:    v_and_or_b32 v3, v3, v1, v13
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    v_and_or_b32 v13, v4, v1, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX10-NEXT:    v_and_or_b32 v8, v0, v1, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    v_or3_b32 v0, v2, v12, v11
 ; GFX10-NEXT:    v_or3_b32 v1, v3, v14, v6
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    v_or3_b32 v2, v13, v16, v7
 ; GFX10-NEXT:    v_or3_b32 v3, v8, v9, v10
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 5aae1526cdbc5..11b06cdd60a56 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -111,23 +111,23 @@ define <8 x float> @dyn_insertelement_v8f32_const_s_v_v(float %val, i32 %idx) {
 ; GPRIDX-NEXT:    v_mov_b32_e32 v15, s11
 ; GPRIDX-NEXT:    v_mov_b32_e32 v8, s4
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v9, s5
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v9, v9, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v10, s6
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v9, v9, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v10, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v11, s7
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v10, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v11, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v12, s8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v11, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v12, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v13, s9
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v12, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v13, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v14, s10
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v13, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v1
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v6, v14, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v1
@@ -195,23 +195,23 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_v(<8 x float> inreg %v
 ; GPRIDX-NEXT:    v_mov_b32_e32 v8, s0
 ; GPRIDX-NEXT:    v_mov_b32_e32 v7, s10
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v9, s1
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v9, v7, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v10, s2
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v9, v7, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v10, v7, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v11, s3
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v10, v7, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v11, v7, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v12, s4
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v11, v7, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v12, v7, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v13, s5
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v12, v7, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v13, v7, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v14, s6
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v13, v7, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v6, v14, v7, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
@@ -274,23 +274,23 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_s(<8 x float> inreg %v
 ; GPRIDX-NEXT:    v_mov_b32_e32 v15, s7
 ; GPRIDX-NEXT:    v_mov_b32_e32 v8, s0
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s10, 0
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v9, s1
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s10, 1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v9, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v10, s2
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v9, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s10, 2
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v10, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v11, s3
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v10, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s10, 3
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v11, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v12, s4
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v11, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s10, 4
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v12, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v13, s5
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v12, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s10, 5
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v13, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v14, s6
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v13, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s10, 6
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v6, v14, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s10, 7
@@ -399,23 +399,23 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_v(<8 x float> inreg %v
 ; GPRIDX-NEXT:    v_mov_b32_e32 v15, s7
 ; GPRIDX-NEXT:    v_mov_b32_e32 v8, s0
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v9, s1
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v9, v9, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v10, s2
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v9, v9, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v10, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v11, s3
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v10, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v11, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v12, s4
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v11, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v12, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v13, s5
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v12, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v13, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v14, s6
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v13, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v1
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v6, v14, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v1
@@ -769,7 +769,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; GPRIDX-NEXT:    v_mov_b32_e32 v3, s4
 ; GPRIDX-NEXT:    v_mov_b32_e32 v4, s5
 ; GPRIDX-NEXT:    v_mov_b32_e32 v5, s6
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GPRIDX-NEXT:    v_mov_b32_e32 v6, s7
 ; GPRIDX-NEXT:    v_mov_b32_e32 v7, s8
 ; GPRIDX-NEXT:    v_mov_b32_e32 v8, s9
@@ -783,6 +782,7 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; GPRIDX-NEXT:    v_mov_b32_e32 v16, s17
 ; GPRIDX-NEXT:    v_mov_b32_e32 v17, s18
 ; GPRIDX-NEXT:    v_mov_b32_e32 v18, s19
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[16:17], 0, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[4:5], 2, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[6:7], 3, v2
@@ -834,7 +834,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; MOVREL-NEXT:    s_mov_b32 s8, s18
 ; MOVREL-NEXT:    s_mov_b64 s[6:7], 2.0
 ; MOVREL-NEXT:    v_mov_b32_e32 v3, s4
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; MOVREL-NEXT:    v_mov_b32_e32 v4, s5
 ; MOVREL-NEXT:    v_mov_b32_e32 v5, s6
 ; MOVREL-NEXT:    v_mov_b32_e32 v6, s7
@@ -850,6 +849,7 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; MOVREL-NEXT:    v_mov_b32_e32 v16, s17
 ; MOVREL-NEXT:    v_mov_b32_e32 v17, s18
 ; MOVREL-NEXT:    v_mov_b32_e32 v18, s19
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s4, 1, v2
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s5, 3, v2
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s10, 2, v2
@@ -986,8 +986,8 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
 ; MOVREL-NEXT:    s_mov_b32 s14, s16
 ; MOVREL-NEXT:    v_mov_b32_e32 v16, s15
 ; MOVREL-NEXT:    v_mov_b32_e32 v2, s1
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; MOVREL-NEXT:    v_mov_b32_e32 v1, s0
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; MOVREL-NEXT:    v_mov_b32_e32 v15, s14
 ; MOVREL-NEXT:    v_mov_b32_e32 v14, s13
 ; MOVREL-NEXT:    v_mov_b32_e32 v13, s12
@@ -1002,14 +1002,14 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
 ; MOVREL-NEXT:    v_mov_b32_e32 v4, s3
 ; MOVREL-NEXT:    v_mov_b32_e32 v3, s2
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
-; MOVREL-NEXT:    v_cmp_eq_u32_e64 s1, 2, v0
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v1, v1, s18, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v2, v2, s19, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; MOVREL-NEXT:    v_cmp_eq_u32_e64 s2, 5, v0
+; MOVREL-NEXT:    v_cmp_eq_u32_e64 s1, 2, v0
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v3, v3, s18, s0
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v4, v4, s19, s0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s0, 4, v0
+; MOVREL-NEXT:    v_cmp_eq_u32_e64 s2, 5, v0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s3, 6, v0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s4, 7, v0
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v5, v5, s18, s1
@@ -1232,10 +1232,10 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
 ; GPRIDX-NEXT:    v_mov_b32_e32 v8, s5
 ; GPRIDX-NEXT:    v_mov_b32_e32 v7, s4
 ; GPRIDX-NEXT:    v_mov_b32_e32 v6, s3
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GPRIDX-NEXT:    v_mov_b32_e32 v5, s2
 ; GPRIDX-NEXT:    v_mov_b32_e32 v4, s1
 ; GPRIDX-NEXT:    v_mov_b32_e32 v3, s0
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v2
@@ -1486,14 +1486,14 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v18
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v18
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v2, v2, v16, s[0:1]
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[2:3], 2, v18
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[4:5], 3, v18
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[6:7], 4, v18
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[8:9], 5, v18
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v18
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[12:13], 6, v18
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v2, v2, v16, s[0:1]
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v3, v3, v17, s[0:1]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v4, v4, v16, s[2:3]
@@ -2075,18 +2075,18 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v
 ; MOVREL-NEXT:    s_mov_b32 s15, s17
 ; MOVREL-NEXT:    s_movreld_b64 s[2:3], s[18:19]
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
-; MOVREL-NEXT:    v_mov_b32_e32 v4, s4
 ; MOVREL-NEXT:    v_mov_b32_e32 v1, s1
 ; MOVREL-NEXT:    v_mov_b32_e32 v2, s2
 ; MOVREL-NEXT:    v_mov_b32_e32 v3, s3
-; MOVREL-NEXT:    v_mov_b32_e32 v8, s8
+; MOVREL-NEXT:    v_mov_b32_e32 v4, s4
 ; MOVREL-NEXT:    v_mov_b32_e32 v5, s5
 ; MOVREL-NEXT:    v_mov_b32_e32 v6, s6
 ; MOVREL-NEXT:    v_mov_b32_e32 v7, s7
-; MOVREL-NEXT:    v_mov_b32_e32 v12, s12
+; MOVREL-NEXT:    v_mov_b32_e32 v8, s8
 ; MOVREL-NEXT:    v_mov_b32_e32 v9, s9
 ; MOVREL-NEXT:    v_mov_b32_e32 v10, s10
 ; MOVREL-NEXT:    v_mov_b32_e32 v11, s11
+; MOVREL-NEXT:    v_mov_b32_e32 v12, s12
 ; MOVREL-NEXT:    v_mov_b32_e32 v13, s13
 ; MOVREL-NEXT:    v_mov_b32_e32 v14, s14
 ; MOVREL-NEXT:    v_mov_b32_e32 v15, s15
@@ -2119,14 +2119,14 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do
 ; GPRIDX-NEXT:    v_add_u32_e32 v18, 1, v18
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v18
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v18
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v2, v2, v16, s[0:1]
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[2:3], 2, v18
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[4:5], 3, v18
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[6:7], 4, v18
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[8:9], 5, v18
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v18
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[12:13], 6, v18
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v2, v2, v16, s[0:1]
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v3, v3, v17, s[0:1]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v4, v4, v16, s[2:3]
@@ -3510,20 +3510,20 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v
 ; GPRIDX-NEXT:    v_mov_b32_e32 v13, s6
 ; GPRIDX-NEXT:    v_mov_b32_e32 v7, s0
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s9, 0
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v8, s1
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s9, 1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v8, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v9, s2
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v8, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s9, 2
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v9, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v10, s3
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v9, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s9, 3
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v10, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v11, s4
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v10, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s9, 4
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v11, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v12, s5
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v11, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s9, 5
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v12, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s9, 6
@@ -3581,20 +3581,20 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v
 ; GPRIDX-NEXT:    v_mov_b32_e32 v14, s6
 ; GPRIDX-NEXT:    v_mov_b32_e32 v8, s0
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v9, s1
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v7, v9, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v10, s2
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v7, v9, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v10, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v11, s3
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v10, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v11, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v12, s4
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v11, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v1
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v12, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v13, s5
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v12, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v1
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v13, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v1
@@ -3915,22 +3915,22 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v2
-; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v2
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v2
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v2, v5, v0, s[10:11]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v5, v7, v0, s[0:1]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v9, v0, s[2:3]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v11, v0, s[4:5]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v11, v13, v0, s[6:7]
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v0, v15, v0, s[8:9]
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v6, v6, v1, s[10:11]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v10, v10, v1, s[2:3]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v13, v14, v1, s[6:7]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v8, v1, s[0:1]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v10, v10, v1, s[2:3]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v12, v12, v1, s[4:5]
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v13, v14, v1, s[6:7]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v1, v16, v1, s[8:9]
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v3
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v4
@@ -3996,8 +3996,8 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v10, v10, v1, s0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s0, 5, v2
 ; MOVREL-NEXT:    v_readfirstlane_b32 s2, v5
-; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v12, v1, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v11, v11, v0, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v12, v1, vcc_lo
 ; MOVREL-NEXT:    v_readfirstlane_b32 s3, v6
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v12, v13, v0, s0
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v13, v14, v1, s0
@@ -4082,20 +4082,20 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec,
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[6:7], 4, v16
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[8:9], 5, v16
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[10:11], 6, v16
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[10:11]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v13, v13, v15, s[10:11]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v10, v10, v14, s[8:9]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v11, v11, v15, s[8:9]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v8, v14, s[6:7]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[6:7]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[4:5]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v4, v4, v14, s[2:3]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v5, v5, v15, s[2:3]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v2, v2, v14, s[0:1]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s[0:1]
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v2, v2, v14, s[0:1]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v4, v4, v14, s[2:3]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v8, v14, s[6:7]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v10, v10, v14, s[8:9]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[10:11]
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s[0:1]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v5, v5, v15, s[2:3]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[4:5]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[6:7]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v11, v11, v15, s[8:9]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v13, v13, v15, s[10:11]
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v0
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v1
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v2
@@ -4122,24 +4122,24 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec,
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s4, 5, v16
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s5, 6, v16
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e64 v2, v2, v14, s0
+; MOVREL-NEXT:    v_cndmask_b32_e64 v4, v4, v14, s1
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s2
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v8, v8, v14, s3
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v10, v10, v14, s4
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s5
-; MOVREL-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s2
-; MOVREL-NEXT:    v_cndmask_b32_e64 v2, v2, v14, s0
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s0
+; MOVREL-NEXT:    v_cndmask_b32_e64 v5, v5, v15, s1
+; MOVREL-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s2
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s3
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v11, v11, v15, s4
-; MOVREL-NEXT:    v_cndmask_b32_e64 v4, v4, v14, s1
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v13, v13, v15, s5
-; MOVREL-NEXT:    v_cndmask_b32_e64 v5, v5, v15, s1
-; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc_lo
 ; MOVREL-NEXT:    v_readfirstlane_b32 s0, v0
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v1
 ; MOVREL-NEXT:    v_readfirstlane_b32 s2, v2
 ; MOVREL-NEXT:    v_readfirstlane_b32 s3, v3
 ; MOVREL-NEXT:    v_readfirstlane_b32 s4, v4
-; MOVREL-NEXT:    v_readfirstlane_b32 s1, v1
 ; MOVREL-NEXT:    v_readfirstlane_b32 s5, v5
 ; MOVREL-NEXT:    v_readfirstlane_b32 s6, v6
 ; MOVREL-NEXT:    v_readfirstlane_b32 s7, v7
@@ -4220,17 +4220,17 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 0
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 1
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
-; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 2
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v4, v4, v0, s[0:1]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v5, v5, v1, s[0:1]
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v6, v6, v0, s[6:7]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s[2:3]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s[2:3]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v7, v1, s[6:7]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v0, v10, v0, s[4:5]
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v5, v5, v1, s[0:1]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v7, v1, s[6:7]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s[2:3]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v1, v11, v1, s[4:5]
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v2
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v3
@@ -4277,13 +4277,13 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s1, s12, 4
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; MOVREL-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s12, 2
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v4, v4, v0, s0
+; MOVREL-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s12, 2
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v5, v5, v1, s0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s0, s12, 3
+; MOVREL-NEXT:    v_readfirstlane_b32 s2, v4
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v6, v6, v0, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v7, v7, v1, vcc_lo
-; MOVREL-NEXT:    v_readfirstlane_b32 s2, v4
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s0
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s0
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v0, v10, v0, s1
@@ -4334,18 +4334,18 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg
 ; GPRIDX-NEXT:    v_mov_b32_e32 v3, s0
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v2
-; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v2
+; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v2
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v2, v5, v0, s[6:7]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v5, v7, v0, s[0:1]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v9, v0, s[2:3]
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v0, v11, v0, s[4:5]
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v6, v6, v1, s[6:7]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v10, v1, s[2:3]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v8, v1, s[0:1]
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v10, v1, s[2:3]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v1, v12, v1, s[4:5]
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v3
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v4
@@ -4393,19 +4393,19 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v5, v5, v0, s0
-; MOVREL-NEXT:    v_cndmask_b32_e64 v6, v6, v1, s0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v2
+; MOVREL-NEXT:    v_cndmask_b32_e64 v6, v6, v1, s0
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 s0, 3, v2
 ; MOVREL-NEXT:    v_readfirstlane_b32 s2, v5
-; MOVREL-NEXT:    v_readfirstlane_b32 s3, v6
+; MOVREL-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v8, v1, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v8, v9, v0, s0
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v9, v10, v1, s0
-; MOVREL-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v0, v11, v0, s1
 ; MOVREL-NEXT:    v_cndmask_b32_e64 v1, v12, v1, s1
 ; MOVREL-NEXT:    v_readfirstlane_b32 s0, v3
 ; MOVREL-NEXT:    v_readfirstlane_b32 s1, v4
+; MOVREL-NEXT:    v_readfirstlane_b32 s3, v6
 ; MOVREL-NEXT:    v_readfirstlane_b32 s4, v7
 ; MOVREL-NEXT:    v_readfirstlane_b32 s5, v2
 ; MOVREL-NEXT:    v_readfirstlane_b32 s6, v8
@@ -4426,15 +4426,15 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec,
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[8:9], s2, 2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[4:5], s2, 3
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[6:7], s2, 4
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[0:1]
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[8:9]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s[4:5]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[4:5]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[8:9]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[6:7]
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[0:1]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[8:9]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[4:5]
 ; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[6:7]
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v0
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v1
@@ -4456,8 +4456,8 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec,
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
 ; MOVREL-NEXT:    v_readfirstlane_b32 s0, v0
 ; MOVREL-NEXT:    v_readfirstlane_b32 s1, v1
-; MOVREL-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; MOVREL-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
 ; MOVREL-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 2
 ; MOVREL-NEXT:    v_readfirstlane_b32 s3, v3
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
@@ -4489,16 +4489,16 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec,
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[2:3], 2, v12
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[4:5], 3, v12
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[6:7], 4, v12
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[6:7]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[6:7]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s[4:5]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[4:5]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[2:3]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[0:1]
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[2:3]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s[4:5]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[6:7]
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[0:1]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[4:5]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[6:7]
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v0
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v1
 ; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
index e9a68d3250bbd..3e6036039aa1a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
@@ -443,8 +443,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -458,8 +458,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -746,8 +746,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -761,8 +761,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -990,11 +990,11 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64*
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
@@ -1012,11 +1012,11 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64*
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
@@ -1067,8 +1067,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
@@ -1083,8 +1083,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
@@ -1549,11 +1549,11 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
@@ -1571,11 +1571,11 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
@@ -1626,8 +1626,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
@@ -1642,8 +1642,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
index a68602a17023c..df89351335ccc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
@@ -512,8 +512,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -527,8 +527,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -599,8 +599,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out,
 ;
 ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_add_u32_e32 v1, 2, v0
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_add_u32_e32 v1, 2, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 9
 ; GFX9-NEXT:    ds_inc_rtn_u32 v0, v0, v2 offset:8
@@ -1086,11 +1086,11 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
@@ -1108,11 +1108,11 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
@@ -1162,8 +1162,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
@@ -1178,8 +1178,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
@@ -1412,9 +1412,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
@@ -1461,8 +1461,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -1476,8 +1476,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -1491,8 +1491,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    flat_atomic_inc v0, v[0:1], v2 offset:20 glc
@@ -1506,8 +1506,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 20
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
@@ -1561,8 +1561,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out,
 ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 9
-; GFX9-NEXT:    v_add_u32_e32 v3, 2, v0
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_add_u32_e32 v3, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    ds_inc_rtn_u64 v[0:1], v0, v[1:2] offset:16
@@ -1575,8 +1575,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out,
 ; GFX10-LABEL: atomic_inc_shl_base_lds_0_i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
 ; GFX10-NEXT:    ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
@@ -1767,11 +1767,11 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
@@ -1789,11 +1789,11 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
@@ -1831,8 +1831,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, v4
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 40
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
@@ -1861,8 +1861,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
@@ -1877,8 +1877,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
@@ -1908,8 +1908,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 40
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; GFX10-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 40b2131b21ae3..c208caf5bc33d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -141,8 +141,8 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double
 ; GFX7-LABEL: s_div_fmas_f64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s4
@@ -159,8 +159,8 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double
 ; GFX8-LABEL: s_div_fmas_f64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index 68afd70819b48..e5757eee9f464 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -530,8 +530,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)*
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
@@ -548,8 +548,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)*
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
@@ -590,8 +590,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)*
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
@@ -608,8 +608,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)*
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
@@ -650,8 +650,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)*
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
@@ -668,8 +668,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)*
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
@@ -710,8 +710,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)*
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
@@ -728,8 +728,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)*
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
index 1282a61b52cf2..8803508a7ecdc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
@@ -496,8 +496,8 @@ define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s3, s5
 ; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX9-NEXT:    v_and_or_b32 v2, v3, v4, s8
 ; GFX9-NEXT:    image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -540,8 +540,8 @@ define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i1
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s3, s5
 ; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX9-NEXT:    v_and_or_b32 v2, v3, v4, s8
 ; GFX9-NEXT:    image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -623,8 +623,8 @@ define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s3, s5
 ; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX9-NEXT:    v_and_or_b32 v2, v3, v4, s8
 ; GFX9-NEXT:    image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -667,8 +667,8 @@ define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s3, s5
 ; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX9-NEXT:    v_and_or_b32 v2, v3, v4, s8
 ; GFX9-NEXT:    image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1270,8 +1270,8 @@ define amdgpu_ps <2 x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s3, s5
 ; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    v_and_or_b32 v2, v2, v5, v3
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v5, v3
 ; GFX9-NEXT:    v_and_or_b32 v3, v4, v5, s8
 ; GFX9-NEXT:    image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1314,8 +1314,8 @@ define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %da
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s3, s5
 ; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    v_and_or_b32 v2, v2, v5, v3
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v5, v3
 ; GFX9-NEXT:    v_and_or_b32 v3, v4, v5, s8
 ; GFX9-NEXT:    image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1397,8 +1397,8 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s3, s5
 ; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    v_and_or_b32 v2, v2, v5, v3
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v5, v3
 ; GFX9-NEXT:    v_and_or_b32 v3, v4, v5, s8
 ; GFX9-NEXT:    image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1441,8 +1441,8 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 %
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s3, s5
 ; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    v_and_or_b32 v2, v2, v5, v3
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v5, v3
 ; GFX9-NEXT:    v_and_or_b32 v3, v4, v5, s8
 ; GFX9-NEXT:    image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
index ade9b053bc8f9..108062a76af86 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -73,8 +73,8 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s7, s9
 ; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    s_mov_b32 s11, s13
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, v3, s12
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da
@@ -99,8 +99,8 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    s_mov_b32 s5, s7
 ; GFX10NSA-NEXT:    s_mov_b32 s7, s9
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
-; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
+; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX10NSA-NEXT:    v_and_or_b32 v1, v2, v3, s12
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
@@ -130,8 +130,8 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s7, s9
 ; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    s_mov_b32 s11, s13
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, v3, s12
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da
@@ -156,8 +156,8 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    s_mov_b32 s5, s7
 ; GFX10NSA-NEXT:    s_mov_b32 s7, s9
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
-; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
+; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX10NSA-NEXT:    v_and_or_b32 v1, v2, v3, s12
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
@@ -239,8 +239,8 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s7, s9
 ; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    s_mov_b32 s11, s13
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, v3, s12
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16
@@ -265,8 +265,8 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10NSA-NEXT:    s_mov_b32 s5, s7
 ; GFX10NSA-NEXT:    s_mov_b32 s7, s9
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
-; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
+; GFX10NSA-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX10NSA-NEXT:    v_and_or_b32 v1, v2, v3, s12
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
@@ -296,8 +296,8 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s7, s9
 ; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX9-NEXT:    s_mov_b32 s11, s13
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX9-NEXT:    v_and_or_b32 v2, v3, v4, s12
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16
@@ -322,8 +322,8 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    s_mov_b32 s5, s7
 ; GFX10NSA-NEXT:    s_mov_b32 s7, s9
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
-; GFX10NSA-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
+; GFX10NSA-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX10NSA-NEXT:    v_and_or_b32 v2, v3, v4, s12
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
@@ -457,8 +457,8 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s7, s9
 ; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX9-NEXT:    s_mov_b32 s11, s13
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX9-NEXT:    v_and_or_b32 v2, v3, v4, s12
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16
@@ -483,8 +483,8 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    s_mov_b32 s5, s7
 ; GFX10NSA-NEXT:    s_mov_b32 s7, s9
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
-; GFX10NSA-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
+; GFX10NSA-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX10NSA-NEXT:    v_and_or_b32 v2, v3, v4, s12
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
@@ -514,8 +514,8 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s7, s9
 ; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    v_and_or_b32 v2, v2, v5, v3
 ; GFX9-NEXT:    s_mov_b32 s11, s13
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v5, v3
 ; GFX9-NEXT:    v_and_or_b32 v3, v4, v5, s12
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16
@@ -540,8 +540,8 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10NSA-NEXT:    s_mov_b32 s5, s7
 ; GFX10NSA-NEXT:    s_mov_b32 s7, s9
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
-; GFX10NSA-NEXT:    v_and_or_b32 v2, v2, v5, v3
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
+; GFX10NSA-NEXT:    v_and_or_b32 v2, v2, v5, v3
 ; GFX10NSA-NEXT:    v_and_or_b32 v3, v4, v5, s12
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
@@ -569,8 +569,8 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s7, s9
 ; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    s_mov_b32 s11, s13
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, v3, s12
 ; GFX9-NEXT:    image_gather4_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -620,8 +620,8 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s7, s9
 ; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX9-NEXT:    s_mov_b32 s11, s13
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v2
 ; GFX9-NEXT:    v_and_or_b32 v2, v3, v4, s12
 ; GFX9-NEXT:    image_gather4_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
index 88ab397f4b7e0..e10af9ae09be1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
@@ -5,6 +5,7 @@
 define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
 ; GFX6-LABEL: gather4_2d:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -15,7 +16,6 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -52,6 +52,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
 ; GFX6-LABEL: gather4_2d_tfe:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -62,7 +63,6 @@ define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -114,6 +114,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) {
 ; GFX6-LABEL: gather4_cube:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -124,7 +125,6 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -161,6 +161,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %slice) {
 ; GFX6-LABEL: gather4_2darray:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -171,7 +172,6 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -208,6 +208,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
 ; GFX6-LABEL: gather4_c_2d:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -218,7 +219,6 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -255,6 +255,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %clamp) {
 ; GFX6-LABEL: gather4_cl_2d:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -265,7 +266,6 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -302,6 +302,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %clamp) {
 ; GFX6-LABEL: gather4_c_cl_2d:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -312,7 +313,6 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -349,6 +349,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
 ; GFX6-LABEL: gather4_b_2d:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -359,7 +360,6 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -396,6 +396,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t) {
 ; GFX6-LABEL: gather4_c_b_2d:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -406,7 +407,6 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -443,6 +443,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t, float %clamp) {
 ; GFX6-LABEL: gather4_b_cl_2d:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -453,7 +454,6 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -490,6 +490,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
 ; GFX6-LABEL: gather4_c_b_cl_2d:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -500,7 +501,6 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -701,6 +701,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_2d_dmask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
 ; GFX6-LABEL: gather4_2d_dmask_2:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -711,7 +712,6 @@ define amdgpu_ps <4 x float> @gather4_2d_dmask_2(<8 x i32> inreg %rsrc, <4 x i32
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -748,6 +748,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_2d_dmask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
 ; GFX6-LABEL: gather4_2d_dmask_4:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -758,7 +759,6 @@ define amdgpu_ps <4 x float> @gather4_2d_dmask_4(<8 x i32> inreg %rsrc, <4 x i32
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -795,6 +795,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_2d_dmask_8(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
 ; GFX6-LABEL: gather4_2d_dmask_8:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -805,7 +806,6 @@ define amdgpu_ps <4 x float> @gather4_2d_dmask_8(<8 x i32> inreg %rsrc, <4 x i32
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll
index 9792459169859..7420e81db5fd6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll
@@ -5,6 +5,7 @@
 define amdgpu_ps <4 x float> @gather4_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t) {
 ; GFX6-LABEL: gather4_o_2d:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -15,7 +16,6 @@ define amdgpu_ps <4 x float> @gather4_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -52,6 +52,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_c_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t) {
 ; GFX6-LABEL: gather4_c_o_2d:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -62,7 +63,6 @@ define amdgpu_ps <4 x float> @gather4_c_o_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -99,6 +99,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %clamp) {
 ; GFX6-LABEL: gather4_cl_o_2d:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -109,7 +110,6 @@ define amdgpu_ps <4 x float> @gather4_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -146,6 +146,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_c_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %clamp) {
 ; GFX6-LABEL: gather4_c_cl_o_2d:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -156,7 +157,6 @@ define amdgpu_ps <4 x float> @gather4_c_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -193,6 +193,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_b_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %s, float %t) {
 ; GFX6-LABEL: gather4_b_o_2d:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -203,7 +204,6 @@ define amdgpu_ps <4 x float> @gather4_b_o_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -240,6 +240,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_c_b_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %zcompare, float %s, float %t) {
 ; GFX6-LABEL: gather4_c_b_o_2d:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -250,7 +251,6 @@ define amdgpu_ps <4 x float> @gather4_c_b_o_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec
@@ -328,6 +328,7 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_c_b_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp) {
 ; GFX6-LABEL: gather4_c_b_cl_o_2d:
 ; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, s4
@@ -338,7 +339,6 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i3
 ; GFX6-NEXT:    s_mov_b32 s7, s9
 ; GFX6-NEXT:    s_mov_b32 s8, s10
 ; GFX6-NEXT:    s_mov_b32 s9, s11
-; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b32 s10, s12
 ; GFX6-NEXT:    s_mov_b32 s11, s13
 ; GFX6-NEXT:    s_wqm_b64 exec, exec

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
index 10ce92f688f42..c76bf66b3128b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
@@ -73,13 +73,13 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v7
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v7
 ; GFX10-NEXT:    v_mov_b32_e32 v11, v7
-; GFX10-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v10
@@ -135,13 +135,13 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v7
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v7
 ; GFX10-NEXT:    v_mov_b32_e32 v11, v7
-; GFX10-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v10

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
index d8bc4daeafda7..cb596f2021e4a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
@@ -49,9 +49,9 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_and_or_b32 v10, v0, v4, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_and_or_b32 v11, v2, v4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v5
@@ -89,7 +89,6 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX10-NEXT:    v_and_or_b32 v10, v0, v4, v1
 ; GFX10-NEXT:    v_and_or_b32 v11, v2, v4, v3
-; GFX10-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
@@ -97,6 +96,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v7
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v8
@@ -118,9 +118,9 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_and_or_b32 v10, v0, v4, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_and_or_b32 v11, v2, v4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v5
@@ -158,7 +158,6 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX10-NEXT:    v_and_or_b32 v10, v0, v4, v1
 ; GFX10-NEXT:    v_and_or_b32 v11, v2, v4, v3
-; GFX10-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
@@ -166,6 +165,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v7
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v8

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
index b82d0e62a7f0e..9849396e6441d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
@@ -75,7 +75,6 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
 ; GFX10-NEXT:    v_mov_b32_e32 v11, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v12, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v13, v9
-; GFX10-NEXT:    v_mov_b32_e32 v0, v9
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
@@ -84,6 +83,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    v_mov_b32_e32 v0, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v10
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v11
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v12
@@ -141,7 +141,6 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
 ; GFX10-NEXT:    v_mov_b32_e32 v11, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v12, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v13, v9
-; GFX10-NEXT:    v_mov_b32_e32 v0, v9
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
@@ -150,6 +149,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    v_mov_b32_e32 v0, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v10
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v11
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v12

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
index ca79993b07aa8..dcc3137545e97 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
@@ -15,8 +15,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s,
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s3, s5
 ; GFX9-NEXT:    s_mov_b32 s5, s7
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, v3, s8
 ; GFX9-NEXT:    image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -48,13 +48,13 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
 ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b32 s0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s4, s6
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    s_lshl_b32 s8, s0, 16
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_and_or_b32 v10, v0, v3, v1
 ; GFX9-NEXT:    v_and_or_b32 v11, v2, v3, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v5
@@ -92,11 +92,11 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX10-NEXT:    v_and_or_b32 v10, v0, v3, v1
 ; GFX10-NEXT:    v_and_or_b32 v11, v2, v3, s8
-; GFX10-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v7
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v8
@@ -117,13 +117,13 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b32 s0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s4, s6
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    s_lshl_b32 s8, s0, 16
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_and_or_b32 v10, v0, v3, v1
 ; GFX9-NEXT:    v_and_or_b32 v11, v2, v3, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v5
@@ -161,11 +161,11 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX10-NEXT:    v_and_or_b32 v10, v0, v3, v1
 ; GFX10-NEXT:    v_and_or_b32 v11, v2, v3, s8
-; GFX10-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v7
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v8

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
index d19db8b76a68c..cc274cbfc08dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
@@ -74,7 +74,6 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v11, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v12, v8
-; GFX10-NEXT:    v_mov_b32_e32 v0, v8
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
@@ -82,6 +81,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    v_mov_b32_e32 v0, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v11
@@ -138,7 +138,6 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v11, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v12, v8
-; GFX10-NEXT:    v_mov_b32_e32 v0, v8
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
@@ -146,6 +145,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
+; GFX10-NEXT:    v_mov_b32_e32 v0, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v11

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
index 9f79ac2bf80f8..7705bb2392ed1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
@@ -38,12 +38,12 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v11, 0xffff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10-NEXT:    s_lshl_b32 s12, s0, 16
 ; GFX10-NEXT:    v_and_or_b32 v3, v9, v11, s12
-; GFX10-NEXT:    v_and_or_b32 v4, v10, v11, v4
 ; GFX10-NEXT:    v_and_or_b32 v2, v0, v11, v1
+; GFX10-NEXT:    v_and_or_b32 v4, v10, v11, v4
 ; GFX10-NEXT:    v_and_or_b32 v5, v5, v11, s12
 ; GFX10-NEXT:    image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -281,14 +281,14 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; GFX10-NEXT:    v_mov_b32_e32 v11, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-NEXT:    v_mov_b32_e32 v11, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v9
-; GFX10-NEXT:    v_and_or_b32 v5, v11, v0, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10-NEXT:    v_and_or_b32 v4, v10, v0, v1
+; GFX10-NEXT:    v_and_or_b32 v5, v11, v0, v5
 ; GFX10-NEXT:    image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -302,14 +302,14 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; GFX10-NEXT:    v_mov_b32_e32 v11, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-NEXT:    v_mov_b32_e32 v11, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v9
-; GFX10-NEXT:    v_and_or_b32 v5, v11, v0, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10-NEXT:    v_and_or_b32 v4, v10, v0, v1
+; GFX10-NEXT:    v_and_or_b32 v5, v11, v0, v5
 ; GFX10-NEXT:    image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll
index 09921f113710c..9ee22ae85011e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll
@@ -117,9 +117,9 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; GFX81-NEXT:    s_mov_b32 s3, s5
 ; GFX81-NEXT:    s_mov_b32 s4, s6
 ; GFX81-NEXT:    s_mov_b32 s5, s7
-; GFX81-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX81-NEXT:    s_mov_b32 s6, s8
 ; GFX81-NEXT:    s_mov_b32 s7, s9
+; GFX81-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX81-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX81-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX81-NEXT:    image_store v[2:4], v[0:1], s[0:7] dmask:0x7 unorm d16

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 23aa8146a8497..1748f3eddc5e2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -208,13 +208,13 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
 ; GFX1030:       ; %bb.0:
 ; GFX1030-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX1030-NEXT:    v_mov_b32_e32 v5, v0
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
 ; GFX1030-NEXT:    v_mov_b32_e32 v14, v1
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
 ; GFX1030-NEXT:    v_and_b32_e32 v1, s0, v8
 ; GFX1030-NEXT:    v_mov_b32_e32 v15, v2
 ; GFX1030-NEXT:    v_mov_b32_e32 v16, v3
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1030-NEXT:    v_and_b32_e32 v3, s0, v9
 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
@@ -340,13 +340,13 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
 ; GFX1030:       ; %bb.0:
 ; GFX1030-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX1030-NEXT:    v_mov_b32_e32 v6, v0
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
 ; GFX1030-NEXT:    v_mov_b32_e32 v15, v1
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
 ; GFX1030-NEXT:    v_and_b32_e32 v1, s0, v9
 ; GFX1030-NEXT:    v_mov_b32_e32 v16, v2
 ; GFX1030-NEXT:    v_mov_b32_e32 v17, v3
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1030-NEXT:    v_and_b32_e32 v3, s0, v10
 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
@@ -430,8 +430,8 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
 ; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX1030-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX1030-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1030-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX1030-NEXT:    flat_load_dword v1, v[2:3]
 ; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
@@ -460,8 +460,8 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
 ; GFX1013-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v6
 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
 ; GFX1013-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v6
-; GFX1013-NEXT:    v_mov_b32_e32 v6, 4.0
 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX1013-NEXT:    v_mov_b32_e32 v6, 4.0
 ; GFX1013-NEXT:    flat_load_dword v0, v[4:5]
 ; GFX1013-NEXT:    flat_load_dword v1, v[2:3]
 ; GFX1013-NEXT:    v_mov_b32_e32 v2, 0
@@ -509,16 +509,16 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX1030-NEXT:    s_movk_i32 s5, 0x4400
 ; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX1030-NEXT:    s_movk_i32 s6, 0x4200
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX1030-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX1030-NEXT:    s_movk_i32 s6, 0x4200
+; GFX1030-NEXT:    flat_load_dword v0, v[0:1]
+; GFX1030-NEXT:    flat_load_dword v1, v[2:3]
+; GFX1030-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX1030-NEXT:    s_movk_i32 s7, 0x4800
 ; GFX1030-NEXT:    s_bfe_u32 s6, s6, 0x100000
 ; GFX1030-NEXT:    s_lshl_b32 s5, s5, 16
-; GFX1030-NEXT:    flat_load_dword v0, v[0:1]
-; GFX1030-NEXT:    flat_load_dword v1, v[2:3]
 ; GFX1030-NEXT:    s_movk_i32 s4, 0x4500
 ; GFX1030-NEXT:    s_or_b32 s5, s6, s5
 ; GFX1030-NEXT:    s_bfe_u32 s6, s9, 0x100000
@@ -553,8 +553,8 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node
 ; GFX1013-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX1013-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX1013-NEXT:    s_movk_i32 s0, 0x4500
-; GFX1013-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX1013-NEXT:    s_or_b32 s1, s2, s1
+; GFX1013-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX1013-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX1013-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
@@ -564,18 +564,18 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node
 ; GFX1013-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX1013-NEXT:    s_movk_i32 s5, 0x4600
 ; GFX1013-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v6
-; GFX1013-NEXT:    s_movk_i32 s4, 0x4700
 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
 ; GFX1013-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v6
-; GFX1013-NEXT:    s_bfe_u32 s2, s5, 0x100000
 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX1013-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX1013-NEXT:    s_bfe_u32 s4, s4, 0x100000
-; GFX1013-NEXT:    s_or_b32 s0, s0, s2
+; GFX1013-NEXT:    s_movk_i32 s4, 0x4700
 ; GFX1013-NEXT:    flat_load_dword v0, v[4:5]
 ; GFX1013-NEXT:    flat_load_dword v1, v[2:3]
-; GFX1013-NEXT:    s_or_b32 s2, s4, s3
+; GFX1013-NEXT:    s_bfe_u32 s2, s5, 0x100000
+; GFX1013-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX1013-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX1013-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1013-NEXT:    s_or_b32 s0, s0, s2
+; GFX1013-NEXT:    s_or_b32 s2, s4, s3
 ; GFX1013-NEXT:    v_mov_b32_e32 v3, 1.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, s1
@@ -705,17 +705,17 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_
 ; GFX1030-NEXT:    s_movk_i32 s4, 0x4500
 ; GFX1030-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX1030-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX1030-NEXT:    s_or_b32 s5, s6, s5
+; GFX1030-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX1030-NEXT:    s_bfe_u32 s6, s9, 0x100000
-; GFX1030-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX1030-NEXT:    v_mov_b32_e32 v0, 0xb36211c6
 ; GFX1030-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX1030-NEXT:    flat_load_dword v2, v[0:1]
+; GFX1030-NEXT:    v_mov_b32_e32 v1, 0x102
 ; GFX1030-NEXT:    s_or_b32 s4, s4, s6
 ; GFX1030-NEXT:    s_or_b32 s6, s8, s7
-; GFX1030-NEXT:    v_mov_b32_e32 v0, 0xb36211c6
-; GFX1030-NEXT:    v_mov_b32_e32 v1, 0x102
 ; GFX1030-NEXT:    v_mov_b32_e32 v6, s5
 ; GFX1030-NEXT:    v_mov_b32_e32 v7, s4
 ; GFX1030-NEXT:    v_mov_b32_e32 v8, s6
@@ -749,13 +749,13 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_
 ; GFX1013-NEXT:    s_movk_i32 s3, 0x4800
 ; GFX1013-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX1013-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1013-NEXT:    s_or_b32 s1, s2, s1
 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX1013-NEXT:    s_or_b32 s1, s2, s1
 ; GFX1013-NEXT:    s_bfe_u32 s2, s9, 0x100000
 ; GFX1013-NEXT:    s_bfe_u32 s3, s3, 0x100000
+; GFX1013-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX1013-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX1013-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX1013-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX1013-NEXT:    s_or_b32 s0, s0, s2
 ; GFX1013-NEXT:    s_or_b32 s2, s8, s3
 ; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c6

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index 81d0a225ea594..2220f794d26ae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -47,9 +47,8 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
 ; GFX8-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index db06742e233ea..cfd2236b817ab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -57,8 +57,8 @@ define <3 x i32> @v_load_constant_v3i32_align1(<3 x i32> addrspace(4)* %ptr) {
 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
 ; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v5, v6, s4, v7
 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v8, v10, v0, v1
 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 24, v9
+; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v8, v10, v0, v1
 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v10, 24, v12
 ; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v0, v2, v3, v4
@@ -94,8 +94,8 @@ define <3 x i32> @v_load_constant_v3i32_align1(<3 x i32> addrspace(4)* %ptr) {
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64 offset:9
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:10
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:11
-; GFX7-NOUNALIGNED-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX7-NOUNALIGNED-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NOUNALIGNED-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(11)
 ; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
@@ -211,8 +211,8 @@ define <3 x i32> @v_load_constant_v3i32_align2(<3 x i32> addrspace(4)* %ptr) {
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v1, v2
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v3, v4
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v5, v6
@@ -323,10 +323,10 @@ define <12 x i8> @v_load_constant_v12i8_align8(<12 x i8> addrspace(4)* %ptr) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v13
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v12
@@ -346,10 +346,10 @@ define <12 x i8> @v_load_constant_v12i8_align8(<12 x i8> addrspace(4)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX7-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX7-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v13
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v12
@@ -405,8 +405,8 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)*
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:9
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:10
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:11
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v0, 0xff
 ; GFX9-NOUNALIGNED-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v0, 0xff
 ; GFX9-NOUNALIGNED-NEXT:    s_mov_b32 s1, 8
 ; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v13, 8
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
@@ -471,8 +471,8 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)*
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v9, off, s[0:3], 0 offset:9
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v10, off, s[0:3], 0 offset:10
 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v11, off, s[0:3], 0 offset:11
-; GFX7-NOUNALIGNED-NEXT:    v_mov_b32_e32 v12, 0xff
 ; GFX7-NOUNALIGNED-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NOUNALIGNED-NEXT:    v_mov_b32_e32 v12, 0xff
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(11)
 ; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
@@ -498,17 +498,17 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)*
 ; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v7, v7, v12
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v11, v11, v12
-; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v4, v5
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v4, v5
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v4, v8, v9
-; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v1, v6
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v1, v6
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v4, v10
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v1, v7
@@ -585,13 +585,15 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)*
 ; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
 ; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v3, s0, v3
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v4, s0, v4
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v5, s0, v5
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v4, s0, v4
+; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
index 939b491ff08c1..91100f2c405da 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
@@ -86,12 +86,13 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX9-NEXT:    v_and_b32_e32 v7, v8, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_or3_b32 v2, v2, v6, v7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v6, v11, v3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
-; GFX9-NEXT:    v_or3_b32 v2, v2, v6, v7
-; GFX9-NEXT:    v_and_b32_e32 v6, v11, v3
 ; GFX9-NEXT:    v_and_or_b32 v5, v9, v3, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
@@ -123,8 +124,8 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v3, 0xff
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_or_b32_e32 v4, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_and_b32_e32 v2, v7, v3
@@ -158,18 +159,18 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_and_b32_e32 v5, v7, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_and_b32_e32 v6, v9, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, v8, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_and_b32_e32 v6, v10, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, v0, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_or_b32_e32 v3, v5, v0
@@ -216,9 +217,10 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX10-NEXT:    v_and_or_b32 v1, v8, s4, v1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(7)
+; GFX10-NEXT:    v_and_or_b32 v4, v9, s4, v4
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX10-NEXT:    v_and_b32_e32 v8, v12, v11
-; GFX10-NEXT:    v_and_or_b32 v4, v9, s4, v4
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX10-NEXT:    v_and_b32_e32 v9, v13, v11
 ; GFX10-NEXT:    v_and_or_b32 v7, v10, v11, v7
@@ -230,17 +232,17 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    v_and_b32_e32 v0, v0, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT:    v_and_or_b32 v10, v14, v11, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX10-NEXT:    v_and_or_b32 v10, v14, v11, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 24, v0
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v3
-; GFX10-NEXT:    v_or3_b32 v3, v10, v11, v12
 ; GFX10-NEXT:    v_or3_b32 v1, v4, v5, v6
 ; GFX10-NEXT:    v_or3_b32 v2, v7, v8, v9
+; GFX10-NEXT:    v_or3_b32 v3, v10, v11, v12
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
   ret <4 x i32> %load
@@ -266,9 +268,9 @@ define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX9-NEXT:    v_and_b32_e32 v1, s4, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v3, s4, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX9-NEXT:    v_and_b32_e32 v2, s4, v6
-; GFX9-NEXT:    v_and_or_b32 v1, v3, s4, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v3, s4, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
@@ -294,19 +296,19 @@ define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_and_b32_e32 v3, s4, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v8
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, s4, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
index 8fbeccd9f2a0e..a79c9ebc618c0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
@@ -138,9 +138,9 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_and_b32_e32 v5, v6, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_or_b32_e32 v2, v4, v0
@@ -189,11 +189,12 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    v_and_or_b32 v1, v10, s4, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_and_or_b32 v7, v0, v12, v7
+; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX10-NEXT:    v_and_or_b32 v4, v11, s4, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_and_or_b32 v7, v0, v12, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v3
@@ -244,14 +245,14 @@ define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v3, s4, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
index 8d7208f72810f..70a351ed65c40 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -39,8 +39,8 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v3, 0xff
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_or_b32_e32 v4, v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_and_b32_e32 v2, v7, v3
@@ -74,18 +74,18 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_and_b32_e32 v5, v7, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_and_b32_e32 v6, v9, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, v8, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_and_b32_e32 v6, v10, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, v0, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_or_b32_e32 v3, v5, v0
@@ -132,9 +132,10 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX10-NEXT:    v_and_or_b32 v1, v8, s4, v1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(7)
+; GFX10-NEXT:    v_and_or_b32 v4, v9, s4, v4
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX10-NEXT:    v_and_b32_e32 v8, v12, v11
-; GFX10-NEXT:    v_and_or_b32 v4, v9, s4, v4
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX10-NEXT:    v_and_b32_e32 v9, v13, v11
 ; GFX10-NEXT:    v_and_or_b32 v7, v10, v11, v7
@@ -146,17 +147,17 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    v_and_b32_e32 v0, v0, v11
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT:    v_and_or_b32 v10, v14, v11, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX10-NEXT:    v_and_or_b32 v10, v14, v11, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 24, v0
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v3
-; GFX10-NEXT:    v_or3_b32 v3, v10, v11, v12
 ; GFX10-NEXT:    v_or3_b32 v1, v4, v5, v6
 ; GFX10-NEXT:    v_or3_b32 v2, v7, v8, v9
+; GFX10-NEXT:    v_or3_b32 v3, v10, v11, v12
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
   ret <4 x i32> %load
@@ -222,9 +223,9 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_and_b32_e32 v5, v6, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_or_b32_e32 v2, v4, v0
@@ -273,11 +274,12 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    v_and_or_b32 v1, v10, s4, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_and_or_b32 v7, v0, v12, v7
+; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX10-NEXT:    v_and_or_b32 v4, v11, s4, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_and_or_b32 v7, v0, v12, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 6cef254938e81..0360057af2c88 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -128,12 +128,12 @@ define amdgpu_kernel void @localize_globals(i1 %cond) {
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, gv0@gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, gv0@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX9-NEXT:    s_getpc_b64 s[2:3]
 ; GFX9-NEXT:    s_add_u32 s2, s2, gv1@gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s3, s3, gv1@gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 89de8bddc26aa..4b3fc2e1a6102 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -824,8 +824,8 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s3
+; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s3
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshr_b32 s1, s2, s4
@@ -838,8 +838,8 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX9-NEXT:    s_and_b32 s0, s0, s3
+; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX9-NEXT:    s_and_b32 s1, s1, s3
 ; GFX9-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX9-NEXT:    s_lshr_b32 s1, s2, s4
@@ -963,8 +963,8 @@ define <2 x float> @v_lshr_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_and_b32_e32 v4, s4, v7
 ; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v4, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -1015,8 +1015,8 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX6-NEXT:    s_and_b32 s4, s7, s8
 ; GFX6-NEXT:    s_and_b32 s3, s3, s8
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, s4
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_or_b32 s1, s2, s1
@@ -1026,15 +1026,15 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_mov_b32 s6, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s6
+; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
 ; GFX8-NEXT:    s_and_b32 s2, s2, s6
-; GFX8-NEXT:    s_lshr_b32 s0, s0, s2
-; GFX8-NEXT:    s_lshr_b32 s2, s4, s7
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s8, s3, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s6
+; GFX8-NEXT:    s_lshr_b32 s8, s3, 16
 ; GFX8-NEXT:    s_and_b32 s3, s3, s6
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s2
+; GFX8-NEXT:    s_lshr_b32 s2, s4, s7
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s3
 ; GFX8-NEXT:    s_lshr_b32 s3, s5, s8
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
@@ -1049,15 +1049,15 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX9-NEXT:    s_and_b32 s0, s0, s5
+; GFX9-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX9-NEXT:    s_and_b32 s2, s2, s5
 ; GFX9-NEXT:    s_lshr_b32 s0, s0, s2
 ; GFX9-NEXT:    s_lshr_b32 s2, s4, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX9-NEXT:    s_and_b32 s1, s1, s5
+; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX9-NEXT:    s_and_b32 s3, s3, s5
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, s3
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, s4
@@ -1137,15 +1137,15 @@ define <4 x float> @v_lshr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-NEXT:    v_and_b32_e32 v8, s4, v14
 ; GFX6-NEXT:    v_and_b32_e32 v6, s4, v6
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v8, v15, v16
 ; GFX6-NEXT:    v_and_b32_e32 v7, v7, v16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v7, v8, v7
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX6-NEXT:    v_lshrrev_b32_e32 v7, v8, v7
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v6, v3
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -1215,15 +1215,15 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX6-NEXT:    s_and_b32 s8, s14, s16
 ; GFX6-NEXT:    s_and_b32 s6, s6, s16
 ; GFX6-NEXT:    s_lshr_b32 s6, s6, s8
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_and_b32 s8, s15, s16
 ; GFX6-NEXT:    s_and_b32 s7, s7, s16
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_lshr_b32 s7, s7, s8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
-; GFX6-NEXT:    s_lshr_b32 s7, s7, s8
-; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
 ; GFX6-NEXT:    s_or_b32 s1, s2, s1
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
+; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
 ; GFX6-NEXT:    s_or_b32 s2, s4, s2
 ; GFX6-NEXT:    s_or_b32 s3, s6, s3
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -1232,35 +1232,35 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_mov_b32 s12, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s13, s4, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s12
+; GFX8-NEXT:    s_lshr_b32 s13, s4, 16
 ; GFX8-NEXT:    s_and_b32 s4, s4, s12
-; GFX8-NEXT:    s_lshr_b32 s0, s0, s4
-; GFX8-NEXT:    s_lshr_b32 s4, s8, s13
 ; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s14, s5, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s12
+; GFX8-NEXT:    s_lshr_b32 s14, s5, 16
 ; GFX8-NEXT:    s_and_b32 s5, s5, s12
-; GFX8-NEXT:    s_lshr_b32 s1, s1, s5
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s4
+; GFX8-NEXT:    s_lshr_b32 s4, s8, s13
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX8-NEXT:    s_lshr_b32 s15, s6, 16
 ; GFX8-NEXT:    s_and_b32 s2, s2, s12
+; GFX8-NEXT:    s_lshr_b32 s15, s6, 16
 ; GFX8-NEXT:    s_and_b32 s6, s6, s12
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s5
 ; GFX8-NEXT:    s_lshr_b32 s5, s9, s14
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s12
-; GFX8-NEXT:    s_lshr_b32 s2, s2, s6
 ; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
-; GFX8-NEXT:    s_lshr_b32 s16, s7, 16
-; GFX8-NEXT:    s_or_b32 s0, s4, s0
 ; GFX8-NEXT:    s_and_b32 s3, s3, s12
+; GFX8-NEXT:    s_lshr_b32 s16, s7, 16
 ; GFX8-NEXT:    s_and_b32 s7, s7, s12
+; GFX8-NEXT:    s_lshr_b32 s2, s2, s6
 ; GFX8-NEXT:    s_lshr_b32 s6, s10, s15
+; GFX8-NEXT:    s_or_b32 s0, s4, s0
 ; GFX8-NEXT:    s_lshl_b32 s4, s5, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s12
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, s7
-; GFX8-NEXT:    s_or_b32 s1, s4, s1
 ; GFX8-NEXT:    s_lshr_b32 s7, s11, s16
+; GFX8-NEXT:    s_or_b32 s1, s4, s1
 ; GFX8-NEXT:    s_lshl_b32 s4, s6, 16
 ; GFX8-NEXT:    s_and_b32 s2, s2, s12
 ; GFX8-NEXT:    s_or_b32 s2, s4, s2
@@ -1273,29 +1273,29 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b32 s9, 0xffff
 ; GFX9-NEXT:    s_lshr_b32 s8, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX9-NEXT:    s_and_b32 s0, s0, s9
+; GFX9-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX9-NEXT:    s_and_b32 s4, s4, s9
 ; GFX9-NEXT:    s_lshr_b32 s0, s0, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s8, s10
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s8, s5, 16
 ; GFX9-NEXT:    s_and_b32 s1, s1, s9
+; GFX9-NEXT:    s_lshr_b32 s8, s5, 16
 ; GFX9-NEXT:    s_and_b32 s5, s5, s9
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, s5
 ; GFX9-NEXT:    s_lshr_b32 s4, s4, s8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s5, s6, 16
 ; GFX9-NEXT:    s_and_b32 s2, s2, s9
+; GFX9-NEXT:    s_lshr_b32 s5, s6, 16
 ; GFX9-NEXT:    s_and_b32 s6, s6, s9
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, s6
 ; GFX9-NEXT:    s_lshr_b32 s4, s4, s5
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
-; GFX9-NEXT:    s_lshr_b32 s5, s7, 16
 ; GFX9-NEXT:    s_and_b32 s3, s3, s9
+; GFX9-NEXT:    s_lshr_b32 s5, s7, 16
 ; GFX9-NEXT:    s_and_b32 s6, s7, s9
 ; GFX9-NEXT:    s_lshr_b32 s3, s3, s6
 ; GFX9-NEXT:    s_lshr_b32 s4, s4, s5
@@ -1306,28 +1306,28 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX10-NEXT:    s_lshr_b32 s9, s0, 16
-; GFX10-NEXT:    s_and_b32 s10, s4, s8
 ; GFX10-NEXT:    s_and_b32 s0, s0, s8
+; GFX10-NEXT:    s_and_b32 s10, s4, s8
 ; GFX10-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, s10
 ; GFX10-NEXT:    s_lshr_b32 s4, s9, s4
 ; GFX10-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX10-NEXT:    s_and_b32 s10, s5, s8
 ; GFX10-NEXT:    s_and_b32 s1, s1, s8
+; GFX10-NEXT:    s_and_b32 s10, s5, s8
 ; GFX10-NEXT:    s_lshr_b32 s5, s5, 16
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, s10
 ; GFX10-NEXT:    s_lshr_b32 s5, s9, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
 ; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX10-NEXT:    s_and_b32 s5, s6, s8
 ; GFX10-NEXT:    s_and_b32 s2, s2, s8
+; GFX10-NEXT:    s_and_b32 s5, s6, s8
 ; GFX10-NEXT:    s_lshr_b32 s6, s6, 16
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, s5
 ; GFX10-NEXT:    s_lshr_b32 s4, s4, s6
 ; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX10-NEXT:    s_and_b32 s6, s7, s8
 ; GFX10-NEXT:    s_and_b32 s3, s3, s8
+; GFX10-NEXT:    s_and_b32 s6, s7, s8
 ; GFX10-NEXT:    s_lshr_b32 s7, s7, 16
 ; GFX10-NEXT:    s_lshr_b32 s3, s3, s6
 ; GFX10-NEXT:    s_lshr_b32 s5, s5, s7

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index a0f02fad31b95..11ed050f473ff 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -338,9 +338,9 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0x400
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x200000
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
@@ -353,9 +353,9 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0x400
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x200000
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX7-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
@@ -792,9 +792,9 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspac
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0x400
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x200000
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x200000
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -807,9 +807,9 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspac
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0x400
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x200000
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x200000
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX7-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 71d3329fe94bb..fd97da40302d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -382,8 +382,8 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
 ; GFX7-LABEL: v_mul_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_lo_u32 v4, v0, v3
 ; GFX7-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX7-NEXT:    v_mul_lo_u32 v4, v0, v3
 ; GFX7-NEXT:    v_mul_lo_u32 v3, v0, v2
 ; GFX7-NEXT:    v_mul_hi_u32 v0, v0, v2
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
@@ -394,8 +394,8 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
 ; GFX8-LABEL: v_mul_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_lo_u32 v4, v0, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX8-NEXT:    v_mul_lo_u32 v4, v0, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
@@ -432,24 +432,24 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX7-NEXT:    s_mul_i32 s7, s1, s3
 ; GFX7-NEXT:    s_mul_i32 s8, s0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX7-NEXT:    s_add_u32 s7, s7, s8
-; GFX7-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s3
+; GFX7-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s7, v0
-; GFX7-NEXT:    s_mul_i32 s7, s1, s4
 ; GFX7-NEXT:    s_mul_i32 s2, s2, s3
+; GFX7-NEXT:    s_mul_i32 s7, s1, s4
 ; GFX7-NEXT:    v_mul_hi_u32 v3, s0, v3
-; GFX7-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX7-NEXT:    s_mul_i32 s6, s0, s3
+; GFX7-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX7-NEXT:    s_mul_i32 s5, s0, s5
 ; GFX7-NEXT:    s_add_i32 s0, s2, s7
 ; GFX7-NEXT:    s_add_i32 s0, s0, s5
+; GFX7-NEXT:    s_and_b32 s8, s8, 1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
-; GFX7-NEXT:    s_and_b32 s8, s8, 1
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, s8, v1
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -462,24 +462,24 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    s_mul_i32 s7, s1, s3
 ; GFX8-NEXT:    s_mul_i32 s8, s0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    s_add_u32 s7, s7, s8
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s3
+; GFX8-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s7, v0
-; GFX8-NEXT:    s_mul_i32 s7, s1, s4
 ; GFX8-NEXT:    s_mul_i32 s2, s2, s3
+; GFX8-NEXT:    s_mul_i32 s7, s1, s4
 ; GFX8-NEXT:    v_mul_hi_u32 v3, s0, v3
-; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX8-NEXT:    s_mul_i32 s6, s0, s3
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX8-NEXT:    s_mul_i32 s5, s0, s5
 ; GFX8-NEXT:    s_add_i32 s0, s2, s7
 ; GFX8-NEXT:    s_add_i32 s0, s0, s5
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT:    s_and_b32 s8, s8, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s8, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -500,8 +500,8 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
 ; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX9-NEXT:    s_and_b32 s9, s9, 1
 ; GFX9-NEXT:    s_add_i32 s8, s8, s9
-; GFX9-NEXT:    s_mul_i32 s9, s1, s4
 ; GFX9-NEXT:    s_mul_i32 s2, s2, s3
+; GFX9-NEXT:    s_mul_i32 s9, s1, s4
 ; GFX9-NEXT:    s_mul_i32 s5, s0, s5
 ; GFX9-NEXT:    s_add_i32 s2, s2, s9
 ; GFX9-NEXT:    s_mul_hi_u32 s1, s1, s3
@@ -522,9 +522,9 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
 ; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s3
 ; GFX10-NEXT:    s_add_u32 s6, s6, s7
 ; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX10-NEXT:    s_mul_i32 s9, s1, s4
-; GFX10-NEXT:    s_and_b32 s7, s7, 1
 ; GFX10-NEXT:    s_mul_i32 s2, s2, s3
+; GFX10-NEXT:    s_and_b32 s7, s7, 1
+; GFX10-NEXT:    s_mul_i32 s9, s1, s4
 ; GFX10-NEXT:    s_add_u32 s6, s6, s8
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    s_mul_i32 s5, s0, s5
@@ -604,12 +604,12 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v0, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v3
-; GFX9-NEXT:    v_mul_lo_u32 v10, v1, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v3
+; GFX9-NEXT:    v_mul_lo_u32 v10, v1, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v1, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
 ; GFX9-NEXT:    v_mul_hi_u32 v0, v0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
@@ -629,8 +629,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
 ; GFX10-NEXT:    v_mul_lo_u32 v6, v1, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v7, v0, v4
 ; GFX10-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GFX10-NEXT:    v_mul_lo_u32 v9, v1, v4
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v3
+; GFX10-NEXT:    v_mul_lo_u32 v9, v1, v4
 ; GFX10-NEXT:    v_mul_lo_u32 v5, v0, v5
 ; GFX10-NEXT:    v_mul_hi_u32 v4, v0, v4
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v3
@@ -665,16 +665,16 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ; GFX7-NEXT:    s_mul_i32 s10, s1, s5
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX7-NEXT:    s_add_u32 s9, s9, s10
-; GFX7-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s4
+; GFX7-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX7-NEXT:    s_mul_i32 s11, s0, s6
 ; GFX7-NEXT:    s_and_b32 s10, s10, 1
-; GFX7-NEXT:    s_add_u32 s9, s9, s11
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
-; GFX7-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX7-NEXT:    s_add_u32 s9, s9, s11
 ; GFX7-NEXT:    v_mul_hi_u32 v4, s0, v3
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s9, v2
+; GFX7-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX7-NEXT:    s_and_b32 s11, s11, 1
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s9, v2
 ; GFX7-NEXT:    s_add_i32 s10, s10, s11
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v5, vcc, s10, v5
@@ -685,14 +685,14 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s2
-; GFX7-NEXT:    v_mov_b32_e32 v5, s6
-; GFX7-NEXT:    s_mul_i32 s5, s2, s5
 ; GFX7-NEXT:    s_mul_i32 s3, s3, s4
+; GFX7-NEXT:    s_mul_i32 s5, s2, s5
 ; GFX7-NEXT:    v_mul_hi_u32 v4, v4, s4
+; GFX7-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX7-NEXT:    s_mul_i32 s8, s0, s4
 ; GFX7-NEXT:    s_mul_i32 s9, s1, s6
-; GFX7-NEXT:    v_mul_hi_u32 v3, s1, v3
 ; GFX7-NEXT:    s_mul_i32 s7, s0, s7
+; GFX7-NEXT:    v_mul_hi_u32 v3, s1, v3
 ; GFX7-NEXT:    v_mul_hi_u32 v5, s0, v5
 ; GFX7-NEXT:    s_add_i32 s0, s3, s5
 ; GFX7-NEXT:    s_add_i32 s0, s0, s9
@@ -723,16 +723,16 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ; GFX8-NEXT:    s_mul_i32 s10, s1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    s_add_u32 s9, s9, s10
-; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s4
+; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX8-NEXT:    s_mul_i32 s11, s0, s6
 ; GFX8-NEXT:    s_and_b32 s10, s10, 1
-; GFX8-NEXT:    s_add_u32 s9, s9, s11
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX8-NEXT:    s_add_u32 s9, s9, s11
 ; GFX8-NEXT:    v_mul_hi_u32 v4, s0, v3
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s9, v2
+; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX8-NEXT:    s_and_b32 s11, s11, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s9, v2
 ; GFX8-NEXT:    s_add_i32 s10, s10, s11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s10, v5
@@ -743,14 +743,14 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NEXT:    v_mov_b32_e32 v5, s6
-; GFX8-NEXT:    s_mul_i32 s5, s2, s5
 ; GFX8-NEXT:    s_mul_i32 s3, s3, s4
+; GFX8-NEXT:    s_mul_i32 s5, s2, s5
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v4, s4
+; GFX8-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX8-NEXT:    s_mul_i32 s8, s0, s4
 ; GFX8-NEXT:    s_mul_i32 s9, s1, s6
-; GFX8-NEXT:    v_mul_hi_u32 v3, s1, v3
 ; GFX8-NEXT:    s_mul_i32 s7, s0, s7
+; GFX8-NEXT:    v_mul_hi_u32 v3, s1, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v5, s0, v5
 ; GFX8-NEXT:    s_add_i32 s0, s3, s5
 ; GFX8-NEXT:    s_add_i32 s0, s0, s9
@@ -801,8 +801,8 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX9-NEXT:    s_and_b32 s11, s11, 1
 ; GFX9-NEXT:    s_add_i32 s12, s12, s11
-; GFX9-NEXT:    s_mul_i32 s11, s2, s5
 ; GFX9-NEXT:    s_mul_i32 s3, s3, s4
+; GFX9-NEXT:    s_mul_i32 s11, s2, s5
 ; GFX9-NEXT:    s_mul_i32 s13, s1, s6
 ; GFX9-NEXT:    s_add_i32 s3, s3, s11
 ; GFX9-NEXT:    s_mul_i32 s7, s0, s7
@@ -812,8 +812,8 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ; GFX9-NEXT:    s_mul_hi_u32 s1, s1, s5
 ; GFX9-NEXT:    s_add_i32 s2, s3, s2
 ; GFX9-NEXT:    s_mul_i32 s8, s0, s4
-; GFX9-NEXT:    s_add_i32 s1, s2, s1
 ; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s6
+; GFX9-NEXT:    s_add_i32 s1, s2, s1
 ; GFX9-NEXT:    s_add_i32 s0, s1, s0
 ; GFX9-NEXT:    s_add_i32 s3, s0, s12
 ; GFX9-NEXT:    s_mov_b32 s0, s8
@@ -916,11 +916,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
 ; GFX7-NEXT:    v_mul_lo_u32 v13, v1, v6
 ; GFX7-NEXT:    v_mul_lo_u32 v7, v0, v7
 ; GFX7-NEXT:    v_mul_hi_u32 v2, v2, v4
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v12
 ; GFX7-NEXT:    v_mul_hi_u32 v1, v1, v5
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v13
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v12
 ; GFX7-NEXT:    v_mul_lo_u32 v8, v0, v4
 ; GFX7-NEXT:    v_mul_hi_u32 v0, v0, v6
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v13
 ; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -966,11 +966,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
 ; GFX8-NEXT:    v_mul_lo_u32 v13, v1, v6
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v0, v7
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v2, v4
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v12
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v1, v5
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v13
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v12
 ; GFX8-NEXT:    v_mul_lo_u32 v8, v0, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v0, v6
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v13
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -1042,28 +1042,28 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
 ; GFX10-NEXT:    v_add_co_u32 v8, s4, v8, v9
 ; GFX10-NEXT:    v_add_co_u32 v9, s5, v10, v11
 ; GFX10-NEXT:    v_mul_hi_u32 v11, v1, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v13, s4, v8, v13
 ; GFX10-NEXT:    v_add_co_u32 v8, s5, v9, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v11, s4, v13, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, v10, v9
 ; GFX10-NEXT:    v_mul_lo_u32 v10, v2, v5
 ; GFX10-NEXT:    v_add_co_u32 v11, s4, v11, v15
-; GFX10-NEXT:    v_mul_hi_u32 v15, v2, v4
 ; GFX10-NEXT:    v_add3_u32 v12, v14, v12, v13
 ; GFX10-NEXT:    v_mul_lo_u32 v13, v1, v6
+; GFX10-NEXT:    v_mul_hi_u32 v15, v2, v4
 ; GFX10-NEXT:    v_mul_hi_u32 v1, v1, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s4
-; GFX10-NEXT:    v_add_co_u32 v2, s4, v11, v9
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v10
+; GFX10-NEXT:    v_add_co_u32 v2, s4, v11, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v0, v6
-; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v4
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v13, v7
+; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v4
 ; GFX10-NEXT:    v_add3_u32 v4, v12, v14, v5
 ; GFX10-NEXT:    v_add3_u32 v1, v3, v15, v1
 ; GFX10-NEXT:    v_add3_u32 v3, v1, v6, v4
@@ -1090,16 +1090,16 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_mul_i32 s18, s1, s9
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX7-NEXT:    s_add_u32 s17, s17, s18
-; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s8
+; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX7-NEXT:    s_mul_i32 s19, s0, s10
 ; GFX7-NEXT:    s_and_b32 s18, s18, 1
-; GFX7-NEXT:    s_add_u32 s17, s17, s19
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s9
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    s_add_u32 s17, s17, s19
 ; GFX7-NEXT:    v_mul_hi_u32 v4, s0, v3
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s17, v2
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s17, v2
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v5, vcc, s18, v5
@@ -1108,30 +1108,30 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_mul_i32 s18, s2, s9
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX7-NEXT:    s_add_u32 s17, s17, s18
-; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX7-NEXT:    s_mul_i32 s19, s1, s10
 ; GFX7-NEXT:    s_and_b32 s18, s18, 1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX7-NEXT:    s_add_u32 s17, s17, s19
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s2
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_mul_hi_u32 v5, v4, s8
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    s_mul_i32 s20, s0, s11
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    s_add_u32 s17, s17, s20
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_mul_hi_u32 v3, s1, v3
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, s17, v5
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, s17, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s10
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX7-NEXT:    v_add_i32_e32 v8, vcc, s18, v8
 ; GFX7-NEXT:    v_mul_hi_u32 v7, s0, v6
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, s18, v8
 ; GFX7-NEXT:    s_mul_i32 s17, s4, s8
 ; GFX7-NEXT:    s_mul_i32 s18, s3, s9
 ; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
@@ -1143,8 +1143,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_and_b32 s18, s18, 1
 ; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; GFX7-NEXT:    s_add_u32 s17, s17, s19
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
@@ -1152,36 +1152,36 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX7-NEXT:    s_add_u32 s17, s17, s20
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s3
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_mul_hi_u32 v7, v5, s8
+; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    s_mul_i32 s21, s0, s12
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    s_add_u32 s17, s17, s21
 ; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX7-NEXT:    v_add_i32_e32 v7, vcc, s17, v7
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    v_mul_hi_u32 v4, v4, s9
+; GFX7-NEXT:    s_and_b32 s19, s19, 1
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, s17, v7
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v11, vcc, s18, v11
 ; GFX7-NEXT:    s_mul_i32 s17, s5, s8
 ; GFX7-NEXT:    s_mul_i32 s18, s4, s9
-; GFX7-NEXT:    s_add_u32 s17, s17, s18
 ; GFX7-NEXT:    v_mul_hi_u32 v8, s1, v6
-; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX7-NEXT:    s_add_u32 s17, s17, s18
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v9, s11
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX7-NEXT:    s_mul_i32 s19, s3, s10
 ; GFX7-NEXT:    s_and_b32 s18, s18, 1
+; GFX7-NEXT:    v_mul_hi_u32 v10, s0, v9
 ; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
 ; GFX7-NEXT:    s_add_u32 s17, s17, s19
-; GFX7-NEXT:    v_mul_hi_u32 v10, s0, v9
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
@@ -1189,8 +1189,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; GFX7-NEXT:    s_add_u32 s17, s17, s20
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
@@ -1198,17 +1198,17 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX7-NEXT:    s_add_u32 s17, s17, s21
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v7, s4
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_mul_hi_u32 v8, v7, s8
+; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    s_mul_i32 s22, s0, s13
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    s_add_u32 s17, s17, s22
 ; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX7-NEXT:    v_add_i32_e32 v8, vcc, s17, v8
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, s17, v8
 ; GFX7-NEXT:    v_mul_hi_u32 v10, v5, s9
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
@@ -1216,27 +1216,27 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_mul_i32 s17, s6, s8
 ; GFX7-NEXT:    s_mul_i32 s18, s5, s9
 ; GFX7-NEXT:    s_add_u32 s17, s17, s18
-; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX7-NEXT:    v_mul_hi_u32 v6, s2, v6
+; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GFX7-NEXT:    s_mul_i32 s19, s4, s10
 ; GFX7-NEXT:    s_and_b32 s18, s18, 1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX7-NEXT:    s_add_u32 s17, s17, s19
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
 ; GFX7-NEXT:    v_mul_hi_u32 v11, s1, v9
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
-; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v12, s12
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX7-NEXT:    s_mul_i32 s20, s3, s11
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    v_mul_hi_u32 v13, s0, v12
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; GFX7-NEXT:    s_add_u32 s17, s17, s20
-; GFX7-NEXT:    v_mul_hi_u32 v13, s0, v12
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
@@ -1244,8 +1244,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
 ; GFX7-NEXT:    s_add_u32 s17, s17, s21
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
@@ -1253,46 +1253,46 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX7-NEXT:    s_add_u32 s17, s17, s22
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; GFX7-NEXT:    v_mov_b32_e32 v8, s5
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_mul_hi_u32 v10, v8, s8
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    s_mul_i32 s23, s0, s14
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    s_add_u32 s17, s17, s23
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_mul_hi_u32 v11, v7, s9
-; GFX7-NEXT:    v_add_i32_e32 v10, vcc, s17, v10
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    s_and_b32 s19, s19, 1
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, s17, v10
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX7-NEXT:    v_add_i32_e32 v17, vcc, s18, v17
 ; GFX7-NEXT:    v_mul_hi_u32 v5, v5, s10
+; GFX7-NEXT:    v_add_i32_e32 v17, vcc, s18, v17
 ; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX7-NEXT:    v_mul_hi_u32 v13, s2, v9
 ; GFX7-NEXT:    v_add_i32_e32 v11, vcc, v17, v11
 ; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GFX7-NEXT:    v_mul_hi_u32 v14, s1, v12
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
-; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v15, s13
-; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX7-NEXT:    v_mul_hi_u32 v16, s0, v15
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
 ; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v16
 ; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GFX7-NEXT:    v_mov_b32_e32 v13, s14
 ; GFX7-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX7-NEXT:    s_mul_i32 s17, s6, s9
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GFX7-NEXT:    v_mov_b32_e32 v13, s14
 ; GFX7-NEXT:    s_mul_i32 s16, s0, s8
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GFX7-NEXT:    s_mul_i32 s5, s5, s10
 ; GFX7-NEXT:    s_mul_i32 s15, s0, s15
 ; GFX7-NEXT:    v_mul_hi_u32 v13, s0, v13
@@ -1301,17 +1301,17 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_mul_i32 s4, s4, s11
 ; GFX7-NEXT:    s_add_i32 s0, s0, s5
 ; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; GFX7-NEXT:    v_mov_b32_e32 v10, s6
 ; GFX7-NEXT:    s_mul_i32 s11, s3, s12
+; GFX7-NEXT:    v_mov_b32_e32 v10, s6
 ; GFX7-NEXT:    s_add_i32 s0, s0, s4
 ; GFX7-NEXT:    s_mul_i32 s12, s2, s13
-; GFX7-NEXT:    s_add_i32 s0, s0, s11
 ; GFX7-NEXT:    v_mul_hi_u32 v10, v10, s8
+; GFX7-NEXT:    s_add_i32 s0, s0, s11
 ; GFX7-NEXT:    s_mul_i32 s13, s1, s14
-; GFX7-NEXT:    s_add_i32 s0, s0, s12
 ; GFX7-NEXT:    v_mul_hi_u32 v8, v8, s9
-; GFX7-NEXT:    s_add_i32 s0, s0, s13
+; GFX7-NEXT:    s_add_i32 s0, s0, s12
 ; GFX7-NEXT:    v_mul_hi_u32 v7, v7, s10
+; GFX7-NEXT:    s_add_i32 s0, s0, s13
 ; GFX7-NEXT:    v_mul_hi_u32 v9, s3, v9
 ; GFX7-NEXT:    s_add_i32 s0, s0, s15
 ; GFX7-NEXT:    v_mul_hi_u32 v11, s2, v12
@@ -1350,16 +1350,16 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_mul_i32 s18, s1, s9
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    s_add_u32 s17, s17, s18
-; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s8
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX8-NEXT:    s_mul_i32 s19, s0, s10
 ; GFX8-NEXT:    s_and_b32 s18, s18, 1
-; GFX8-NEXT:    s_add_u32 s17, s17, s19
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_add_u32 s17, s17, s19
 ; GFX8-NEXT:    v_mul_hi_u32 v4, s0, v3
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s17, v2
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s17, v2
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s18, v5
@@ -1368,30 +1368,30 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_mul_i32 s18, s2, s9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX8-NEXT:    s_add_u32 s17, s17, s18
-; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
 ; GFX8-NEXT:    s_mul_i32 s19, s1, s10
 ; GFX8-NEXT:    s_and_b32 s18, s18, 1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    s_add_u32 s17, s17, s19
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v4, s8
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    s_mul_i32 s20, s0, s11
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    s_add_u32 s17, s17, s20
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_mul_hi_u32 v3, s1, v3
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s17, v5
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s17, v5
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s10
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s18, v8
 ; GFX8-NEXT:    v_mul_hi_u32 v7, s0, v6
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s18, v8
 ; GFX8-NEXT:    s_mul_i32 s17, s4, s8
 ; GFX8-NEXT:    s_mul_i32 s18, s3, s9
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
@@ -1403,8 +1403,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_and_b32 s18, s18, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
 ; GFX8-NEXT:    s_add_u32 s17, s17, s19
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v7
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
@@ -1412,36 +1412,36 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    s_add_u32 s17, s17, s20
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s3
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_mul_hi_u32 v7, v5, s8
+; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    s_mul_i32 s21, s0, s12
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    s_add_u32 s17, s17, s21
 ; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s17, v7
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v4, s9
+; GFX8-NEXT:    s_and_b32 s19, s19, 1
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s17, v7
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s18, v11
 ; GFX8-NEXT:    s_mul_i32 s17, s5, s8
 ; GFX8-NEXT:    s_mul_i32 s18, s4, s9
-; GFX8-NEXT:    s_add_u32 s17, s17, s18
 ; GFX8-NEXT:    v_mul_hi_u32 v8, s1, v6
-; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    s_add_u32 s17, s17, s18
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s11
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    s_mul_i32 s19, s3, s10
 ; GFX8-NEXT:    s_and_b32 s18, s18, 1
+; GFX8-NEXT:    v_mul_hi_u32 v10, s0, v9
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v11, v7
 ; GFX8-NEXT:    s_add_u32 s17, s17, s19
-; GFX8-NEXT:    v_mul_hi_u32 v10, s0, v9
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
@@ -1449,8 +1449,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v10
 ; GFX8-NEXT:    s_add_u32 s17, s17, s20
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
@@ -1458,17 +1458,17 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX8-NEXT:    s_add_u32 s17, s17, s21
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v7, s4
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v7, s8
+; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    s_mul_i32 s22, s0, s13
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    s_add_u32 s17, s17, s22
 ; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s17, v8
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s17, v8
 ; GFX8-NEXT:    v_mul_hi_u32 v10, v5, s9
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
@@ -1476,27 +1476,27 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_mul_i32 s17, s6, s8
 ; GFX8-NEXT:    s_mul_i32 s18, s5, s9
 ; GFX8-NEXT:    s_add_u32 s17, s17, s18
-; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX8-NEXT:    v_mul_hi_u32 v6, s2, v6
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
 ; GFX8-NEXT:    s_mul_i32 s19, s4, s10
 ; GFX8-NEXT:    s_and_b32 s18, s18, 1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX8-NEXT:    s_add_u32 s17, s17, s19
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v14, v10
 ; GFX8-NEXT:    v_mul_hi_u32 v11, s1, v9
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v14, v10
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v12, s12
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    s_mul_i32 s20, s3, s11
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    v_mul_hi_u32 v13, s0, v12
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v10, v8
 ; GFX8-NEXT:    s_add_u32 s17, s17, s20
-; GFX8-NEXT:    v_mul_hi_u32 v13, s0, v12
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v11
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
@@ -1504,8 +1504,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v13
 ; GFX8-NEXT:    s_add_u32 s17, s17, s21
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
@@ -1513,46 +1513,46 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX8-NEXT:    s_add_u32 s17, s17, s22
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s5
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_mul_hi_u32 v10, v8, s8
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    s_mul_i32 s23, s0, s14
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    s_add_u32 s17, s17, s23
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_mul_hi_u32 v11, v7, s9
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s17, v10
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    s_and_b32 s19, s19, 1
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s17, v10
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s18, v17
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v5, s10
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s18, v17
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX8-NEXT:    v_mul_hi_u32 v13, s2, v9
 ; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v17, v11
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v10, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v11, v10
 ; GFX8-NEXT:    v_mul_hi_u32 v14, s1, v12
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v11, v10
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v13
-; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v15, s13
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX8-NEXT:    v_mul_hi_u32 v16, s0, v15
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v14
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v16
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
-; GFX8-NEXT:    v_mov_b32_e32 v13, s14
 ; GFX8-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX8-NEXT:    s_mul_i32 s17, s6, s9
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v6
+; GFX8-NEXT:    v_mov_b32_e32 v13, s14
 ; GFX8-NEXT:    s_mul_i32 s16, s0, s8
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v6
 ; GFX8-NEXT:    s_mul_i32 s5, s5, s10
 ; GFX8-NEXT:    s_mul_i32 s15, s0, s15
 ; GFX8-NEXT:    v_mul_hi_u32 v13, s0, v13
@@ -1561,17 +1561,17 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_mul_i32 s4, s4, s11
 ; GFX8-NEXT:    s_add_i32 s0, s0, s5
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v10, v6
-; GFX8-NEXT:    v_mov_b32_e32 v10, s6
 ; GFX8-NEXT:    s_mul_i32 s11, s3, s12
+; GFX8-NEXT:    v_mov_b32_e32 v10, s6
 ; GFX8-NEXT:    s_add_i32 s0, s0, s4
 ; GFX8-NEXT:    s_mul_i32 s12, s2, s13
-; GFX8-NEXT:    s_add_i32 s0, s0, s11
 ; GFX8-NEXT:    v_mul_hi_u32 v10, v10, s8
+; GFX8-NEXT:    s_add_i32 s0, s0, s11
 ; GFX8-NEXT:    s_mul_i32 s13, s1, s14
-; GFX8-NEXT:    s_add_i32 s0, s0, s12
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v8, s9
-; GFX8-NEXT:    s_add_i32 s0, s0, s13
+; GFX8-NEXT:    s_add_i32 s0, s0, s12
 ; GFX8-NEXT:    v_mul_hi_u32 v7, v7, s10
+; GFX8-NEXT:    s_add_i32 s0, s0, s13
 ; GFX8-NEXT:    v_mul_hi_u32 v9, s3, v9
 ; GFX8-NEXT:    s_add_i32 s0, s0, s15
 ; GFX8-NEXT:    v_mul_hi_u32 v11, s2, v12
@@ -1826,8 +1826,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
 ; GFX9-NEXT:    s_and_b32 s23, s23, 1
 ; GFX9-NEXT:    s_add_i32 s24, s24, s23
-; GFX9-NEXT:    s_mul_i32 s23, s6, s9
 ; GFX9-NEXT:    s_mul_i32 s7, s7, s8
+; GFX9-NEXT:    s_mul_i32 s23, s6, s9
 ; GFX9-NEXT:    s_mul_i32 s25, s5, s10
 ; GFX9-NEXT:    s_add_i32 s7, s7, s23
 ; GFX9-NEXT:    s_mul_i32 s26, s4, s11
@@ -1844,17 +1844,17 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_i32 s7, s7, s15
 ; GFX9-NEXT:    s_mul_hi_u32 s5, s5, s9
 ; GFX9-NEXT:    s_add_i32 s6, s7, s6
-; GFX9-NEXT:    s_add_i32 s5, s6, s5
 ; GFX9-NEXT:    s_mul_hi_u32 s4, s4, s10
-; GFX9-NEXT:    s_add_i32 s4, s5, s4
+; GFX9-NEXT:    s_add_i32 s5, s6, s5
 ; GFX9-NEXT:    s_mul_hi_u32 s3, s3, s11
-; GFX9-NEXT:    s_add_i32 s3, s4, s3
+; GFX9-NEXT:    s_add_i32 s4, s5, s4
 ; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s12
-; GFX9-NEXT:    s_add_i32 s2, s3, s2
+; GFX9-NEXT:    s_add_i32 s3, s4, s3
 ; GFX9-NEXT:    s_mul_hi_u32 s1, s1, s13
+; GFX9-NEXT:    s_add_i32 s2, s3, s2
 ; GFX9-NEXT:    s_mul_i32 s16, s0, s8
-; GFX9-NEXT:    s_add_i32 s1, s2, s1
 ; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s14
+; GFX9-NEXT:    s_add_i32 s1, s2, s1
 ; GFX9-NEXT:    s_add_i32 s0, s1, s0
 ; GFX9-NEXT:    s_add_i32 s7, s0, s24
 ; GFX9-NEXT:    s_mov_b32 s0, s16
@@ -2166,7 +2166,6 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX7-NEXT:    v_mul_hi_u32 v21, v0, v9
 ; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
-; GFX7-NEXT:    v_mul_lo_u32 v22, v0, v11
 ; GFX7-NEXT:    v_add_i32_e32 v18, vcc, v18, v21
 ; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
@@ -2176,8 +2175,8 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX7-NEXT:    v_mul_lo_u32 v21, v2, v9
 ; GFX7-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
 ; GFX7-NEXT:    v_mul_lo_u32 v19, v1, v10
-; GFX7-NEXT:    v_mul_lo_u32 v23, v1, v11
 ; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v21
+; GFX7-NEXT:    v_mul_lo_u32 v22, v0, v11
 ; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
@@ -2186,12 +2185,10 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX7-NEXT:    v_mul_hi_u32 v22, v2, v8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v21
-; GFX7-NEXT:    v_mul_lo_u32 v7, v7, v8
 ; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v22
 ; GFX7-NEXT:    v_mul_hi_u32 v22, v1, v9
 ; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v21
-; GFX7-NEXT:    v_mul_lo_u32 v15, v0, v15
 ; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v22
 ; GFX7-NEXT:    v_mul_hi_u32 v22, v0, v10
 ; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
@@ -2206,6 +2203,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
 ; GFX7-NEXT:    v_mul_lo_u32 v20, v2, v10
 ; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v22
+; GFX7-NEXT:    v_mul_lo_u32 v23, v1, v11
 ; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v21, v20
 ; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
@@ -2245,6 +2243,8 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v23, v22
 ; GFX7-NEXT:    v_mul_lo_u32 v23, v2, v11
+; GFX7-NEXT:    v_mul_lo_u32 v7, v7, v8
+; GFX7-NEXT:    v_mul_lo_u32 v15, v0, v15
 ; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
 ; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
@@ -2342,9 +2342,9 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX7-NEXT:    v_mul_hi_u32 v12, v2, v12
 ; GFX7-NEXT:    v_mul_lo_u32 v2, v2, v13
 ; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GFX7-NEXT:    v_mul_hi_u32 v13, v1, v13
 ; GFX7-NEXT:    v_mul_lo_u32 v1, v1, v14
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
@@ -2352,8 +2352,8 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
-; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
 ; GFX7-NEXT:    v_mul_hi_u32 v0, v0, v14
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
@@ -2392,7 +2392,6 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX8-NEXT:    v_mul_hi_u32 v21, v0, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v20
-; GFX8-NEXT:    v_mul_lo_u32 v22, v0, v11
 ; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v21
 ; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v20
@@ -2402,8 +2401,8 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX8-NEXT:    v_mul_lo_u32 v21, v2, v9
 ; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v19, v18
 ; GFX8-NEXT:    v_mul_lo_u32 v19, v1, v10
-; GFX8-NEXT:    v_mul_lo_u32 v23, v1, v11
 ; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v21
+; GFX8-NEXT:    v_mul_lo_u32 v22, v0, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v20, v19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
@@ -2412,12 +2411,10 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX8-NEXT:    v_mul_hi_u32 v22, v2, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v21
-; GFX8-NEXT:    v_mul_lo_u32 v7, v7, v8
 ; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v22
 ; GFX8-NEXT:    v_mul_hi_u32 v22, v1, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v21
-; GFX8-NEXT:    v_mul_lo_u32 v15, v0, v15
 ; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v22
 ; GFX8-NEXT:    v_mul_hi_u32 v22, v0, v10
 ; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
@@ -2432,6 +2429,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v20, v19
 ; GFX8-NEXT:    v_mul_lo_u32 v20, v2, v10
 ; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v22
+; GFX8-NEXT:    v_mul_lo_u32 v23, v1, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v21, v20
 ; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
@@ -2471,6 +2469,8 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v23, v22
 ; GFX8-NEXT:    v_mul_lo_u32 v23, v2, v11
+; GFX8-NEXT:    v_mul_lo_u32 v7, v7, v8
+; GFX8-NEXT:    v_mul_lo_u32 v15, v0, v15
 ; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
 ; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
@@ -2568,9 +2568,9 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX8-NEXT:    v_mul_hi_u32 v12, v2, v12
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v2, v13
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v6, v5
 ; GFX8-NEXT:    v_mul_hi_u32 v13, v1, v13
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v14
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v6, v5
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
@@ -2578,8 +2578,8 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v15
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v8
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v9
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v10
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v0, v14
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v10
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v11
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v12
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v13
@@ -2616,8 +2616,8 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v16, vcc, v17, v16
 ; GFX9-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v19, v21
 ; GFX9-NEXT:    v_add_u32_e32 v17, v20, v17
+; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v19, v21
 ; GFX9-NEXT:    v_mul_lo_u32 v21, v3, v8
 ; GFX9-NEXT:    v_mul_lo_u32 v22, v2, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
@@ -2769,8 +2769,8 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX9-NEXT:    v_mul_lo_u32 v23, v5, v10
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v5, v9
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v11
-; GFX9-NEXT:    v_mul_hi_u32 v10, v2, v12
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v12
+; GFX9-NEXT:    v_mul_hi_u32 v10, v2, v12
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v13
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v1, v13
 ; GFX9-NEXT:    v_mul_lo_u32 v12, v1, v14
@@ -2778,8 +2778,8 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX9-NEXT:    v_add3_u32 v7, v7, v23, v24
 ; GFX9-NEXT:    v_add3_u32 v2, v7, v3, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v0, v8
-; GFX9-NEXT:    v_add3_u32 v2, v2, v12, v13
 ; GFX9-NEXT:    v_mul_hi_u32 v0, v0, v14
+; GFX9-NEXT:    v_add3_u32 v2, v2, v12, v13
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v6, v5
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v4, v9
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v10, v11
@@ -2807,53 +2807,53 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX10-NEXT:    v_mul_lo_u32 v25, v1, v10
 ; GFX10-NEXT:    v_mul_hi_u32 v23, v0, v9
 ; GFX10-NEXT:    v_add_co_u32 v16, s4, v16, v17
-; GFX10-NEXT:    v_mul_hi_u32 v27, v0, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s4
+; GFX10-NEXT:    v_mul_hi_u32 v27, v0, v10
 ; GFX10-NEXT:    v_mul_hi_u32 v29, v3, v9
-; GFX10-NEXT:    v_mul_hi_u32 v31, v4, v9
 ; GFX10-NEXT:    v_add_co_u32 v16, s4, v16, v18
-; GFX10-NEXT:    v_mul_lo_u32 v7, v7, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v19, s4, v19, v20
 ; GFX10-NEXT:    v_mul_lo_u32 v20, v2, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s4
-; GFX10-NEXT:    v_mul_lo_u32 v15, v0, v15
 ; GFX10-NEXT:    v_add_nc_u32_e32 v17, v17, v18
 ; GFX10-NEXT:    v_mul_lo_u32 v18, v0, v10
+; GFX10-NEXT:    v_mul_hi_u32 v31, v4, v9
+; GFX10-NEXT:    v_mul_lo_u32 v7, v7, v8
+; GFX10-NEXT:    v_mul_lo_u32 v15, v0, v15
 ; GFX10-NEXT:    v_add_co_u32 v18, s4, v19, v18
 ; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v20, s4, v22, v20
-; GFX10-NEXT:    v_mul_lo_u32 v22, v0, v11
 ; GFX10-NEXT:    v_add_co_u32 v18, s5, v18, v21
-; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s5
+; GFX10-NEXT:    v_mul_lo_u32 v22, v0, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v20, s4, v20, v25
-; GFX10-NEXT:    v_add_co_u32 v18, s5, v18, v23
-; GFX10-NEXT:    v_mul_hi_u32 v23, v1, v9
 ; GFX10-NEXT:    v_add3_u32 v19, v24, v19, v21
 ; GFX10-NEXT:    v_mul_hi_u32 v21, v2, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s4
+; GFX10-NEXT:    v_add_co_u32 v18, s5, v18, v23
 ; GFX10-NEXT:    v_add_co_u32 v20, s4, v20, v22
-; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s5
+; GFX10-NEXT:    v_mul_hi_u32 v23, v1, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
-; GFX10-NEXT:    v_add_co_u32 v17, s5, v18, v17
-; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s5
 ; GFX10-NEXT:    v_add_co_u32 v20, s4, v20, v21
+; GFX10-NEXT:    v_add_co_u32 v17, s5, v18, v17
 ; GFX10-NEXT:    v_add3_u32 v21, v26, v24, v25
 ; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s4
-; GFX10-NEXT:    v_mul_lo_u32 v25, v4, v8
-; GFX10-NEXT:    v_mul_lo_u32 v26, v3, v9
 ; GFX10-NEXT:    v_add_co_u32 v20, s4, v20, v23
-; GFX10-NEXT:    v_add3_u32 v18, v19, v22, v18
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s4
+; GFX10-NEXT:    v_mul_lo_u32 v25, v4, v8
+; GFX10-NEXT:    v_mul_lo_u32 v26, v3, v9
 ; GFX10-NEXT:    v_add_co_u32 v20, s5, v20, v27
+; GFX10-NEXT:    v_add3_u32 v18, v19, v22, v18
 ; GFX10-NEXT:    v_add3_u32 v19, v21, v24, v23
 ; GFX10-NEXT:    v_mul_lo_u32 v21, v2, v10
-; GFX10-NEXT:    v_add_co_u32 v22, s4, v25, v26
 ; GFX10-NEXT:    v_mul_lo_u32 v24, v1, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v22, s4, v25, v26
 ; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
 ; GFX10-NEXT:    v_mul_hi_u32 v26, v3, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s5
 ; GFX10-NEXT:    v_add_co_u32 v21, s4, v22, v21
 ; GFX10-NEXT:    v_mul_lo_u32 v22, v0, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v27, 0, 1, s4
@@ -2865,116 +2865,116 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX10-NEXT:    v_mul_hi_u32 v22, v2, v9
 ; GFX10-NEXT:    v_add3_u32 v24, v25, v27, v24
 ; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
-; GFX10-NEXT:    v_add3_u32 v19, v19, v23, v20
 ; GFX10-NEXT:    v_add_co_u32 v21, s4, v21, v26
-; GFX10-NEXT:    v_mul_hi_u32 v20, v1, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s4
-; GFX10-NEXT:    v_mul_hi_u32 v27, v0, v11
+; GFX10-NEXT:    v_add3_u32 v19, v19, v23, v20
+; GFX10-NEXT:    v_mul_hi_u32 v20, v1, v10
 ; GFX10-NEXT:    v_add_co_u32 v21, s4, v21, v22
-; GFX10-NEXT:    v_mul_lo_u32 v22, v5, v8
 ; GFX10-NEXT:    v_add3_u32 v23, v24, v25, v26
+; GFX10-NEXT:    v_mul_lo_u32 v22, v5, v8
 ; GFX10-NEXT:    v_mul_lo_u32 v24, v4, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
-; GFX10-NEXT:    v_add_co_u32 v20, s4, v21, v20
 ; GFX10-NEXT:    v_mul_lo_u32 v26, v3, v10
+; GFX10-NEXT:    v_add_co_u32 v20, s4, v21, v20
 ; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s4
-; GFX10-NEXT:    v_add_co_u32 v20, s5, v20, v27
+; GFX10-NEXT:    v_mul_hi_u32 v27, v0, v11
 ; GFX10-NEXT:    v_add_co_u32 v22, s4, v22, v24
 ; GFX10-NEXT:    v_add3_u32 v21, v23, v25, v21
 ; GFX10-NEXT:    v_mul_lo_u32 v23, v2, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s5
 ; GFX10-NEXT:    v_add_co_u32 v22, s4, v22, v26
 ; GFX10-NEXT:    v_mul_lo_u32 v26, v1, v12
+; GFX10-NEXT:    v_add_co_u32 v20, s5, v20, v27
 ; GFX10-NEXT:    v_cndmask_b32_e64 v27, 0, 1, s4
-; GFX10-NEXT:    v_add_co_u32 v19, s5, v20, v19
 ; GFX10-NEXT:    v_add_co_u32 v22, s4, v22, v23
 ; GFX10-NEXT:    v_mul_lo_u32 v23, v0, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v28, 0, 1, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s5
 ; GFX10-NEXT:    v_add_co_u32 v22, s4, v22, v26
 ; GFX10-NEXT:    v_mul_hi_u32 v26, v4, v8
+; GFX10-NEXT:    v_add_co_u32 v19, s5, v20, v19
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v30, 0, 1, s4
-; GFX10-NEXT:    v_add3_u32 v20, v21, v25, v20
 ; GFX10-NEXT:    v_add_co_u32 v22, s4, v22, v23
 ; GFX10-NEXT:    v_add3_u32 v23, v24, v27, v28
 ; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s4
-; GFX10-NEXT:    v_mul_lo_u32 v27, v6, v8
-; GFX10-NEXT:    v_mul_lo_u32 v28, v5, v9
+; GFX10-NEXT:    v_add3_u32 v20, v21, v25, v20
 ; GFX10-NEXT:    v_add_co_u32 v21, s4, v22, v26
 ; GFX10-NEXT:    v_mul_hi_u32 v22, v2, v10
 ; GFX10-NEXT:    v_add3_u32 v23, v23, v30, v24
 ; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s4
-; GFX10-NEXT:    v_mul_hi_u32 v26, v1, v11
 ; GFX10-NEXT:    v_add_co_u32 v21, s4, v21, v29
-; GFX10-NEXT:    v_mul_hi_u32 v29, v0, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
+; GFX10-NEXT:    v_mul_hi_u32 v26, v1, v11
+; GFX10-NEXT:    v_mul_lo_u32 v27, v6, v8
+; GFX10-NEXT:    v_mul_lo_u32 v28, v5, v9
 ; GFX10-NEXT:    v_add_co_u32 v21, s4, v21, v22
 ; GFX10-NEXT:    v_add3_u32 v23, v23, v24, v25
 ; GFX10-NEXT:    v_mul_lo_u32 v24, v4, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s4
-; GFX10-NEXT:    v_add_co_u32 v25, s4, v27, v28
 ; GFX10-NEXT:    v_add_co_u32 v21, s5, v21, v26
-; GFX10-NEXT:    v_mul_lo_u32 v27, v3, v11
+; GFX10-NEXT:    v_add_co_u32 v25, s4, v27, v28
 ; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s5
+; GFX10-NEXT:    v_mul_lo_u32 v27, v3, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v28, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v24, s4, v25, v24
-; GFX10-NEXT:    v_add_co_u32 v21, s5, v21, v29
+; GFX10-NEXT:    v_mul_hi_u32 v29, v0, v12
 ; GFX10-NEXT:    v_add3_u32 v22, v23, v22, v26
 ; GFX10-NEXT:    v_mul_lo_u32 v23, v2, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v24, s4, v24, v27
 ; GFX10-NEXT:    v_mul_lo_u32 v27, v1, v13
+; GFX10-NEXT:    v_add_co_u32 v21, s5, v21, v29
 ; GFX10-NEXT:    v_cndmask_b32_e64 v29, 0, 1, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v20, s5, v21, v20
 ; GFX10-NEXT:    v_add_co_u32 v23, s4, v24, v23
 ; GFX10-NEXT:    v_mul_lo_u32 v24, v0, v14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v30, 0, 1, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s5
 ; GFX10-NEXT:    v_add_co_u32 v23, s4, v23, v27
 ; GFX10-NEXT:    v_mul_hi_u32 v27, v5, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v32, 0, 1, s4
-; GFX10-NEXT:    v_add3_u32 v21, v22, v26, v21
-; GFX10-NEXT:    v_mul_hi_u32 v26, v2, v11
+; GFX10-NEXT:    v_add_co_u32 v20, s5, v21, v20
 ; GFX10-NEXT:    v_add_co_u32 v23, s4, v23, v24
 ; GFX10-NEXT:    v_add3_u32 v24, v28, v25, v29
 ; GFX10-NEXT:    v_cndmask_b32_e64 v28, 0, 1, s4
 ; GFX10-NEXT:    v_mul_hi_u32 v25, v3, v10
-; GFX10-NEXT:    v_mul_lo_u32 v29, v3, v12
 ; GFX10-NEXT:    v_add_co_u32 v23, s4, v23, v27
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s5
 ; GFX10-NEXT:    v_add3_u32 v24, v24, v30, v32
 ; GFX10-NEXT:    v_cndmask_b32_e64 v27, 0, 1, s4
-; GFX10-NEXT:    v_mul_hi_u32 v3, v3, v11
 ; GFX10-NEXT:    v_add_co_u32 v23, s4, v23, v31
+; GFX10-NEXT:    v_add3_u32 v21, v22, v26, v21
+; GFX10-NEXT:    v_mul_hi_u32 v26, v2, v11
 ; GFX10-NEXT:    v_add3_u32 v22, v24, v28, v27
 ; GFX10-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s4
-; GFX10-NEXT:    v_mul_lo_u32 v28, v6, v9
 ; GFX10-NEXT:    v_add_co_u32 v23, s4, v23, v25
-; GFX10-NEXT:    v_mul_hi_u32 v27, v1, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
-; GFX10-NEXT:    v_mul_hi_u32 v6, v6, v8
+; GFX10-NEXT:    v_mul_lo_u32 v28, v6, v9
+; GFX10-NEXT:    v_mul_lo_u32 v29, v3, v12
+; GFX10-NEXT:    v_mul_hi_u32 v27, v1, v12
 ; GFX10-NEXT:    v_add_co_u32 v23, s4, v23, v26
 ; GFX10-NEXT:    v_add3_u32 v22, v22, v24, v25
 ; GFX10-NEXT:    v_mul_lo_u32 v24, v5, v10
 ; GFX10-NEXT:    v_mul_lo_u32 v25, v4, v11
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v7, v28
 ; GFX10-NEXT:    v_mul_lo_u32 v28, v2, v13
+; GFX10-NEXT:    v_mul_hi_u32 v6, v6, v8
 ; GFX10-NEXT:    v_mul_hi_u32 v5, v5, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v23, s4, v23, v27
-; GFX10-NEXT:    v_mul_hi_u32 v4, v4, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v27, 0, 1, s4
 ; GFX10-NEXT:    v_add3_u32 v7, v7, v24, v25
 ; GFX10-NEXT:    v_mul_lo_u32 v24, v1, v14
 ; GFX10-NEXT:    v_mul_hi_u32 v25, v0, v13
+; GFX10-NEXT:    v_mul_hi_u32 v4, v4, v10
+; GFX10-NEXT:    v_mul_hi_u32 v3, v3, v11
+; GFX10-NEXT:    v_add3_u32 v7, v7, v29, v28
+; GFX10-NEXT:    v_cndmask_b32_e64 v27, 0, 1, s4
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v2, v12
 ; GFX10-NEXT:    v_mul_hi_u32 v1, v1, v13
-; GFX10-NEXT:    v_add3_u32 v7, v7, v29, v28
-; GFX10-NEXT:    v_add3_u32 v22, v22, v26, v27
 ; GFX10-NEXT:    v_add3_u32 v7, v7, v24, v15
 ; GFX10-NEXT:    v_add_co_u32 v9, s4, v23, v25
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s4
+; GFX10-NEXT:    v_add3_u32 v22, v22, v26, v27
 ; GFX10-NEXT:    v_add3_u32 v5, v7, v6, v5
 ; GFX10-NEXT:    v_add_co_u32 v6, s4, v9, v21
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index d5acef7a00d01..bec739bd0bf06 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -430,8 +430,8 @@ define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1)
 ; GFX6-LABEL: s_orn2_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
+; GFX6-NEXT:    s_and_b32 s2, s2, s1
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
 ; GFX6-NEXT:    s_and_b32 s1, s4, s1
@@ -459,8 +459,8 @@ define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inre
 ; GFX6-LABEL: s_orn2_v2i16_commute:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
+; GFX6-NEXT:    s_and_b32 s2, s2, s1
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
 ; GFX6-NEXT:    s_and_b32 s1, s4, s1
@@ -488,8 +488,8 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2
 ; GFX6-LABEL: s_orn2_v2i16_multi_use:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
+; GFX6-NEXT:    s_and_b32 s2, s2, s1
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
 ; GFX6-NEXT:    s_and_b32 s1, s4, s1
@@ -526,8 +526,8 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
 ; GFX6-NEXT:    s_and_b32 s2, s2, s1
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_and_b32 s3, s4, s1
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
+; GFX6-NEXT:    s_and_b32 s3, s4, s1
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
 ; GFX6-NEXT:    s_and_b32 s1, s6, s1
@@ -633,11 +633,11 @@ define amdgpu_ps i64 @s_orn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1)
 ; GFX6-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX6-NEXT:    s_and_b32 s1, s2, s3
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s2, s4, s3
 ; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX6-NEXT:    s_and_b32 s2, s4, s3
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s4, s6, s3
 ; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
+; GFX6-NEXT:    s_and_b32 s4, s6, s3
 ; GFX6-NEXT:    s_or_b32 s2, s2, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
 ; GFX6-NEXT:    s_and_b32 s3, s8, s3
@@ -676,11 +676,11 @@ define amdgpu_ps i64 @s_orn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inre
 ; GFX6-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX6-NEXT:    s_and_b32 s1, s2, s3
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s2, s4, s3
 ; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX6-NEXT:    s_and_b32 s2, s4, s3
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s4, s6, s3
 ; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
+; GFX6-NEXT:    s_and_b32 s4, s6, s3
 ; GFX6-NEXT:    s_or_b32 s2, s2, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
 ; GFX6-NEXT:    s_and_b32 s3, s8, s3
@@ -719,11 +719,11 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_use(<4 x i16> inreg %src0, <4
 ; GFX6-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX6-NEXT:    s_and_b32 s1, s2, s3
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s2, s4, s3
 ; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
+; GFX6-NEXT:    s_and_b32 s2, s4, s3
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s4, s6, s3
 ; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
+; GFX6-NEXT:    s_and_b32 s4, s6, s3
 ; GFX6-NEXT:    s_or_b32 s2, s2, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
 ; GFX6-NEXT:    s_and_b32 s3, s8, s3
@@ -773,8 +773,8 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_foldable_use(<4 x i16> inreg %
 ; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
 ; GFX6-NEXT:    s_and_b32 s2, s4, s14
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s3, s6, s14
 ; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
+; GFX6-NEXT:    s_and_b32 s3, s6, s14
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
 ; GFX6-NEXT:    s_and_b32 s4, s8, s14
@@ -831,8 +831,8 @@ define <4 x i16> @v_orn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX6-NEXT:    v_and_b32_e32 v2, v2, v8
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, v4, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX6-NEXT:    v_and_b32_e32 v3, v4, v8
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX6-NEXT:    v_and_b32_e32 v4, v6, v8

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
index 472b315bc626d..28e679558cc64 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
@@ -375,10 +375,10 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rndne_f16_e32 v2, v0
-; GFX8-NEXT:    v_rndne_f16_e32 v3, v1
 ; GFX8-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_mov_b32_e32 v4, 16
+; GFX8-NEXT:    v_rndne_f16_e32 v3, v1
 ; GFX8-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v4, 16
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -389,10 +389,10 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rndne_f16_e32 v2, v0
-; GFX9-NEXT:    v_rndne_f16_e32 v3, v1
 ; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX9-NEXT:    v_rndne_f16_e32 v3, v1
 ; GFX9-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX9-NEXT:    v_and_or_b32 v0, v2, v4, v0
 ; GFX9-NEXT:    v_and_or_b32 v1, v3, v4, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -402,8 +402,8 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_rndne_f16_e32 v2, v0
-; GFX10-NEXT:    v_rndne_f16_e32 v3, v1
 ; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_rndne_f16_e32 v3, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX10-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_and_or_b32 v0, v2, v4, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index ebdac64a8301b..ba62cb1af0d90 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -10,8 +10,8 @@ define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
 ; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
-; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
+; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
 ; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
@@ -25,8 +25,8 @@ define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
 ; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
-; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
+; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
 ; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v2, 0x7fff, v2
 ; GFX8-NEXT:    v_max_i16_e32 v1, v3, v1
@@ -62,8 +62,8 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 25
 ; GFX6-NEXT:    s_min_i32 s3, s0, 0
-; GFX6-NEXT:    s_max_i32 s2, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 25
+; GFX6-NEXT:    s_max_i32 s2, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
 ; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
 ; GFX6-NEXT:    s_max_i32 s1, s3, s1
@@ -84,8 +84,8 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_max_i32 s1, s3, s1
 ; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
+; GFX8-NEXT:    s_max_i32 s1, s3, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s5
 ; GFX8-NEXT:    s_min_i32 s1, s1, s3
@@ -124,8 +124,8 @@ define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
-; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
 ; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
@@ -139,8 +139,8 @@ define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
-; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
 ; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v2, 0x7fff, v2
 ; GFX8-NEXT:    v_max_i16_e32 v1, v3, v1
@@ -176,8 +176,8 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX6-NEXT:    s_min_i32 s3, s0, 0
-; GFX6-NEXT:    s_max_i32 s2, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX6-NEXT:    s_max_i32 s2, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
 ; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
 ; GFX6-NEXT:    s_max_i32 s1, s3, s1
@@ -198,8 +198,8 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_max_i32 s1, s3, s1
 ; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
+; GFX8-NEXT:    s_max_i32 s1, s3, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s5
 ; GFX8-NEXT:    s_min_i32 s1, s1, s3
@@ -242,9 +242,9 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
 ; GFX6-NEXT:    v_max_i32_e32 v1, v5, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v4
@@ -277,9 +277,9 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_min_i16_e32 v5, 0, v0
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_sub_u16_e32 v5, s5, v5
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v4, 0, v0
+; GFX8-NEXT:    v_sub_u16_e32 v5, s5, v5
 ; GFX8-NEXT:    v_sub_u16_e32 v4, s4, v4
 ; GFX8-NEXT:    v_max_i16_e32 v1, v5, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
@@ -290,8 +290,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_sub_u16_e32 v1, s4, v1
 ; GFX8-NEXT:    v_max_i16_e32 v2, v4, v2
 ; GFX8-NEXT:    v_min_i16_e32 v1, v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX8-NEXT:    v_add_u16_e32 v1, v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -349,9 +349,9 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_min_i32 s7, s0, 0
 ; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_sub_i32 s7, s5, s7
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    s_max_i32 s6, s0, 0
+; GFX6-NEXT:    s_sub_i32 s7, s5, s7
 ; GFX6-NEXT:    s_sub_i32 s6, s4, s6
 ; GFX6-NEXT:    s_max_i32 s1, s7, s1
 ; GFX6-NEXT:    s_min_i32 s1, s1, s6
@@ -381,17 +381,17 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s8, 0
-; GFX8-NEXT:    s_max_i32 s9, s7, s8
 ; GFX8-NEXT:    s_movk_i32 s6, 0x8000
+; GFX8-NEXT:    s_max_i32 s9, s7, s8
 ; GFX8-NEXT:    s_min_i32 s7, s7, s8
-; GFX8-NEXT:    s_sub_i32 s7, s6, s7
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
+; GFX8-NEXT:    s_sub_i32 s7, s6, s7
 ; GFX8-NEXT:    s_movk_i32 s5, 0x7fff
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_max_i32 s1, s7, s1
 ; GFX8-NEXT:    s_sub_i32 s9, s5, s9
+; GFX8-NEXT:    s_max_i32 s1, s7, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s9
 ; GFX8-NEXT:    s_min_i32 s1, s1, s7
@@ -404,8 +404,8 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_sub_i32 s3, s6, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_max_i32 s2, s3, s2
 ; GFX8-NEXT:    s_sub_i32 s5, s5, s7
+; GFX8-NEXT:    s_max_i32 s2, s3, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s5
 ; GFX8-NEXT:    s_min_i32 s2, s2, s3
@@ -427,8 +427,8 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX9-NEXT:    s_mov_b32 s2, 0x80008
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
@@ -451,8 +451,8 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX10-NEXT:    s_mov_b32 s2, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
@@ -488,9 +488,9 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s4, v8
 ; GFX6-NEXT:    v_max_i32_e32 v1, v10, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v8
@@ -507,9 +507,9 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
 ; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v2
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
 ; GFX6-NEXT:    v_max_i32_e32 v3, v6, v3
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v5
@@ -517,24 +517,24 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v3
-; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
-; GFX6-NEXT:    s_movk_i32 s4, 0xff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
+; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v11, v6
+; GFX6-NEXT:    s_movk_i32 s4, 0xff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
 ; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
-; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
+; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
+; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
@@ -555,9 +555,9 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_sub_u16_e32 v10, s5, v10
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v8, 0, v0
+; GFX8-NEXT:    v_sub_u16_e32 v10, s5, v10
 ; GFX8-NEXT:    v_sub_u16_e32 v8, s4, v8
 ; GFX8-NEXT:    v_max_i16_e32 v1, v10, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v8
@@ -572,17 +572,17 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_add_u16_e32 v1, v3, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v6
 ; GFX8-NEXT:    v_min_i16_e32 v6, 0, v2
-; GFX8-NEXT:    v_sub_u16_e32 v6, s5, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v4, 0, v2
+; GFX8-NEXT:    v_sub_u16_e32 v6, s5, v6
 ; GFX8-NEXT:    v_sub_u16_e32 v4, v9, v4
 ; GFX8-NEXT:    v_max_i16_e32 v3, v6, v3
 ; GFX8-NEXT:    v_min_i16_e32 v3, v3, v4
 ; GFX8-NEXT:    v_add_u16_e32 v2, v2, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v5
 ; GFX8-NEXT:    v_min_i16_e32 v6, 0, v3
-; GFX8-NEXT:    v_max_i16_e32 v5, 0, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
+; GFX8-NEXT:    v_max_i16_e32 v5, 0, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v6, 0x8000, v6
 ; GFX8-NEXT:    v_sub_u16_e32 v5, v9, v5
 ; GFX8-NEXT:    v_max_i16_e32 v4, v6, v4
@@ -607,20 +607,20 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0xffff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v8, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX9-NEXT:    v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX9-NEXT:    v_and_or_b32 v2, v3, v8, v2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v8, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-NEXT:    v_and_or_b32 v2, v3, v8, v2
 ; GFX9-NEXT:    v_and_or_b32 v1, v1, v8, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX9-NEXT:    v_and_or_b32 v3, v6, v8, v3
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_and_or_b32 v3, v6, v8, v3
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_add_i16 v1, v2, v3 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 8
@@ -691,9 +691,9 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_sub_i32 s11, s9, s11
 ; GFX6-NEXT:    s_brev_b32 s8, -2
 ; GFX6-NEXT:    s_max_i32 s10, s0, 0
+; GFX6-NEXT:    s_sub_i32 s11, s9, s11
 ; GFX6-NEXT:    s_sub_i32 s10, s8, s10
 ; GFX6-NEXT:    s_max_i32 s1, s11, s1
 ; GFX6-NEXT:    s_min_i32 s1, s1, s10
@@ -718,14 +718,14 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_add_i32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s4, 24
 ; GFX6-NEXT:    s_min_i32 s6, s3, 0
-; GFX6-NEXT:    s_max_i32 s5, s3, 0
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
+; GFX6-NEXT:    s_max_i32 s5, s3, 0
 ; GFX6-NEXT:    s_sub_i32 s6, s9, s6
 ; GFX6-NEXT:    s_sub_i32 s5, s8, s5
 ; GFX6-NEXT:    s_max_i32 s4, s6, s4
 ; GFX6-NEXT:    s_min_i32 s4, s4, s5
-; GFX6-NEXT:    s_add_i32 s3, s3, s4
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
+; GFX6-NEXT:    s_add_i32 s3, s3, s4
 ; GFX6-NEXT:    s_movk_i32 s4, 0xff
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX6-NEXT:    s_and_b32 s1, s1, s4
@@ -734,8 +734,8 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, s4
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 24
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s3, s4
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
@@ -751,19 +751,19 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s11, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s12, 0
-; GFX8-NEXT:    s_max_i32 s13, s11, s12
 ; GFX8-NEXT:    s_movk_i32 s10, 0x8000
+; GFX8-NEXT:    s_max_i32 s13, s11, s12
 ; GFX8-NEXT:    s_min_i32 s11, s11, s12
-; GFX8-NEXT:    s_sub_i32 s11, s10, s11
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s8
+; GFX8-NEXT:    s_sub_i32 s11, s10, s11
 ; GFX8-NEXT:    s_movk_i32 s9, 0x7fff
 ; GFX8-NEXT:    s_sext_i32_i16 s11, s11
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_max_i32 s1, s11, s1
 ; GFX8-NEXT:    s_sub_i32 s13, s9, s13
+; GFX8-NEXT:    s_max_i32 s1, s11, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s11, s13
 ; GFX8-NEXT:    s_min_i32 s1, s1, s11
@@ -776,8 +776,8 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_sub_i32 s5, s10, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_max_i32 s2, s5, s2
 ; GFX8-NEXT:    s_sub_i32 s11, s9, s11
+; GFX8-NEXT:    s_max_i32 s2, s5, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s11
 ; GFX8-NEXT:    s_min_i32 s2, s2, s5
@@ -790,8 +790,8 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_sub_i32 s5, s10, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_max_i32 s3, s5, s3
 ; GFX8-NEXT:    s_sub_i32 s6, s9, s6
+; GFX8-NEXT:    s_max_i32 s3, s5, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s6
 ; GFX8-NEXT:    s_min_i32 s3, s3, s5
@@ -804,15 +804,15 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_sub_i32 s5, s10, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_max_i32 s4, s5, s4
 ; GFX8-NEXT:    s_sub_i32 s6, s9, s6
+; GFX8-NEXT:    s_max_i32 s4, s5, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
-; GFX8-NEXT:    s_add_i32 s3, s3, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, s8
+; GFX8-NEXT:    s_add_i32 s3, s3, s4
 ; GFX8-NEXT:    s_movk_i32 s4, 0xff
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
@@ -823,8 +823,8 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s2, s4
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, s8
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s3, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
@@ -838,19 +838,19 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX9-NEXT:    s_mov_b32 s4, 0x80008
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, s4
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s8, s9
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, s4
@@ -859,19 +859,19 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT:    s_lshr_b32 s7, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s4, s6, s4
 ; GFX9-NEXT:    s_lshl_b32 s6, s7, 8
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    v_pk_add_i16 v1, s3, v1 clamp
 ; GFX9-NEXT:    s_mov_b32 s2, 8
+; GFX9-NEXT:    v_pk_add_i16 v1, s3, v1 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    s_mov_b32 s5, 24
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s0, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, s0, v1
-; GFX9-NEXT:    s_mov_b32 s5, 24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
@@ -885,8 +885,8 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
-; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-NEXT:    s_mov_b32 s3, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s7, s1, 24
@@ -904,8 +904,8 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
 ; GFX10-NEXT:    s_lshl_b32 s4, s6, 8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s8
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX10-NEXT:    v_pk_add_i16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_add_i16 v1, s2, s3 clamp
@@ -935,8 +935,8 @@ define i24 @v_saddsat_i24(i24 %lhs, i24 %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
-; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
 ; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
@@ -987,8 +987,8 @@ define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX6-NEXT:    s_min_i32 s3, s0, 0
-; GFX6-NEXT:    s_max_i32 s2, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX6-NEXT:    s_max_i32 s2, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
 ; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
 ; GFX6-NEXT:    s_max_i32 s1, s3, s1
@@ -1211,9 +1211,9 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
 ; GFX6-NEXT:    v_max_i32_e32 v2, v5, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
@@ -1232,9 +1232,9 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_brev_b32 s5, 1
 ; GFX8-NEXT:    v_min_i32_e32 v5, 0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s5, v5
 ; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_max_i32_e32 v4, 0, v0
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s5, v5
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s4, v4
 ; GFX8-NEXT:    v_max_i32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_min_i32_e32 v2, v2, v4
@@ -1271,9 +1271,9 @@ define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    s_min_i32 s7, s0, 0
-; GFX6-NEXT:    s_sub_i32 s7, s5, s7
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    s_max_i32 s6, s0, 0
+; GFX6-NEXT:    s_sub_i32 s7, s5, s7
 ; GFX6-NEXT:    s_sub_i32 s6, s4, s6
 ; GFX6-NEXT:    s_max_i32 s2, s7, s2
 ; GFX6-NEXT:    s_min_i32 s2, s2, s6
@@ -1291,9 +1291,9 @@ define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_brev_b32 s5, 1
 ; GFX8-NEXT:    s_min_i32 s7, s0, 0
-; GFX8-NEXT:    s_sub_i32 s7, s5, s7
 ; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    s_max_i32 s6, s0, 0
+; GFX8-NEXT:    s_sub_i32 s7, s5, s7
 ; GFX8-NEXT:    s_sub_i32 s6, s4, s6
 ; GFX8-NEXT:    s_max_i32 s2, s7, s2
 ; GFX8-NEXT:    s_min_i32 s2, s2, s6
@@ -1334,9 +1334,9 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v7, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, s5, v7
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v6, 0, v0
+; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, s5, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s4, v6
 ; GFX6-NEXT:    v_max_i32_e32 v3, v7, v3
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v6
@@ -1348,9 +1348,9 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_min_i32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_min_i32_e32 v4, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v4
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, 0, v2
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
 ; GFX6-NEXT:    v_max_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_min_i32_e32 v3, v4, v3
@@ -1362,9 +1362,9 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_brev_b32 s5, 1
 ; GFX8-NEXT:    v_min_i32_e32 v7, 0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s5, v7
 ; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_max_i32_e32 v6, 0, v0
+; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s5, v7
 ; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, s4, v6
 ; GFX8-NEXT:    v_max_i32_e32 v3, v7, v3
 ; GFX8-NEXT:    v_min_i32_e32 v3, v3, v6
@@ -1376,9 +1376,9 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v4, v6, v4
 ; GFX8-NEXT:    v_min_i32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_min_i32_e32 v4, 0, v2
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s5, v4
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_max_i32_e32 v3, 0, v2
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s5, v4
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
 ; GFX8-NEXT:    v_max_i32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_min_i32_e32 v3, v4, v3
@@ -1410,9 +1410,9 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_brev_b32 s7, 1
 ; GFX6-NEXT:    s_min_i32 s9, s0, 0
-; GFX6-NEXT:    s_sub_i32 s9, s7, s9
 ; GFX6-NEXT:    s_brev_b32 s6, -2
 ; GFX6-NEXT:    s_max_i32 s8, s0, 0
+; GFX6-NEXT:    s_sub_i32 s9, s7, s9
 ; GFX6-NEXT:    s_sub_i32 s8, s6, s8
 ; GFX6-NEXT:    s_max_i32 s3, s9, s3
 ; GFX6-NEXT:    s_min_i32 s3, s3, s8
@@ -1424,9 +1424,9 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ; GFX6-NEXT:    s_max_i32 s4, s8, s4
 ; GFX6-NEXT:    s_min_i32 s3, s4, s3
 ; GFX6-NEXT:    s_min_i32 s4, s2, 0
-; GFX6-NEXT:    s_sub_i32 s4, s7, s4
 ; GFX6-NEXT:    s_add_i32 s1, s1, s3
 ; GFX6-NEXT:    s_max_i32 s3, s2, 0
+; GFX6-NEXT:    s_sub_i32 s4, s7, s4
 ; GFX6-NEXT:    s_sub_i32 s3, s6, s3
 ; GFX6-NEXT:    s_max_i32 s4, s4, s5
 ; GFX6-NEXT:    s_min_i32 s3, s4, s3
@@ -1437,9 +1437,9 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_brev_b32 s7, 1
 ; GFX8-NEXT:    s_min_i32 s9, s0, 0
-; GFX8-NEXT:    s_sub_i32 s9, s7, s9
 ; GFX8-NEXT:    s_brev_b32 s6, -2
 ; GFX8-NEXT:    s_max_i32 s8, s0, 0
+; GFX8-NEXT:    s_sub_i32 s9, s7, s9
 ; GFX8-NEXT:    s_sub_i32 s8, s6, s8
 ; GFX8-NEXT:    s_max_i32 s3, s9, s3
 ; GFX8-NEXT:    s_min_i32 s3, s3, s8
@@ -1451,9 +1451,9 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ; GFX8-NEXT:    s_max_i32 s4, s8, s4
 ; GFX8-NEXT:    s_min_i32 s3, s4, s3
 ; GFX8-NEXT:    s_min_i32 s4, s2, 0
-; GFX8-NEXT:    s_sub_i32 s4, s7, s4
 ; GFX8-NEXT:    s_add_i32 s1, s1, s3
 ; GFX8-NEXT:    s_max_i32 s3, s2, 0
+; GFX8-NEXT:    s_sub_i32 s4, s7, s4
 ; GFX8-NEXT:    s_sub_i32 s3, s6, s3
 ; GFX8-NEXT:    s_max_i32 s4, s4, s5
 ; GFX8-NEXT:    s_min_i32 s3, s4, s3
@@ -1492,9 +1492,9 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v9, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, s5, v9
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
+; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, s5, v9
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s4, v8
 ; GFX6-NEXT:    v_max_i32_e32 v4, v9, v4
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
@@ -1506,16 +1506,16 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v5, v8, v5
 ; GFX6-NEXT:    v_min_i32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v2
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
 ; GFX6-NEXT:    v_max_i32_e32 v5, v5, v6
 ; GFX6-NEXT:    v_min_i32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v3
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v3
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX6-NEXT:    v_max_i32_e32 v5, v5, v7
 ; GFX6-NEXT:    v_min_i32_e32 v4, v5, v4
@@ -1527,9 +1527,9 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_brev_b32 s5, 1
 ; GFX8-NEXT:    v_min_i32_e32 v9, 0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v9, vcc, s5, v9
 ; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_max_i32_e32 v8, 0, v0
+; GFX8-NEXT:    v_sub_u32_e32 v9, vcc, s5, v9
 ; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, s4, v8
 ; GFX8-NEXT:    v_max_i32_e32 v4, v9, v4
 ; GFX8-NEXT:    v_min_i32_e32 v4, v4, v8
@@ -1541,16 +1541,16 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v5, v8, v5
 ; GFX8-NEXT:    v_min_i32_e32 v4, v5, v4
 ; GFX8-NEXT:    v_min_i32_e32 v5, 0, v2
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s5, v5
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
 ; GFX8-NEXT:    v_max_i32_e32 v4, 0, v2
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s5, v5
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s4, v4
 ; GFX8-NEXT:    v_max_i32_e32 v5, v5, v6
 ; GFX8-NEXT:    v_min_i32_e32 v4, v5, v4
 ; GFX8-NEXT:    v_min_i32_e32 v5, 0, v3
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 0x80000000, v5
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; GFX8-NEXT:    v_max_i32_e32 v4, 0, v3
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 0x80000000, v5
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX8-NEXT:    v_max_i32_e32 v5, v5, v7
 ; GFX8-NEXT:    v_min_i32_e32 v4, v5, v4
@@ -1584,9 +1584,9 @@ define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_brev_b32 s9, 1
 ; GFX6-NEXT:    s_min_i32 s11, s0, 0
-; GFX6-NEXT:    s_sub_i32 s11, s9, s11
 ; GFX6-NEXT:    s_brev_b32 s8, -2
 ; GFX6-NEXT:    s_max_i32 s10, s0, 0
+; GFX6-NEXT:    s_sub_i32 s11, s9, s11
 ; GFX6-NEXT:    s_sub_i32 s10, s8, s10
 ; GFX6-NEXT:    s_max_i32 s4, s11, s4
 ; GFX6-NEXT:    s_min_i32 s4, s4, s10
@@ -1598,16 +1598,16 @@ define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ; GFX6-NEXT:    s_max_i32 s5, s10, s5
 ; GFX6-NEXT:    s_min_i32 s4, s5, s4
 ; GFX6-NEXT:    s_min_i32 s5, s2, 0
-; GFX6-NEXT:    s_sub_i32 s5, s9, s5
 ; GFX6-NEXT:    s_add_i32 s1, s1, s4
 ; GFX6-NEXT:    s_max_i32 s4, s2, 0
+; GFX6-NEXT:    s_sub_i32 s5, s9, s5
 ; GFX6-NEXT:    s_sub_i32 s4, s8, s4
 ; GFX6-NEXT:    s_max_i32 s5, s5, s6
 ; GFX6-NEXT:    s_min_i32 s4, s5, s4
 ; GFX6-NEXT:    s_min_i32 s5, s3, 0
-; GFX6-NEXT:    s_sub_i32 s5, s9, s5
 ; GFX6-NEXT:    s_add_i32 s2, s2, s4
 ; GFX6-NEXT:    s_max_i32 s4, s3, 0
+; GFX6-NEXT:    s_sub_i32 s5, s9, s5
 ; GFX6-NEXT:    s_sub_i32 s4, s8, s4
 ; GFX6-NEXT:    s_max_i32 s5, s5, s7
 ; GFX6-NEXT:    s_min_i32 s4, s5, s4
@@ -1618,9 +1618,9 @@ define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_brev_b32 s9, 1
 ; GFX8-NEXT:    s_min_i32 s11, s0, 0
-; GFX8-NEXT:    s_sub_i32 s11, s9, s11
 ; GFX8-NEXT:    s_brev_b32 s8, -2
 ; GFX8-NEXT:    s_max_i32 s10, s0, 0
+; GFX8-NEXT:    s_sub_i32 s11, s9, s11
 ; GFX8-NEXT:    s_sub_i32 s10, s8, s10
 ; GFX8-NEXT:    s_max_i32 s4, s11, s4
 ; GFX8-NEXT:    s_min_i32 s4, s4, s10
@@ -1632,16 +1632,16 @@ define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ; GFX8-NEXT:    s_max_i32 s5, s10, s5
 ; GFX8-NEXT:    s_min_i32 s4, s5, s4
 ; GFX8-NEXT:    s_min_i32 s5, s2, 0
-; GFX8-NEXT:    s_sub_i32 s5, s9, s5
 ; GFX8-NEXT:    s_add_i32 s1, s1, s4
 ; GFX8-NEXT:    s_max_i32 s4, s2, 0
+; GFX8-NEXT:    s_sub_i32 s5, s9, s5
 ; GFX8-NEXT:    s_sub_i32 s4, s8, s4
 ; GFX8-NEXT:    s_max_i32 s5, s5, s6
 ; GFX8-NEXT:    s_min_i32 s4, s5, s4
 ; GFX8-NEXT:    s_min_i32 s5, s3, 0
-; GFX8-NEXT:    s_sub_i32 s5, s9, s5
 ; GFX8-NEXT:    s_add_i32 s2, s2, s4
 ; GFX8-NEXT:    s_max_i32 s4, s3, 0
+; GFX8-NEXT:    s_sub_i32 s5, s9, s5
 ; GFX8-NEXT:    s_sub_i32 s4, s8, s4
 ; GFX8-NEXT:    s_max_i32 s5, s5, s7
 ; GFX8-NEXT:    s_min_i32 s4, s5, s4
@@ -1685,9 +1685,9 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v12, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, s5, v12
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v10, 0, v0
+; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, s5, v12
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s4, v10
 ; GFX6-NEXT:    v_max_i32_e32 v5, v12, v5
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v10
@@ -1699,25 +1699,25 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v6, v10, v6
 ; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v2
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v5
 ; GFX6-NEXT:    v_max_i32_e32 v6, v6, v7
-; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX6-NEXT:    v_bfrev_b32_e32 v13, 1
+; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v3
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v13, v6
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_bfrev_b32_e32 v11, -2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v13, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v11, v5
 ; GFX6-NEXT:    v_max_i32_e32 v6, v6, v8
 ; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v4
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v13, v6
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v4
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v13, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v11, v5
 ; GFX6-NEXT:    v_max_i32_e32 v6, v6, v9
 ; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
@@ -1729,9 +1729,9 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_brev_b32 s5, 1
 ; GFX8-NEXT:    v_min_i32_e32 v12, 0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, s5, v12
 ; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_max_i32_e32 v10, 0, v0
+; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, s5, v12
 ; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, s4, v10
 ; GFX8-NEXT:    v_max_i32_e32 v5, v12, v5
 ; GFX8-NEXT:    v_min_i32_e32 v5, v5, v10
@@ -1743,25 +1743,25 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v6, v10, v6
 ; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX8-NEXT:    v_min_i32_e32 v6, 0, v2
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, s5, v6
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, 0, v2
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, s5, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT:    v_max_i32_e32 v6, v6, v7
-; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX8-NEXT:    v_bfrev_b32_e32 v13, 1
+; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX8-NEXT:    v_min_i32_e32 v6, 0, v3
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v13, v6
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_bfrev_b32_e32 v11, -2
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, 0, v3
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v13, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v11, v5
 ; GFX8-NEXT:    v_max_i32_e32 v6, v6, v8
 ; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX8-NEXT:    v_min_i32_e32 v6, 0, v4
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v13, v6
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, 0, v4
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v13, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v11, v5
 ; GFX8-NEXT:    v_max_i32_e32 v6, v6, v9
 ; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
@@ -1797,9 +1797,9 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_brev_b32 s11, 1
 ; GFX6-NEXT:    s_min_i32 s13, s0, 0
-; GFX6-NEXT:    s_sub_i32 s13, s11, s13
 ; GFX6-NEXT:    s_brev_b32 s10, -2
 ; GFX6-NEXT:    s_max_i32 s12, s0, 0
+; GFX6-NEXT:    s_sub_i32 s13, s11, s13
 ; GFX6-NEXT:    s_sub_i32 s12, s10, s12
 ; GFX6-NEXT:    s_max_i32 s5, s13, s5
 ; GFX6-NEXT:    s_min_i32 s5, s5, s12
@@ -1811,23 +1811,23 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ; GFX6-NEXT:    s_max_i32 s6, s12, s6
 ; GFX6-NEXT:    s_min_i32 s5, s6, s5
 ; GFX6-NEXT:    s_min_i32 s6, s2, 0
-; GFX6-NEXT:    s_sub_i32 s6, s11, s6
 ; GFX6-NEXT:    s_add_i32 s1, s1, s5
 ; GFX6-NEXT:    s_max_i32 s5, s2, 0
+; GFX6-NEXT:    s_sub_i32 s6, s11, s6
 ; GFX6-NEXT:    s_sub_i32 s5, s10, s5
 ; GFX6-NEXT:    s_max_i32 s6, s6, s7
 ; GFX6-NEXT:    s_min_i32 s5, s6, s5
 ; GFX6-NEXT:    s_min_i32 s6, s3, 0
-; GFX6-NEXT:    s_sub_i32 s6, s11, s6
 ; GFX6-NEXT:    s_add_i32 s2, s2, s5
 ; GFX6-NEXT:    s_max_i32 s5, s3, 0
+; GFX6-NEXT:    s_sub_i32 s6, s11, s6
 ; GFX6-NEXT:    s_sub_i32 s5, s10, s5
 ; GFX6-NEXT:    s_max_i32 s6, s6, s8
 ; GFX6-NEXT:    s_min_i32 s5, s6, s5
 ; GFX6-NEXT:    s_min_i32 s6, s4, 0
-; GFX6-NEXT:    s_sub_i32 s6, s11, s6
 ; GFX6-NEXT:    s_add_i32 s3, s3, s5
 ; GFX6-NEXT:    s_max_i32 s5, s4, 0
+; GFX6-NEXT:    s_sub_i32 s6, s11, s6
 ; GFX6-NEXT:    s_sub_i32 s5, s10, s5
 ; GFX6-NEXT:    s_max_i32 s6, s6, s9
 ; GFX6-NEXT:    s_min_i32 s5, s6, s5
@@ -1838,9 +1838,9 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_brev_b32 s11, 1
 ; GFX8-NEXT:    s_min_i32 s13, s0, 0
-; GFX8-NEXT:    s_sub_i32 s13, s11, s13
 ; GFX8-NEXT:    s_brev_b32 s10, -2
 ; GFX8-NEXT:    s_max_i32 s12, s0, 0
+; GFX8-NEXT:    s_sub_i32 s13, s11, s13
 ; GFX8-NEXT:    s_sub_i32 s12, s10, s12
 ; GFX8-NEXT:    s_max_i32 s5, s13, s5
 ; GFX8-NEXT:    s_min_i32 s5, s5, s12
@@ -1852,23 +1852,23 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ; GFX8-NEXT:    s_max_i32 s6, s12, s6
 ; GFX8-NEXT:    s_min_i32 s5, s6, s5
 ; GFX8-NEXT:    s_min_i32 s6, s2, 0
-; GFX8-NEXT:    s_sub_i32 s6, s11, s6
 ; GFX8-NEXT:    s_add_i32 s1, s1, s5
 ; GFX8-NEXT:    s_max_i32 s5, s2, 0
+; GFX8-NEXT:    s_sub_i32 s6, s11, s6
 ; GFX8-NEXT:    s_sub_i32 s5, s10, s5
 ; GFX8-NEXT:    s_max_i32 s6, s6, s7
 ; GFX8-NEXT:    s_min_i32 s5, s6, s5
 ; GFX8-NEXT:    s_min_i32 s6, s3, 0
-; GFX8-NEXT:    s_sub_i32 s6, s11, s6
 ; GFX8-NEXT:    s_add_i32 s2, s2, s5
 ; GFX8-NEXT:    s_max_i32 s5, s3, 0
+; GFX8-NEXT:    s_sub_i32 s6, s11, s6
 ; GFX8-NEXT:    s_sub_i32 s5, s10, s5
 ; GFX8-NEXT:    s_max_i32 s6, s6, s8
 ; GFX8-NEXT:    s_min_i32 s5, s6, s5
 ; GFX8-NEXT:    s_min_i32 s6, s4, 0
-; GFX8-NEXT:    s_sub_i32 s6, s11, s6
 ; GFX8-NEXT:    s_add_i32 s3, s3, s5
 ; GFX8-NEXT:    s_max_i32 s5, s4, 0
+; GFX8-NEXT:    s_sub_i32 s6, s11, s6
 ; GFX8-NEXT:    s_sub_i32 s5, s10, s5
 ; GFX8-NEXT:    s_max_i32 s6, s6, s9
 ; GFX8-NEXT:    s_min_i32 s5, s6, s5
@@ -2205,9 +2205,9 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_brev_b32 s33, 1
 ; GFX6-NEXT:    s_min_i32 s35, s0, 0
-; GFX6-NEXT:    s_sub_i32 s35, s33, s35
 ; GFX6-NEXT:    s_brev_b32 s32, -2
 ; GFX6-NEXT:    s_max_i32 s34, s0, 0
+; GFX6-NEXT:    s_sub_i32 s35, s33, s35
 ; GFX6-NEXT:    s_sub_i32 s34, s32, s34
 ; GFX6-NEXT:    s_max_i32 s16, s35, s16
 ; GFX6-NEXT:    s_min_i32 s16, s16, s34
@@ -2219,100 +2219,100 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 ; GFX6-NEXT:    s_max_i32 s17, s34, s17
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s2, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_add_i32 s1, s1, s16
 ; GFX6-NEXT:    s_max_i32 s16, s2, 0
+; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s18
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s3, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_add_i32 s2, s2, s16
 ; GFX6-NEXT:    s_max_i32 s16, s3, 0
+; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s19
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s4, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_add_i32 s3, s3, s16
 ; GFX6-NEXT:    s_max_i32 s16, s4, 0
+; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s20
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s5, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_add_i32 s4, s4, s16
 ; GFX6-NEXT:    s_max_i32 s16, s5, 0
+; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s21
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s6, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_add_i32 s5, s5, s16
 ; GFX6-NEXT:    s_max_i32 s16, s6, 0
+; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s22
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s7, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_add_i32 s6, s6, s16
 ; GFX6-NEXT:    s_max_i32 s16, s7, 0
+; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s23
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s8, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_add_i32 s7, s7, s16
 ; GFX6-NEXT:    s_max_i32 s16, s8, 0
+; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s24
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s9, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_add_i32 s8, s8, s16
 ; GFX6-NEXT:    s_max_i32 s16, s9, 0
+; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s25
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s10, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_add_i32 s9, s9, s16
 ; GFX6-NEXT:    s_max_i32 s16, s10, 0
+; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s26
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s11, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_add_i32 s10, s10, s16
 ; GFX6-NEXT:    s_max_i32 s16, s11, 0
+; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s27
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s12, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_add_i32 s11, s11, s16
 ; GFX6-NEXT:    s_max_i32 s16, s12, 0
+; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s28
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s13, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_add_i32 s12, s12, s16
 ; GFX6-NEXT:    s_max_i32 s16, s13, 0
+; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s29
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s14, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_add_i32 s13, s13, s16
 ; GFX6-NEXT:    s_max_i32 s16, s14, 0
+; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s30
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
 ; GFX6-NEXT:    s_min_i32 s17, s15, 0
-; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_add_i32 s14, s14, s16
 ; GFX6-NEXT:    s_max_i32 s16, s15, 0
+; GFX6-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX6-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX6-NEXT:    s_max_i32 s17, s17, s31
 ; GFX6-NEXT:    s_min_i32 s16, s17, s16
@@ -2323,9 +2323,9 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_brev_b32 s33, 1
 ; GFX8-NEXT:    s_min_i32 s35, s0, 0
-; GFX8-NEXT:    s_sub_i32 s35, s33, s35
 ; GFX8-NEXT:    s_brev_b32 s32, -2
 ; GFX8-NEXT:    s_max_i32 s34, s0, 0
+; GFX8-NEXT:    s_sub_i32 s35, s33, s35
 ; GFX8-NEXT:    s_sub_i32 s34, s32, s34
 ; GFX8-NEXT:    s_max_i32 s16, s35, s16
 ; GFX8-NEXT:    s_min_i32 s16, s16, s34
@@ -2337,100 +2337,100 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 ; GFX8-NEXT:    s_max_i32 s17, s34, s17
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s2, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_add_i32 s1, s1, s16
 ; GFX8-NEXT:    s_max_i32 s16, s2, 0
+; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s18
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s3, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_add_i32 s2, s2, s16
 ; GFX8-NEXT:    s_max_i32 s16, s3, 0
+; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s19
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s4, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_add_i32 s3, s3, s16
 ; GFX8-NEXT:    s_max_i32 s16, s4, 0
+; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s20
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s5, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_add_i32 s4, s4, s16
 ; GFX8-NEXT:    s_max_i32 s16, s5, 0
+; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s21
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s6, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_add_i32 s5, s5, s16
 ; GFX8-NEXT:    s_max_i32 s16, s6, 0
+; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s22
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s7, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_add_i32 s6, s6, s16
 ; GFX8-NEXT:    s_max_i32 s16, s7, 0
+; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s23
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s8, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_add_i32 s7, s7, s16
 ; GFX8-NEXT:    s_max_i32 s16, s8, 0
+; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s24
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s9, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_add_i32 s8, s8, s16
 ; GFX8-NEXT:    s_max_i32 s16, s9, 0
+; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s25
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s10, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_add_i32 s9, s9, s16
 ; GFX8-NEXT:    s_max_i32 s16, s10, 0
+; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s26
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s11, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_add_i32 s10, s10, s16
 ; GFX8-NEXT:    s_max_i32 s16, s11, 0
+; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s27
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s12, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_add_i32 s11, s11, s16
 ; GFX8-NEXT:    s_max_i32 s16, s12, 0
+; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s28
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s13, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_add_i32 s12, s12, s16
 ; GFX8-NEXT:    s_max_i32 s16, s13, 0
+; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s29
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s14, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_add_i32 s13, s13, s16
 ; GFX8-NEXT:    s_max_i32 s16, s14, 0
+; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s30
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
 ; GFX8-NEXT:    s_min_i32 s17, s15, 0
-; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_add_i32 s14, s14, s16
 ; GFX8-NEXT:    s_max_i32 s16, s15, 0
+; GFX8-NEXT:    s_sub_i32 s17, s33, s17
 ; GFX8-NEXT:    s_sub_i32 s16, s32, s16
 ; GFX8-NEXT:    s_max_i32 s17, s17, s31
 ; GFX8-NEXT:    s_min_i32 s16, s17, s16
@@ -2534,8 +2534,8 @@ define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
-; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
 ; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
@@ -2577,8 +2577,8 @@ define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_min_i32 s3, s0, 0
-; GFX6-NEXT:    s_max_i32 s2, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_max_i32 s2, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
 ; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
 ; GFX6-NEXT:    s_max_i32 s1, s3, s1
@@ -2596,8 +2596,8 @@ define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
 ; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_max_i32 s1, s2, s1
 ; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
+; GFX8-NEXT:    s_max_i32 s1, s2, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s4
 ; GFX8-NEXT:    s_min_i32 s1, s1, s2
@@ -2625,8 +2625,8 @@ define amdgpu_ps half @saddsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_min_i32 s2, s0, 0
-; GFX6-NEXT:    s_max_i32 s1, s0, 0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    s_max_i32 s1, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s2, 0x80000000, s2
 ; GFX6-NEXT:    s_sub_i32 s1, 0x7fffffff, s1
 ; GFX6-NEXT:    v_max_i32_e32 v0, s2, v0
@@ -2667,8 +2667,8 @@ define amdgpu_ps half @saddsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_min_i32_e32 v2, 0, v0
-; GFX6-NEXT:    v_max_i32_e32 v1, 0, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX6-NEXT:    v_max_i32_e32 v1, 0, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x80000000, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 0x7fffffff, v1
 ; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
@@ -2710,9 +2710,9 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
 ; GFX6-NEXT:    v_max_i32_e32 v2, v5, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -2735,13 +2735,13 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s5, 0x8000
 ; GFX8-NEXT:    v_min_i16_e32 v4, 0, v0
-; GFX8-NEXT:    v_sub_u16_e32 v4, s5, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v3, 0, v0
-; GFX8-NEXT:    v_min_i16_e32 v5, 0, v2
+; GFX8-NEXT:    v_sub_u16_e32 v4, s5, v4
 ; GFX8-NEXT:    v_sub_u16_e32 v3, s4, v3
 ; GFX8-NEXT:    v_max_i16_e32 v4, v4, v1
+; GFX8-NEXT:    v_min_i16_e32 v5, 0, v2
 ; GFX8-NEXT:    v_min_i16_e32 v3, v4, v3
 ; GFX8-NEXT:    v_max_i16_e32 v4, 0, v2
 ; GFX8-NEXT:    v_sub_u16_e32 v5, s5, v5
@@ -2776,9 +2776,9 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    s_min_i32 s7, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_sub_i32 s7, s5, s7
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    s_max_i32 s6, s0, 0
+; GFX6-NEXT:    s_sub_i32 s7, s5, s7
 ; GFX6-NEXT:    s_sub_i32 s6, s4, s6
 ; GFX6-NEXT:    s_max_i32 s2, s7, s2
 ; GFX6-NEXT:    s_min_i32 s2, s2, s6
@@ -2805,16 +2805,16 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s7, 0
-; GFX8-NEXT:    s_max_i32 s8, s6, s7
 ; GFX8-NEXT:    s_movk_i32 s5, 0x8000
+; GFX8-NEXT:    s_max_i32 s8, s6, s7
 ; GFX8-NEXT:    s_min_i32 s6, s6, s7
 ; GFX8-NEXT:    s_sub_i32 s6, s5, s6
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_max_i32 s1, s6, s1
 ; GFX8-NEXT:    s_sub_i32 s8, s4, s8
+; GFX8-NEXT:    s_max_i32 s1, s6, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s8
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
@@ -2826,8 +2826,8 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX8-NEXT:    s_sub_i32 s1, s5, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_max_i32 s1, s1, s3
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s6
+; GFX8-NEXT:    s_max_i32 s1, s1, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s4
 ; GFX8-NEXT:    s_min_i32 s1, s1, s3
@@ -2862,9 +2862,9 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    s_brev_b32 s3, 1
 ; GFX6-NEXT:    s_min_i32 s5, s0, 0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    s_sub_i32 s5, s3, s5
 ; GFX6-NEXT:    s_brev_b32 s2, -2
 ; GFX6-NEXT:    s_max_i32 s4, s0, 0
+; GFX6-NEXT:    s_sub_i32 s5, s3, s5
 ; GFX6-NEXT:    s_sub_i32 s4, s2, s4
 ; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
 ; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
@@ -2891,22 +2891,22 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s5, 0
-; GFX8-NEXT:    s_max_i32 s6, s4, s5
 ; GFX8-NEXT:    s_movk_i32 s3, 0x8000
+; GFX8-NEXT:    s_max_i32 s6, s4, s5
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
 ; GFX8-NEXT:    s_sub_i32 s4, s3, s4
-; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    v_max_i16_e32 v1, s4, v0
 ; GFX8-NEXT:    s_sub_i32 s6, s2, s6
+; GFX8-NEXT:    v_max_i16_e32 v1, s4, v0
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s1
 ; GFX8-NEXT:    v_min_i16_e32 v1, s6, v1
 ; GFX8-NEXT:    s_max_i32 s6, s4, s5
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s3
-; GFX8-NEXT:    v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    s_sub_i32 s2, s2, s6
+; GFX8-NEXT:    v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v0, s2, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_add_u16_e32 v1, s0, v1
@@ -2935,20 +2935,20 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-NEXT:    s_brev_b32 s3, 1
 ; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s3, v3
 ; GFX6-NEXT:    s_brev_b32 s2, -2
 ; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
-; GFX6-NEXT:    v_max_i32_e32 v3, s0, v3
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s3, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
+; GFX6-NEXT:    v_max_i32_e32 v3, s0, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_i32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_min_i32_e32 v3, 0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_max_i32_e32 v2, 0, v1
 ; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
+; GFX6-NEXT:    v_max_i32_e32 v2, 0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s3, v3
-; GFX6-NEXT:    v_max_i32_e32 v3, s0, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
+; GFX6-NEXT:    v_max_i32_e32 v3, s0, v3
 ; GFX6-NEXT:    v_min_i32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
@@ -2964,16 +2964,16 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_movk_i32 s3, 0x8000
 ; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
-; GFX8-NEXT:    v_sub_u16_e32 v3, s3, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
-; GFX8-NEXT:    v_min_i16_e32 v4, 0, v1
+; GFX8-NEXT:    v_sub_u16_e32 v3, s3, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v2, s2, v2
 ; GFX8-NEXT:    v_max_i16_e32 v3, s0, v3
+; GFX8-NEXT:    v_min_i16_e32 v4, 0, v1
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    v_min_i16_e32 v2, v3, v2
 ; GFX8-NEXT:    v_max_i16_e32 v3, 0, v1
-; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    v_sub_u16_e32 v4, s3, v4
 ; GFX8-NEXT:    v_sub_u16_e32 v3, s2, v3
 ; GFX8-NEXT:    v_max_i16_e32 v4, s1, v4
@@ -3016,9 +3016,9 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s4, v8
 ; GFX6-NEXT:    v_max_i32_e32 v4, v10, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -3035,23 +3035,23 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
 ; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v2
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
 ; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
+; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v11, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
 ; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
-; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
+; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
@@ -3072,26 +3072,26 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s5, 0x8000
 ; GFX8-NEXT:    v_min_i16_e32 v7, 0, v0
-; GFX8-NEXT:    v_sub_u16_e32 v7, s5, v7
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v6, 0, v0
-; GFX8-NEXT:    v_min_i16_e32 v8, 0, v4
+; GFX8-NEXT:    v_sub_u16_e32 v7, s5, v7
 ; GFX8-NEXT:    v_sub_u16_e32 v6, s4, v6
 ; GFX8-NEXT:    v_max_i16_e32 v7, v7, v2
+; GFX8-NEXT:    v_min_i16_e32 v8, 0, v4
 ; GFX8-NEXT:    v_min_i16_e32 v6, v7, v6
 ; GFX8-NEXT:    v_max_i16_e32 v7, 0, v4
 ; GFX8-NEXT:    v_sub_u16_e32 v8, s5, v8
+; GFX8-NEXT:    v_sub_u16_e32 v7, s4, v7
 ; GFX8-NEXT:    v_max_i16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v8, 0, v1
-; GFX8-NEXT:    v_sub_u16_e32 v7, s4, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX8-NEXT:    v_min_i16_e32 v2, v2, v7
 ; GFX8-NEXT:    v_max_i16_e32 v7, 0, v1
 ; GFX8-NEXT:    v_sub_u16_e32 v8, s5, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_min_i16_e32 v9, 0, v5
 ; GFX8-NEXT:    v_sub_u16_e32 v7, s4, v7
 ; GFX8-NEXT:    v_max_i16_e32 v8, v8, v3
+; GFX8-NEXT:    v_min_i16_e32 v9, 0, v5
 ; GFX8-NEXT:    v_min_i16_e32 v7, v8, v7
 ; GFX8-NEXT:    v_max_i16_e32 v8, 0, v5
 ; GFX8-NEXT:    v_sub_u16_e32 v9, s5, v9
@@ -3132,9 +3132,9 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-NEXT:    s_brev_b32 s9, 1
 ; GFX6-NEXT:    s_min_i32 s11, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX6-NEXT:    s_sub_i32 s11, s9, s11
 ; GFX6-NEXT:    s_brev_b32 s8, -2
 ; GFX6-NEXT:    s_max_i32 s10, s0, 0
+; GFX6-NEXT:    s_sub_i32 s11, s9, s11
 ; GFX6-NEXT:    s_sub_i32 s10, s8, s10
 ; GFX6-NEXT:    s_max_i32 s4, s11, s4
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
@@ -3159,14 +3159,14 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-NEXT:    s_min_i32 s4, s4, s5
 ; GFX6-NEXT:    s_min_i32 s6, s3, 0
 ; GFX6-NEXT:    s_add_i32 s2, s2, s4
-; GFX6-NEXT:    s_max_i32 s5, s3, 0
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
+; GFX6-NEXT:    s_max_i32 s5, s3, 0
 ; GFX6-NEXT:    s_sub_i32 s6, s9, s6
 ; GFX6-NEXT:    s_sub_i32 s5, s8, s5
 ; GFX6-NEXT:    s_max_i32 s4, s6, s4
 ; GFX6-NEXT:    s_min_i32 s4, s4, s5
-; GFX6-NEXT:    s_add_i32 s3, s3, s4
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX6-NEXT:    s_add_i32 s3, s3, s4
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    s_and_b32 s1, s1, s4
@@ -3185,16 +3185,16 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sext_i32_i16 s10, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s11, 0
-; GFX8-NEXT:    s_max_i32 s12, s10, s11
 ; GFX8-NEXT:    s_movk_i32 s9, 0x8000
+; GFX8-NEXT:    s_max_i32 s12, s10, s11
 ; GFX8-NEXT:    s_min_i32 s10, s10, s11
 ; GFX8-NEXT:    s_sub_i32 s10, s9, s10
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX8-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX8-NEXT:    s_sext_i32_i16 s10, s10
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_max_i32 s2, s10, s2
 ; GFX8-NEXT:    s_sub_i32 s12, s8, s12
+; GFX8-NEXT:    s_max_i32 s2, s10, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s10, s12
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
@@ -3206,8 +3206,8 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s2, s9, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_max_i32 s2, s2, s6
 ; GFX8-NEXT:    s_sub_i32 s10, s8, s10
+; GFX8-NEXT:    s_max_i32 s2, s2, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s10
 ; GFX8-NEXT:    s_min_i32 s2, s2, s6
@@ -3219,10 +3219,10 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_max_i32 s2, s2, s3
 ; GFX8-NEXT:    s_sub_i32 s6, s8, s6
-; GFX8-NEXT:    s_sext_i32_i16 s3, s6
+; GFX8-NEXT:    s_max_i32 s2, s2, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
+; GFX8-NEXT:    s_sext_i32_i16 s3, s6
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
 ; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_add_i32 s1, s1, s2
@@ -3289,9 +3289,9 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v14, 0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, s5, v14
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v12, 0, v0
+; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, s5, v14
 ; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, s4, v12
 ; GFX6-NEXT:    v_max_i32_e32 v6, v14, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -3308,18 +3308,18 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
 ; GFX6-NEXT:    v_min_i32_e32 v8, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s5, v8
 ; GFX6-NEXT:    v_bfrev_b32_e32 v13, -2
 ; GFX6-NEXT:    v_max_i32_e32 v7, 0, v2
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s5, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
 ; GFX6-NEXT:    v_max_i32_e32 v6, v8, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
 ; GFX6-NEXT:    v_bfrev_b32_e32 v15, 1
+; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
 ; GFX6-NEXT:    v_min_i32_e32 v8, 0, v3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; GFX6-NEXT:    v_max_i32_e32 v7, 0, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
+; GFX6-NEXT:    v_max_i32_e32 v7, 0, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v15, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
 ; GFX6-NEXT:    v_max_i32_e32 v6, v8, v6
@@ -3327,8 +3327,8 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
 ; GFX6-NEXT:    v_min_i32_e32 v8, 0, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GFX6-NEXT:    v_max_i32_e32 v7, 0, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
+; GFX6-NEXT:    v_max_i32_e32 v7, 0, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v15, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
 ; GFX6-NEXT:    v_max_i32_e32 v6, v8, v6
@@ -3336,28 +3336,28 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
 ; GFX6-NEXT:    v_min_i32_e32 v8, 0, v5
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GFX6-NEXT:    v_max_i32_e32 v7, 0, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX6-NEXT:    v_max_i32_e32 v7, 0, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v15, v8
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
 ; GFX6-NEXT:    v_max_i32_e32 v6, v8, v6
+; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, s4, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -3369,37 +3369,37 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s5, 0x8000
 ; GFX8-NEXT:    v_min_i16_e32 v11, 0, v0
-; GFX8-NEXT:    v_sub_u16_e32 v11, s5, v11
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v9, 0, v0
-; GFX8-NEXT:    v_min_i16_e32 v13, 0, v6
+; GFX8-NEXT:    v_sub_u16_e32 v11, s5, v11
 ; GFX8-NEXT:    v_sub_u16_e32 v9, s4, v9
 ; GFX8-NEXT:    v_max_i16_e32 v11, v11, v3
+; GFX8-NEXT:    v_min_i16_e32 v13, 0, v6
 ; GFX8-NEXT:    v_min_i16_e32 v9, v11, v9
 ; GFX8-NEXT:    v_max_i16_e32 v11, 0, v6
 ; GFX8-NEXT:    v_sub_u16_e32 v13, s5, v13
+; GFX8-NEXT:    v_sub_u16_e32 v11, s4, v11
 ; GFX8-NEXT:    v_max_i16_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v13, 0, v1
-; GFX8-NEXT:    v_sub_u16_e32 v11, s4, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX8-NEXT:    v_min_i16_e32 v3, v3, v11
 ; GFX8-NEXT:    v_max_i16_e32 v11, 0, v1
 ; GFX8-NEXT:    v_sub_u16_e32 v13, s5, v13
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX8-NEXT:    v_min_i16_e32 v14, 0, v7
 ; GFX8-NEXT:    v_sub_u16_e32 v11, s4, v11
 ; GFX8-NEXT:    v_max_i16_e32 v13, v13, v4
+; GFX8-NEXT:    v_min_i16_e32 v14, 0, v7
 ; GFX8-NEXT:    v_min_i16_e32 v11, v13, v11
 ; GFX8-NEXT:    v_max_i16_e32 v13, 0, v7
 ; GFX8-NEXT:    v_sub_u16_e32 v14, s5, v14
-; GFX8-NEXT:    v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_sub_u16_e32 v13, s4, v13
 ; GFX8-NEXT:    v_mov_b32_e32 v12, 0xffff8000
+; GFX8-NEXT:    v_sub_u16_e32 v13, s4, v13
+; GFX8-NEXT:    v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v14, 0, v2
-; GFX8-NEXT:    v_sub_u16_e32 v14, v12, v14
-; GFX8-NEXT:    v_min_i16_e32 v4, v4, v13
 ; GFX8-NEXT:    v_mov_b32_e32 v10, 0x7fff
+; GFX8-NEXT:    v_min_i16_e32 v4, v4, v13
 ; GFX8-NEXT:    v_max_i16_e32 v13, 0, v2
+; GFX8-NEXT:    v_sub_u16_e32 v14, v12, v14
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX8-NEXT:    v_sub_u16_e32 v13, v10, v13
 ; GFX8-NEXT:    v_max_i16_e32 v14, v14, v5
@@ -3411,8 +3411,8 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8-NEXT:    v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v9
 ; GFX8-NEXT:    v_add_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX8-NEXT:    v_min_i16_e32 v5, v5, v10
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX8-NEXT:    v_add_u16_e32 v1, v1, v11
 ; GFX8-NEXT:    v_add_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -3449,9 +3449,9 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_brev_b32 s13, 1
 ; GFX6-NEXT:    s_min_i32 s15, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_sub_i32 s15, s13, s15
 ; GFX6-NEXT:    s_brev_b32 s12, -2
 ; GFX6-NEXT:    s_max_i32 s14, s0, 0
+; GFX6-NEXT:    s_sub_i32 s15, s13, s15
 ; GFX6-NEXT:    s_sub_i32 s14, s12, s14
 ; GFX6-NEXT:    s_max_i32 s6, s15, s6
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
@@ -3476,8 +3476,8 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_min_i32 s6, s6, s7
 ; GFX6-NEXT:    s_min_i32 s8, s3, 0
 ; GFX6-NEXT:    s_add_i32 s2, s2, s6
-; GFX6-NEXT:    s_max_i32 s7, s3, 0
 ; GFX6-NEXT:    s_lshl_b32 s6, s9, 16
+; GFX6-NEXT:    s_max_i32 s7, s3, 0
 ; GFX6-NEXT:    s_sub_i32 s8, s13, s8
 ; GFX6-NEXT:    s_sub_i32 s7, s12, s7
 ; GFX6-NEXT:    s_max_i32 s6, s8, s6
@@ -3485,8 +3485,8 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_min_i32 s6, s6, s7
 ; GFX6-NEXT:    s_min_i32 s8, s4, 0
 ; GFX6-NEXT:    s_add_i32 s3, s3, s6
-; GFX6-NEXT:    s_max_i32 s7, s4, 0
 ; GFX6-NEXT:    s_lshl_b32 s6, s10, 16
+; GFX6-NEXT:    s_max_i32 s7, s4, 0
 ; GFX6-NEXT:    s_sub_i32 s8, s13, s8
 ; GFX6-NEXT:    s_sub_i32 s7, s12, s7
 ; GFX6-NEXT:    s_max_i32 s6, s8, s6
@@ -3494,14 +3494,14 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_min_i32 s6, s6, s7
 ; GFX6-NEXT:    s_min_i32 s8, s5, 0
 ; GFX6-NEXT:    s_add_i32 s4, s4, s6
-; GFX6-NEXT:    s_max_i32 s7, s5, 0
 ; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
+; GFX6-NEXT:    s_max_i32 s7, s5, 0
 ; GFX6-NEXT:    s_sub_i32 s8, s13, s8
 ; GFX6-NEXT:    s_sub_i32 s7, s12, s7
 ; GFX6-NEXT:    s_max_i32 s6, s8, s6
 ; GFX6-NEXT:    s_min_i32 s6, s6, s7
-; GFX6-NEXT:    s_add_i32 s5, s5, s6
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX6-NEXT:    s_add_i32 s5, s5, s6
 ; GFX6-NEXT:    s_mov_b32 s6, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    s_and_b32 s1, s1, s6
@@ -3509,13 +3509,13 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, s6
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, s6
 ; GFX6-NEXT:    s_and_b32 s2, s3, s6
-; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, s6
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_and_b32 s3, s5, s6
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, s6
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
@@ -3526,16 +3526,16 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sext_i32_i16 s14, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s15, 0
-; GFX8-NEXT:    s_max_i32 s16, s14, s15
 ; GFX8-NEXT:    s_movk_i32 s13, 0x8000
+; GFX8-NEXT:    s_max_i32 s16, s14, s15
 ; GFX8-NEXT:    s_min_i32 s14, s14, s15
 ; GFX8-NEXT:    s_sub_i32 s14, s13, s14
 ; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
 ; GFX8-NEXT:    s_movk_i32 s12, 0x7fff
 ; GFX8-NEXT:    s_sext_i32_i16 s14, s14
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_max_i32 s3, s14, s3
 ; GFX8-NEXT:    s_sub_i32 s16, s12, s16
+; GFX8-NEXT:    s_max_i32 s3, s14, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s14, s16
 ; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
@@ -3547,8 +3547,8 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s3, s13, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
-; GFX8-NEXT:    s_max_i32 s3, s3, s9
 ; GFX8-NEXT:    s_sub_i32 s14, s12, s14
+; GFX8-NEXT:    s_max_i32 s3, s3, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s14
 ; GFX8-NEXT:    s_min_i32 s3, s3, s9
@@ -3560,10 +3560,10 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_max_i32 s3, s3, s4
 ; GFX8-NEXT:    s_sub_i32 s9, s12, s9
-; GFX8-NEXT:    s_sext_i32_i16 s4, s9
+; GFX8-NEXT:    s_max_i32 s3, s3, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
+; GFX8-NEXT:    s_sext_i32_i16 s4, s9
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX8-NEXT:    s_min_i32 s3, s3, s4
 ; GFX8-NEXT:    s_add_i32 s1, s1, s3
@@ -3586,8 +3586,8 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_max_i32 s3, s3, s5
 ; GFX8-NEXT:    s_sub_i32 s4, s12, s4
+; GFX8-NEXT:    s_max_i32 s3, s3, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
@@ -3654,9 +3654,9 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v18, 0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, s5, v18
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v16, 0, v0
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, s5, v18
 ; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, s4, v16
 ; GFX6-NEXT:    v_max_i32_e32 v8, v18, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -3673,18 +3673,18 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
 ; GFX6-NEXT:    v_bfrev_b32_e32 v17, -2
 ; GFX6-NEXT:    v_max_i32_e32 v9, 0, v2
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
 ; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
 ; GFX6-NEXT:    v_bfrev_b32_e32 v19, 1
+; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; GFX6-NEXT:    v_max_i32_e32 v9, 0, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX6-NEXT:    v_max_i32_e32 v9, 0, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
 ; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
@@ -3692,8 +3692,8 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
-; GFX6-NEXT:    v_max_i32_e32 v9, 0, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX6-NEXT:    v_max_i32_e32 v9, 0, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
 ; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
@@ -3701,8 +3701,8 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v5
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GFX6-NEXT:    v_max_i32_e32 v9, 0, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
+; GFX6-NEXT:    v_max_i32_e32 v9, 0, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
 ; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
@@ -3710,43 +3710,43 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v6
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GFX6-NEXT:    v_max_i32_e32 v9, 0, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
+; GFX6-NEXT:    v_max_i32_e32 v9, 0, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
 ; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v7
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GFX6-NEXT:    v_max_i32_e32 v9, 0, v7
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
+; GFX6-NEXT:    v_max_i32_e32 v9, 0, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
+; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
 ; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
-; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
+; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
+; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
 ; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 16, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
+; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 16, v6
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 16, v6
+; GFX6-NEXT:    v_and_b32_e32 v4, s4, v7
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_and_b32_e32 v3, s4, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -3758,50 +3758,50 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s5, 0x8000
 ; GFX8-NEXT:    v_min_i16_e32 v14, 0, v0
-; GFX8-NEXT:    v_sub_u16_e32 v14, s5, v14
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v12, 0, v0
-; GFX8-NEXT:    v_min_i16_e32 v16, 0, v8
+; GFX8-NEXT:    v_sub_u16_e32 v14, s5, v14
 ; GFX8-NEXT:    v_sub_u16_e32 v12, s4, v12
 ; GFX8-NEXT:    v_max_i16_e32 v14, v14, v4
+; GFX8-NEXT:    v_min_i16_e32 v16, 0, v8
 ; GFX8-NEXT:    v_min_i16_e32 v12, v14, v12
 ; GFX8-NEXT:    v_max_i16_e32 v14, 0, v8
 ; GFX8-NEXT:    v_sub_u16_e32 v16, s5, v16
+; GFX8-NEXT:    v_sub_u16_e32 v14, s4, v14
 ; GFX8-NEXT:    v_max_i16_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v16, 0, v1
-; GFX8-NEXT:    v_sub_u16_e32 v14, s4, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX8-NEXT:    v_min_i16_e32 v4, v4, v14
 ; GFX8-NEXT:    v_max_i16_e32 v14, 0, v1
 ; GFX8-NEXT:    v_sub_u16_e32 v16, s5, v16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX8-NEXT:    v_min_i16_e32 v17, 0, v9
 ; GFX8-NEXT:    v_sub_u16_e32 v14, s4, v14
 ; GFX8-NEXT:    v_max_i16_e32 v16, v16, v5
+; GFX8-NEXT:    v_min_i16_e32 v17, 0, v9
 ; GFX8-NEXT:    v_min_i16_e32 v14, v16, v14
 ; GFX8-NEXT:    v_max_i16_e32 v16, 0, v9
 ; GFX8-NEXT:    v_sub_u16_e32 v17, s5, v17
-; GFX8-NEXT:    v_max_i16_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_sub_u16_e32 v16, s4, v16
 ; GFX8-NEXT:    v_mov_b32_e32 v15, 0xffff8000
+; GFX8-NEXT:    v_sub_u16_e32 v16, s4, v16
+; GFX8-NEXT:    v_max_i16_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v17, 0, v2
-; GFX8-NEXT:    v_sub_u16_e32 v17, v15, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX8-NEXT:    v_min_i16_e32 v5, v5, v16
 ; GFX8-NEXT:    v_mov_b32_e32 v13, 0x7fff
+; GFX8-NEXT:    v_min_i16_e32 v5, v5, v16
 ; GFX8-NEXT:    v_max_i16_e32 v16, 0, v2
-; GFX8-NEXT:    v_min_i16_e32 v18, 0, v10
+; GFX8-NEXT:    v_sub_u16_e32 v17, v15, v17
 ; GFX8-NEXT:    v_sub_u16_e32 v16, v13, v16
 ; GFX8-NEXT:    v_max_i16_e32 v17, v17, v6
+; GFX8-NEXT:    v_min_i16_e32 v18, 0, v10
 ; GFX8-NEXT:    v_min_i16_e32 v16, v17, v16
 ; GFX8-NEXT:    v_max_i16_e32 v17, 0, v10
 ; GFX8-NEXT:    v_sub_u16_e32 v18, v15, v18
+; GFX8-NEXT:    v_sub_u16_e32 v17, v13, v17
 ; GFX8-NEXT:    v_max_i16_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v18, 0, v3
-; GFX8-NEXT:    v_sub_u16_e32 v17, v13, v17
-; GFX8-NEXT:    v_sub_u16_e32 v18, v15, v18
 ; GFX8-NEXT:    v_min_i16_e32 v6, v6, v17
 ; GFX8-NEXT:    v_max_i16_e32 v17, 0, v3
+; GFX8-NEXT:    v_sub_u16_e32 v18, v15, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v17, v13, v17
 ; GFX8-NEXT:    v_max_i16_e32 v18, v18, v7
@@ -3816,8 +3816,8 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_add_u16_e32 v1, v1, v14
 ; GFX8-NEXT:    v_add_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    v_min_i16_e32 v7, v7, v13
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    v_add_u16_e32 v2, v2, v16
 ; GFX8-NEXT:    v_add_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
@@ -3856,9 +3856,9 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_brev_b32 s17, 1
 ; GFX6-NEXT:    s_min_i32 s19, s0, 0
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX6-NEXT:    s_sub_i32 s19, s17, s19
 ; GFX6-NEXT:    s_brev_b32 s16, -2
 ; GFX6-NEXT:    s_max_i32 s18, s0, 0
+; GFX6-NEXT:    s_sub_i32 s19, s17, s19
 ; GFX6-NEXT:    s_sub_i32 s18, s16, s18
 ; GFX6-NEXT:    s_max_i32 s8, s19, s8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
@@ -3883,8 +3883,8 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_min_i32 s8, s8, s9
 ; GFX6-NEXT:    s_min_i32 s10, s3, 0
 ; GFX6-NEXT:    s_add_i32 s2, s2, s8
-; GFX6-NEXT:    s_max_i32 s9, s3, 0
 ; GFX6-NEXT:    s_lshl_b32 s8, s11, 16
+; GFX6-NEXT:    s_max_i32 s9, s3, 0
 ; GFX6-NEXT:    s_sub_i32 s10, s17, s10
 ; GFX6-NEXT:    s_sub_i32 s9, s16, s9
 ; GFX6-NEXT:    s_max_i32 s8, s10, s8
@@ -3892,8 +3892,8 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_min_i32 s8, s8, s9
 ; GFX6-NEXT:    s_min_i32 s10, s4, 0
 ; GFX6-NEXT:    s_add_i32 s3, s3, s8
-; GFX6-NEXT:    s_max_i32 s9, s4, 0
 ; GFX6-NEXT:    s_lshl_b32 s8, s12, 16
+; GFX6-NEXT:    s_max_i32 s9, s4, 0
 ; GFX6-NEXT:    s_sub_i32 s10, s17, s10
 ; GFX6-NEXT:    s_sub_i32 s9, s16, s9
 ; GFX6-NEXT:    s_max_i32 s8, s10, s8
@@ -3901,8 +3901,8 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_min_i32 s8, s8, s9
 ; GFX6-NEXT:    s_min_i32 s10, s5, 0
 ; GFX6-NEXT:    s_add_i32 s4, s4, s8
-; GFX6-NEXT:    s_max_i32 s9, s5, 0
 ; GFX6-NEXT:    s_lshl_b32 s8, s13, 16
+; GFX6-NEXT:    s_max_i32 s9, s5, 0
 ; GFX6-NEXT:    s_sub_i32 s10, s17, s10
 ; GFX6-NEXT:    s_sub_i32 s9, s16, s9
 ; GFX6-NEXT:    s_max_i32 s8, s10, s8
@@ -3910,8 +3910,8 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_min_i32 s8, s8, s9
 ; GFX6-NEXT:    s_min_i32 s10, s6, 0
 ; GFX6-NEXT:    s_add_i32 s5, s5, s8
-; GFX6-NEXT:    s_max_i32 s9, s6, 0
 ; GFX6-NEXT:    s_lshl_b32 s8, s14, 16
+; GFX6-NEXT:    s_max_i32 s9, s6, 0
 ; GFX6-NEXT:    s_sub_i32 s10, s17, s10
 ; GFX6-NEXT:    s_sub_i32 s9, s16, s9
 ; GFX6-NEXT:    s_max_i32 s8, s10, s8
@@ -3919,14 +3919,14 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_min_i32 s8, s8, s9
 ; GFX6-NEXT:    s_min_i32 s10, s7, 0
 ; GFX6-NEXT:    s_add_i32 s6, s6, s8
-; GFX6-NEXT:    s_max_i32 s9, s7, 0
 ; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
+; GFX6-NEXT:    s_max_i32 s9, s7, 0
 ; GFX6-NEXT:    s_sub_i32 s10, s17, s10
 ; GFX6-NEXT:    s_sub_i32 s9, s16, s9
 ; GFX6-NEXT:    s_max_i32 s8, s10, s8
 ; GFX6-NEXT:    s_min_i32 s8, s8, s9
-; GFX6-NEXT:    s_add_i32 s7, s7, s8
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX6-NEXT:    s_add_i32 s7, s7, s8
 ; GFX6-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    s_and_b32 s1, s1, s8
@@ -3934,19 +3934,19 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, s8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, s8
 ; GFX6-NEXT:    s_and_b32 s2, s3, s8
-; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, s8
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    s_ashr_i32 s7, s7, 16
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_and_b32 s3, s5, s8
+; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, s8
-; GFX6-NEXT:    s_and_b32 s4, s7, s8
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
+; GFX6-NEXT:    s_and_b32 s4, s7, s8
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_and_b32 s3, s6, s8
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
@@ -3957,16 +3957,16 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sext_i32_i16 s18, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s19, 0
-; GFX8-NEXT:    s_max_i32 s20, s18, s19
 ; GFX8-NEXT:    s_movk_i32 s17, 0x8000
+; GFX8-NEXT:    s_max_i32 s20, s18, s19
 ; GFX8-NEXT:    s_min_i32 s18, s18, s19
 ; GFX8-NEXT:    s_sub_i32 s18, s17, s18
 ; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
 ; GFX8-NEXT:    s_movk_i32 s16, 0x7fff
 ; GFX8-NEXT:    s_sext_i32_i16 s18, s18
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_max_i32 s4, s18, s4
 ; GFX8-NEXT:    s_sub_i32 s20, s16, s20
+; GFX8-NEXT:    s_max_i32 s4, s18, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s18, s20
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
@@ -3978,8 +3978,8 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s4, s17, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s12
-; GFX8-NEXT:    s_max_i32 s4, s4, s12
 ; GFX8-NEXT:    s_sub_i32 s18, s16, s18
+; GFX8-NEXT:    s_max_i32 s4, s4, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s18
 ; GFX8-NEXT:    s_min_i32 s4, s4, s12
@@ -3991,10 +3991,10 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_max_i32 s4, s4, s5
 ; GFX8-NEXT:    s_sub_i32 s12, s16, s12
-; GFX8-NEXT:    s_sext_i32_i16 s5, s12
+; GFX8-NEXT:    s_max_i32 s4, s4, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
+; GFX8-NEXT:    s_sext_i32_i16 s5, s12
 ; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_add_i32 s1, s1, s4
@@ -4017,8 +4017,8 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_max_i32 s4, s4, s6
 ; GFX8-NEXT:    s_sub_i32 s5, s16, s5
+; GFX8-NEXT:    s_max_i32 s4, s4, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
@@ -4030,8 +4030,8 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s4, s17, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s14
-; GFX8-NEXT:    s_max_i32 s4, s4, s6
 ; GFX8-NEXT:    s_sub_i32 s5, s16, s5
+; GFX8-NEXT:    s_max_i32 s4, s4, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
@@ -4052,8 +4052,8 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s11
 ; GFX8-NEXT:    s_max_i32 s5, s4, s19
 ; GFX8-NEXT:    s_min_i32 s4, s4, s19
-; GFX8-NEXT:    s_sub_i32 s4, s17, s4
 ; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
+; GFX8-NEXT:    s_sub_i32 s4, s17, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s15
 ; GFX8-NEXT:    s_sub_i32 s5, s16, s5
@@ -4462,8 +4462,8 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
 ; GFX6-NEXT:    v_bfrev_b32_e32 v10, 1
 ; GFX6-NEXT:    v_add_i32_e64 v1, s[6:7], 0, v0
-; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX6-NEXT:    v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7]
+; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v2, v6
@@ -4488,8 +4488,8 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
 ; GFX8-NEXT:    v_bfrev_b32_e32 v10, 1
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[6:7], 0, v0
-; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX8-NEXT:    v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7]
+; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v2, v6
@@ -4514,8 +4514,8 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
 ; GFX9-NEXT:    v_bfrev_b32_e32 v10, 1
 ; GFX9-NEXT:    v_add_co_u32_e64 v1, s[6:7], 0, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[6:7], v0, v10, s[6:7]
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v6
@@ -4535,20 +4535,20 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v4
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[4:5]
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v2, v6
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s6, 0, v[6:7]
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 31, v9
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[4:5]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v11
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s6, 0, v[6:7]
 ; GFX10-NEXT:    v_add_co_u32 v1, s5, v12, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
 ; GFX10-NEXT:    v_add_co_u32 v2, s7, v0, 0
-; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7
+; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc_lo
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s6, s5
@@ -4569,8 +4569,8 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    s_addc_u32 s9, s1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX6-NEXT:    s_add_u32 s0, s4, 0
@@ -4583,13 +4583,13 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    s_add_u32 s0, s2, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s8
-; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX6-NEXT:    s_and_b32 s1, s1, 1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
+; GFX6-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
@@ -4603,8 +4603,8 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_addc_u32 s3, s4, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX6-NEXT:    v_readfirstlane_b32 s0, v4
@@ -4622,8 +4622,8 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_addc_u32 s9, s1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX8-NEXT:    s_add_u32 s0, s4, 0
@@ -4636,13 +4636,13 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    s_add_u32 s0, s2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s8
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
+; GFX8-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
@@ -4656,8 +4656,8 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_addc_u32 s3, s4, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v4
@@ -4675,8 +4675,8 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_addc_u32 s9, s1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    s_add_u32 s0, s4, 0
@@ -4689,13 +4689,13 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    s_add_u32 s0, s2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
@@ -4709,8 +4709,8 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_addc_u32 s3, s4, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v4
@@ -4784,8 +4784,8 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    s_and_b32 s9, s9, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
-; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
 ; GFX6-NEXT:    s_addc_u32 s9, s3, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -4816,13 +4816,13 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s4
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s5
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s2
-; GFX6-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
@@ -4849,8 +4849,8 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_addc_u32 s9, s3, s7
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    s_cmp_eq_u64 s[8:9], s[2:3]
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
@@ -4886,13 +4886,13 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s5
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
@@ -4919,8 +4919,8 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_addc_u32 s9, s3, s7
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    s_cmp_eq_u64 s[8:9], s[2:3]
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
@@ -4956,13 +4956,13 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
@@ -5286,18 +5286,18 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX10-LABEL: saddsat_i128_vs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, s0
-; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
-; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX10-NEXT:    s_and_b32 s0, 1, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
@@ -5516,19 +5516,19 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v4, v12
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v5, v13, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v6, v14, vcc_lo
-; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, v7, v15, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, v[10:11], v[4:5]
+; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v17
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cmp_eq_u64_e64 s5, v[12:13], v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[12:13], v[6:7]
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v1, 0
-; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v13
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[14:15]
+; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s4
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v5, v4, s5
@@ -5573,8 +5573,8 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_and_b32 s17, s17, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
-; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
 ; GFX6-NEXT:    s_addc_u32 s17, s3, s11
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -5614,24 +5614,24 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    s_and_b32 s2, s2, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX6-NEXT:    v_mov_b32_e32 v4, s9
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s8
+; GFX6-NEXT:    v_mov_b32_e32 v4, s9
+; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s17
 ; GFX6-NEXT:    s_addc_u32 s2, s6, s14
-; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
+; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    s_and_b32 s3, s3, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    s_addc_u32 s3, s7, s15
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -5658,18 +5658,18 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_and_b32 s8, s8, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_mov_b32_e32 v3, s0
-; GFX6-NEXT:    v_mov_b32_e32 v8, s1
 ; GFX6-NEXT:    s_addc_u32 s7, s7, s10
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    v_mov_b32_e32 v3, s0
+; GFX6-NEXT:    v_mov_b32_e32 v8, s1
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v8, v2, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s7
+; GFX6-NEXT:    v_mov_b32_e32 v8, s2
+; GFX6-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
 ; GFX6-NEXT:    v_readfirstlane_b32 s0, v5
@@ -5699,8 +5699,8 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_addc_u32 s17, s3, s11
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
@@ -5746,24 +5746,24 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_and_b32 s2, s2, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX8-NEXT:    v_mov_b32_e32 v4, s9
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s8
+; GFX8-NEXT:    v_mov_b32_e32 v4, s9
 ; GFX8-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s17
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    s_addc_u32 s3, s7, s15
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
 ; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
@@ -5795,18 +5795,18 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_and_b32 s8, s8, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    v_mov_b32_e32 v8, s1
 ; GFX8-NEXT:    s_addc_u32 s7, s7, s10
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NEXT:    v_mov_b32_e32 v8, s1
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v2, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8-NEXT:    v_mov_b32_e32 v8, s2
+; GFX8-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v5
@@ -5836,8 +5836,8 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_addc_u32 s17, s3, s11
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
@@ -5883,24 +5883,24 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_and_b32 s2, s2, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v4, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-NEXT:    v_mov_b32_e32 v4, s9
 ; GFX9-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s17
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    s_addc_u32 s3, s7, s15
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
 ; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
@@ -5932,18 +5932,18 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_and_b32 s8, s8, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-NEXT:    v_mov_b32_e32 v8, s1
 ; GFX9-NEXT:    s_addc_u32 s7, s7, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    v_mov_b32_e32 v8, s1
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v8, v2, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    v_mov_b32_e32 v8, s2
+; GFX9-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v5

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 3d495e85556bb..47e2475a73189 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -27,29 +27,29 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v2
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v7, 31, v5
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
 ; CHECK-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v6
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
 ; CHECK-NEXT:    v_sub_i32_e32 v8, vcc, 0, v1
-; CHECK-NEXT:    v_subb_u32_e32 v9, vcc, 0, v2, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v7
 ; CHECK-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; CHECK-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v3
 ; CHECK-NEXT:    v_trunc_f32_e32 v6, v6
 ; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v6
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CHECK-NEXT:    v_xor_b32_e32 v5, v5, v7
+; CHECK-NEXT:    v_subb_u32_e32 v9, vcc, 0, v2, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v8, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v8, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v12, v8, v3
+; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v7
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v6, v12
 ; CHECK-NEXT:    v_mul_lo_u32 v13, v3, v10
 ; CHECK-NEXT:    v_mul_hi_u32 v14, v3, v12
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v6, v12
+; CHECK-NEXT:    v_xor_b32_e32 v5, v5, v7
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
@@ -57,12 +57,12 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_mul_lo_u32 v14, v6, v10
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v3, v10
-; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v10
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v10
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
@@ -75,10 +75,10 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v8, v3
 ; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v13
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v11, v13
 ; CHECK-NEXT:    v_mul_lo_u32 v12, v3, v8
+; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v13
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v11, v13
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
@@ -87,12 +87,12 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v11, v8
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v3, v8
-; CHECK-NEXT:    v_mul_hi_u32 v8, v11, v8
 ; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
 ; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
+; CHECK-NEXT:    v_mul_hi_u32 v8, v11, v8
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v10
@@ -111,12 +111,12 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CHECK-NEXT:    v_mul_hi_u32 v9, v4, v6
-; CHECK-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v10, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
@@ -235,12 +235,12 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; CHECK-NEXT:    v_trunc_f32_e32 v1, v1
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; CHECK-NEXT:    s_subb_u32 s5, 0, s11
 ; CHECK-NEXT:    v_mov_b32_e32 v6, s11
-; CHECK-NEXT:    v_mul_lo_u32 v2, s5, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v3, s3, v1
+; CHECK-NEXT:    v_mul_lo_u32 v2, s5, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v5, s3, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v4, s3, v0
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
@@ -256,12 +256,12 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v2
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v5, v0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
@@ -286,12 +286,12 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_mul_lo_u32 v5, v3, v4
 ; CHECK-NEXT:    v_add_i32_e64 v2, s[0:1], v8, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v4
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[0:1], v5, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[0:1], v5, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[0:1], v7, v8
+; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v4
 ; CHECK-NEXT:    v_add_i32_e64 v2, s[0:1], v5, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_add_i32_e64 v4, s[0:1], v7, v5
@@ -311,12 +311,12 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_mul_lo_u32 v5, s13, v1
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v3, s12, v1
-; CHECK-NEXT:    v_mul_hi_u32 v1, s13, v1
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT:    v_mul_hi_u32 v1, s13, v1
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
@@ -334,9 +334,9 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
 ; CHECK-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v2, vcc, s10, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v3
+; CHECK-NEXT:    v_subrev_i32_e32 v2, vcc, s10, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v3, v4, v5, s[0:1]
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
@@ -350,8 +350,8 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; CHECK-NEXT:    s_xor_b64 s[0:1], s[6:7], s[8:9]
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    s_xor_b64 s[0:1], s[6:7], s[8:9]
 ; CHECK-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; CHECK-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
 ; CHECK-NEXT:    s_mov_b32 s1, 0
@@ -412,29 +412,29 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v10, v5
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v11, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v11, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v9, 0x4f800000, v10
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v9, v9
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v11, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v12, vcc, 0, v4
-; GISEL-NEXT:    v_subb_u32_e32 v13, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v11
 ; GISEL-NEXT:    v_mul_f32_e32 v9, 0x5f7ffffc, v9
 ; GISEL-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v9
 ; GISEL-NEXT:    v_trunc_f32_e32 v10, v10
 ; GISEL-NEXT:    v_mac_f32_e32 v9, 0xcf800000, v10
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v10
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v11
+; GISEL-NEXT:    v_subb_u32_e32 v13, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v13, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v12, v10
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v12, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v16, v12, v9
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v11
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v10, v16
 ; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v18, v9, v16
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v10, v16
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v11
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
@@ -442,12 +442,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v18, v10, v14
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v9, v14
-; GISEL-NEXT:    v_mul_hi_u32 v14, v10, v14
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v18, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
+; GISEL-NEXT:    v_mul_hi_u32 v14, v10, v14
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
@@ -460,10 +460,10 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v12, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v14
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; GISEL-NEXT:    v_mul_hi_u32 v14, v9, v17
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v15, v17
 ; GISEL-NEXT:    v_mul_lo_u32 v16, v9, v12
+; GISEL-NEXT:    v_mul_hi_u32 v14, v9, v17
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v15, v17
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
@@ -472,12 +472,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v15, v12
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v16, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v9, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v15, v12
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v16, s[4:5], v17, v16
+; GISEL-NEXT:    v_mul_hi_u32 v12, v15, v12
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
@@ -496,12 +496,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v1, v10
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v1, v10
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_mul_hi_u32 v10, v1, v10
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
@@ -539,42 +539,40 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v14, v4, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v5
 ; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v5, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v7, v7, v5
 ; GISEL-NEXT:    v_xor_b32_e32 v6, v6, v5
+; GISEL-NEXT:    v_xor_b32_e32 v7, v7, v5
 ; GISEL-NEXT:    v_xor_b32_e32 v4, v11, v8
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v7
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v10, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v9
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v8
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v10, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v6
-; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v7, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
 ; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v8
 ; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
 ; GISEL-NEXT:    v_mac_f32_e32 v8, 0xcf800000, v9
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v10
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v10
+; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v7, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v12, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v11, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v11, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v8
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v15
 ; GISEL-NEXT:    v_mul_lo_u32 v16, v8, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v8, v15
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v15
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v10
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
@@ -582,12 +580,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v8, v13
-; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
@@ -600,11 +598,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v11, v8
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v16
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v14, v16
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v11
+; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v16
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v14, v16
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v10
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
@@ -612,12 +611,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v14, v11
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v14, v11
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
+; GISEL-NEXT:    v_mul_hi_u32 v11, v14, v11
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
@@ -625,25 +624,26 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v3, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v2, v9
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v2, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v3, v8
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v3, v9
+; GISEL-NEXT:    v_mul_hi_u32 v8, v3, v8
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v2, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v9
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v9
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
@@ -716,26 +716,26 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v5
 ; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v11, v5, vcc
-; CGP-NEXT:    v_sub_i32_e32 v12, vcc, 0, v1
 ; CGP-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; CGP-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v3
 ; CGP-NEXT:    v_trunc_f32_e32 v11, v11
 ; CGP-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v11
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; CGP-NEXT:    v_sub_i32_e32 v12, vcc, 0, v1
 ; CGP-NEXT:    v_subb_u32_e32 v13, vcc, 0, v2, vcc
-; CGP-NEXT:    v_xor_b32_e32 v4, v4, v5
 ; CGP-NEXT:    v_mul_lo_u32 v14, v13, v3
 ; CGP-NEXT:    v_mul_lo_u32 v15, v12, v11
 ; CGP-NEXT:    v_mul_hi_u32 v17, v12, v3
 ; CGP-NEXT:    v_mul_lo_u32 v16, v12, v3
-; CGP-NEXT:    v_xor_b32_e32 v10, v10, v5
+; CGP-NEXT:    v_xor_b32_e32 v4, v4, v5
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
 ; CGP-NEXT:    v_mul_lo_u32 v15, v11, v16
 ; CGP-NEXT:    v_mul_lo_u32 v17, v3, v14
 ; CGP-NEXT:    v_mul_hi_u32 v18, v3, v16
 ; CGP-NEXT:    v_mul_hi_u32 v16, v11, v16
+; CGP-NEXT:    v_xor_b32_e32 v10, v10, v5
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
@@ -743,12 +743,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v18, v11, v14
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
 ; CGP-NEXT:    v_mul_hi_u32 v17, v3, v14
-; CGP-NEXT:    v_mul_hi_u32 v14, v11, v14
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v18, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
+; CGP-NEXT:    v_mul_hi_u32 v14, v11, v14
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
@@ -761,10 +761,10 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_hi_u32 v12, v12, v3
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_mul_hi_u32 v14, v3, v17
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
 ; CGP-NEXT:    v_mul_lo_u32 v13, v15, v17
 ; CGP-NEXT:    v_mul_lo_u32 v16, v3, v12
+; CGP-NEXT:    v_mul_hi_u32 v14, v3, v17
 ; CGP-NEXT:    v_mul_hi_u32 v17, v15, v17
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
@@ -773,12 +773,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v14, v15, v12
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v16, v13
 ; CGP-NEXT:    v_mul_hi_u32 v16, v3, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v15, v12
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v17, v16
+; CGP-NEXT:    v_mul_hi_u32 v12, v15, v12
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
@@ -797,12 +797,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v14, v10, v11
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_mul_hi_u32 v13, v4, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v10, v11
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v14, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_mul_hi_u32 v11, v10, v11
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
@@ -895,26 +895,26 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v7
 ; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v9, v7, vcc
-; CGP-NEXT:    v_sub_i32_e32 v10, vcc, 0, v3
 ; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; CGP-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v5
 ; CGP-NEXT:    v_trunc_f32_e32 v9, v9
 ; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v9
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; CGP-NEXT:    v_sub_i32_e32 v10, vcc, 0, v3
 ; CGP-NEXT:    v_subb_u32_e32 v11, vcc, 0, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v6, v6, v7
 ; CGP-NEXT:    v_mul_lo_u32 v12, v11, v5
 ; CGP-NEXT:    v_mul_lo_u32 v13, v10, v9
 ; CGP-NEXT:    v_mul_hi_u32 v15, v10, v5
 ; CGP-NEXT:    v_mul_lo_u32 v14, v10, v5
-; CGP-NEXT:    v_xor_b32_e32 v8, v8, v7
+; CGP-NEXT:    v_xor_b32_e32 v6, v6, v7
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; CGP-NEXT:    v_mul_lo_u32 v13, v9, v14
 ; CGP-NEXT:    v_mul_lo_u32 v15, v5, v12
 ; CGP-NEXT:    v_mul_hi_u32 v16, v5, v14
 ; CGP-NEXT:    v_mul_hi_u32 v14, v9, v14
+; CGP-NEXT:    v_xor_b32_e32 v8, v8, v7
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
@@ -922,12 +922,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v16, v9, v12
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
 ; CGP-NEXT:    v_mul_hi_u32 v15, v5, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v9, v12
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT:    v_mul_hi_u32 v12, v9, v12
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
@@ -940,10 +940,10 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_hi_u32 v10, v10, v5
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_mul_hi_u32 v12, v5, v15
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
 ; CGP-NEXT:    v_mul_lo_u32 v11, v13, v15
 ; CGP-NEXT:    v_mul_lo_u32 v14, v5, v10
+; CGP-NEXT:    v_mul_hi_u32 v12, v5, v15
 ; CGP-NEXT:    v_mul_hi_u32 v15, v13, v15
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
@@ -952,12 +952,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v12, v13, v10
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
 ; CGP-NEXT:    v_mul_hi_u32 v14, v5, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v13, v10
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
+; CGP-NEXT:    v_mul_hi_u32 v10, v13, v10
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
@@ -976,12 +976,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v12, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_mul_hi_u32 v11, v6, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v12, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
@@ -1071,19 +1071,19 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
 ; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
-; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v4
 ; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v2
+; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
@@ -1097,12 +1097,12 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -1128,12 +1128,12 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v7
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v7
-; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v7
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
+; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v7
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v9, v8
@@ -1152,12 +1152,12 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
@@ -1176,13 +1176,13 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_subrev_i32_e32 v0, vcc, s6, v0
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v6
 ; CHECK-NEXT:    v_mov_b32_e32 v7, s7
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v6
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
 ; CHECK-NEXT:    v_addc_u32_e32 v7, vcc, 0, v4, vcc
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; CHECK-NEXT:    s_bfe_i32 s4, -1, 0x10000
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v8, s4
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1212,8 +1212,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_add_u32 s4, s10, 0
 ; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GISEL-NEXT:    s_and_b32 s5, s5, 1
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    s_mov_b32 s6, 0
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    s_mov_b32 s7, s6
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
@@ -1234,19 +1234,18 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s11, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v10, s11, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s11, v4
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
@@ -1254,12 +1253,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -1277,6 +1276,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
@@ -1284,12 +1284,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
@@ -1309,12 +1309,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -1343,20 +1343,20 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
 ; GISEL-NEXT:    s_add_u32 s4, s10, 0
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
+; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v8
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v9, vcc
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
-; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s6
@@ -1399,12 +1399,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -1430,12 +1430,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
@@ -1455,12 +1455,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -1517,27 +1517,26 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
 ; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v5
 ; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v5
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT:    s_movk_i32 s7, 0x1000
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v7
 ; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, s6, v7
 ; CGP-NEXT:    v_mul_lo_u32 v11, s6, v7
-; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
+; CGP-NEXT:    s_movk_i32 s7, 0x1000
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_mul_lo_u32 v10, v8, v11
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v9
 ; CGP-NEXT:    v_mul_hi_u32 v13, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
@@ -1545,12 +1544,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v13, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CGP-NEXT:    v_mul_hi_u32 v12, v7, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
@@ -1568,7 +1567,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v14, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v9, v7, v13
 ; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
@@ -1576,12 +1575,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v12, v10, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v14, v9
 ; CGP-NEXT:    v_mul_hi_u32 v14, v7, v11
-; CGP-NEXT:    v_mul_hi_u32 v10, v10, v11
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
+; CGP-NEXT:    v_mul_hi_u32 v10, v10, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v12
@@ -1593,7 +1592,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v10, v0, v8
 ; CGP-NEXT:    v_mul_hi_u32 v11, v0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
@@ -1601,12 +1600,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v11, v1, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CGP-NEXT:    v_mul_hi_u32 v10, v0, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v1, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_mul_hi_u32 v8, v1, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -1615,7 +1614,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v10, s7, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, s7, v7
 ; CGP-NEXT:    v_mul_lo_u32 v11, s7, v7
-; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
+; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v11
@@ -1626,13 +1625,13 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_subrev_i32_e32 v0, vcc, s7, v0
 ; CGP-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v10
 ; CGP-NEXT:    v_mov_b32_e32 v11, s8
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v10
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, 1, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, v11, v9, s[4:5]
 ; CGP-NEXT:    v_addc_u32_e32 v11, vcc, 0, v8, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s7, v0
 ; CGP-NEXT:    s_bfe_i32 s4, -1, 0x10000
+; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s7, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v12, s4
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1650,11 +1649,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
 ; CGP-NEXT:    v_mul_lo_u32 v8, -1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v9, s6, v7
 ; CGP-NEXT:    v_mul_hi_u32 v11, s6, v4
 ; CGP-NEXT:    v_mul_lo_u32 v10, s6, v4
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
@@ -1670,12 +1670,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; CGP-NEXT:    v_mul_hi_u32 v11, v4, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
@@ -1701,40 +1701,40 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v11, v9, v10
 ; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v13, v8
 ; CGP-NEXT:    v_mul_hi_u32 v13, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v9, v9, v10
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
+; CGP-NEXT:    v_mul_hi_u32 v9, v9, v10
 ; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
 ; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; CGP-NEXT:    v_mul_lo_u32 v8, v3, v4
 ; CGP-NEXT:    v_mul_lo_u32 v9, v2, v7
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
 ; CGP-NEXT:    v_mul_hi_u32 v5, v2, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v8, v3, v7
+; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
 ; CGP-NEXT:    v_mul_hi_u32 v9, v2, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
@@ -1754,13 +1754,13 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v2
 ; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
 ; CGP-NEXT:    v_mov_b32_e32 v9, s6
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[4:5]
 ; CGP-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
 ; CGP-NEXT:    s_bfe_i32 s4, -1, 0x10000
+; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v10, s4
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
@@ -1794,19 +1794,19 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
 ; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
-; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v4
 ; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v2
+; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
@@ -1820,12 +1820,12 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -1851,12 +1851,12 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v7
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v7
-; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v7
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
+; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v7
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v9, v8
@@ -1875,12 +1875,12 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
@@ -1899,13 +1899,13 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_subrev_i32_e32 v0, vcc, s6, v0
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v6
 ; CHECK-NEXT:    v_mov_b32_e32 v7, s7
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v6
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
 ; CHECK-NEXT:    v_addc_u32_e32 v7, vcc, 0, v4, vcc
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; CHECK-NEXT:    s_bfe_i32 s4, -1, 0x10000
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v8, s4
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1935,8 +1935,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_add_u32 s4, s10, 0
 ; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GISEL-NEXT:    s_and_b32 s5, s5, 1
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    s_mov_b32 s6, 0
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    s_mov_b32 s7, s6
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
@@ -1957,19 +1957,18 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s11, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v10, s11, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s11, v4
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
@@ -1977,12 +1976,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -2000,6 +1999,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
@@ -2007,12 +2007,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
@@ -2032,12 +2032,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -2066,20 +2066,20 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
 ; GISEL-NEXT:    s_add_u32 s4, s10, 0
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
+; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v8
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v9, vcc
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
-; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s6
@@ -2122,12 +2122,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -2153,12 +2153,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
@@ -2178,12 +2178,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -2240,27 +2240,26 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
 ; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v5
 ; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v5
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT:    s_mov_b32 s7, 0x12d8fb
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v7
 ; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, s6, v7
 ; CGP-NEXT:    v_mul_lo_u32 v11, s6, v7
-; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
+; CGP-NEXT:    s_mov_b32 s7, 0x12d8fb
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_mul_lo_u32 v10, v8, v11
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v9
 ; CGP-NEXT:    v_mul_hi_u32 v13, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
@@ -2268,12 +2267,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v13, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CGP-NEXT:    v_mul_hi_u32 v12, v7, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
@@ -2291,7 +2290,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v14, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v9, v7, v13
 ; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
@@ -2299,12 +2298,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v12, v10, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v14, v9
 ; CGP-NEXT:    v_mul_hi_u32 v14, v7, v11
-; CGP-NEXT:    v_mul_hi_u32 v10, v10, v11
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
+; CGP-NEXT:    v_mul_hi_u32 v10, v10, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v12
@@ -2316,7 +2315,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v10, v0, v8
 ; CGP-NEXT:    v_mul_hi_u32 v11, v0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
@@ -2324,12 +2323,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v11, v1, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CGP-NEXT:    v_mul_hi_u32 v10, v0, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v1, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_mul_hi_u32 v8, v1, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -2338,7 +2337,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v10, s7, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, s7, v7
 ; CGP-NEXT:    v_mul_lo_u32 v11, s7, v7
-; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
+; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v11
@@ -2349,13 +2348,13 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_subrev_i32_e32 v0, vcc, s7, v0
 ; CGP-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v10
 ; CGP-NEXT:    v_mov_b32_e32 v11, s8
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v10
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, 1, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, v11, v9, s[4:5]
 ; CGP-NEXT:    v_addc_u32_e32 v11, vcc, 0, v8, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s7, v0
 ; CGP-NEXT:    s_bfe_i32 s4, -1, 0x10000
+; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s7, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v12, s4
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -2373,11 +2372,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
 ; CGP-NEXT:    v_mul_lo_u32 v8, -1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v9, s6, v7
 ; CGP-NEXT:    v_mul_hi_u32 v11, s6, v4
 ; CGP-NEXT:    v_mul_lo_u32 v10, s6, v4
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
@@ -2393,12 +2393,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; CGP-NEXT:    v_mul_hi_u32 v11, v4, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
@@ -2424,40 +2424,40 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v11, v9, v10
 ; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v13, v8
 ; CGP-NEXT:    v_mul_hi_u32 v13, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v9, v9, v10
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
+; CGP-NEXT:    v_mul_hi_u32 v9, v9, v10
 ; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
 ; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; CGP-NEXT:    v_mul_lo_u32 v8, v3, v4
 ; CGP-NEXT:    v_mul_lo_u32 v9, v2, v7
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
 ; CGP-NEXT:    v_mul_hi_u32 v5, v2, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v8, v3, v7
+; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
 ; CGP-NEXT:    v_mul_hi_u32 v9, v2, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
@@ -2477,13 +2477,13 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v2
 ; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
 ; CGP-NEXT:    v_mov_b32_e32 v9, s6
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[4:5]
 ; CGP-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
 ; CGP-NEXT:    s_bfe_i32 s4, -1, 0x10000
+; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v10, s4
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
@@ -2530,29 +2530,29 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v2
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v7, 31, v4
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v7, vcc
 ; CHECK-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v7, vcc
 ; CHECK-NEXT:    v_sub_i32_e32 v8, vcc, 0, v1
-; CHECK-NEXT:    v_subb_u32_e32 v9, vcc, 0, v2, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; CHECK-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; CHECK-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; CHECK-NEXT:    v_trunc_f32_e32 v6, v6
 ; CHECK-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v7
+; CHECK-NEXT:    v_subb_u32_e32 v9, vcc, 0, v2, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v5
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v8, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v8, v5
 ; CHECK-NEXT:    v_mul_lo_u32 v12, v8, v5
+; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v6, v12
 ; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v10
 ; CHECK-NEXT:    v_mul_hi_u32 v14, v5, v12
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v6, v12
+; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v7
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
@@ -2560,12 +2560,12 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_mul_lo_u32 v14, v6, v10
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v5, v10
-; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v10
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v10
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
@@ -2578,10 +2578,10 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v8, v5
 ; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; CHECK-NEXT:    v_mul_hi_u32 v10, v5, v13
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v11, v13
 ; CHECK-NEXT:    v_mul_lo_u32 v12, v5, v8
+; CHECK-NEXT:    v_mul_hi_u32 v10, v5, v13
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v11, v13
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
@@ -2590,12 +2590,12 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v11, v8
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v5, v8
-; CHECK-NEXT:    v_mul_hi_u32 v8, v11, v8
 ; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
 ; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
+; CHECK-NEXT:    v_mul_hi_u32 v8, v11, v8
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v10
@@ -2614,12 +2614,12 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v4, v6
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v6
-; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
@@ -2714,22 +2714,22 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v10, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v4
 ; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v9
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v8
 ; GISEL-NEXT:    v_xor_b32_e32 v9, v0, v10
-; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v17, v1, v10
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v8
 ; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v0
 ; GISEL-NEXT:    v_trunc_f32_e32 v8, v8
 ; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v8
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v12, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v11, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v11, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v0
+; GISEL-NEXT:    v_xor_b32_e32 v17, v1, v10
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v15
@@ -2743,12 +2743,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v13
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v16, v1
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v0, v13
-; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v13
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v14, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
@@ -2761,10 +2761,10 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v11, v0
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v13
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v15
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v1, v15
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v0, v11
+; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v15
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v1, v15
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
@@ -2773,12 +2773,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v1, v11
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v14, v0, v11
-; GISEL-NEXT:    v_mul_hi_u32 v1, v1, v11
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
+; GISEL-NEXT:    v_mul_hi_u32 v1, v1, v11
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v13
@@ -2798,12 +2798,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v17, v11
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v17, v11
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_mul_hi_u32 v11, v17, v11
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
@@ -2857,8 +2857,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v2, v10
 ; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v3, v10, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v3, v1, v10
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GISEL-NEXT:    v_xor_b32_e32 v3, v1, v10
 ; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GISEL-NEXT:    v_trunc_f32_e32 v1, v1
 ; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
@@ -2884,12 +2884,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_lo_u32 v17, v1, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v0, v13
-; GISEL-NEXT:    v_mul_hi_u32 v13, v1, v13
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; GISEL-NEXT:    v_mul_hi_u32 v13, v1, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
@@ -2902,10 +2902,10 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v11, v0
 ; GISEL-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v13
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v16
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v14, v16
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v0, v11
+; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v16
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v14, v16
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
@@ -2914,12 +2914,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v14, v11
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v0, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v14, v11
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
+; GISEL-NEXT:    v_mul_hi_u32 v11, v14, v11
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
@@ -3002,8 +3002,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mov_b32_e32 v8, v0
 ; CGP-NEXT:    v_or_b32_e32 v1, v9, v3
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
-; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    v_lshl_b64 v[10:11], s[4:5], v6
+; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
@@ -3021,26 +3021,26 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v6
 ; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v9, v6, vcc
-; CGP-NEXT:    v_sub_i32_e32 v12, vcc, 0, v1
 ; CGP-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; CGP-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v3
 ; CGP-NEXT:    v_trunc_f32_e32 v9, v9
 ; CGP-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v9
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; CGP-NEXT:    v_sub_i32_e32 v12, vcc, 0, v1
 ; CGP-NEXT:    v_subb_u32_e32 v13, vcc, 0, v2, vcc
-; CGP-NEXT:    v_xor_b32_e32 v4, v4, v6
 ; CGP-NEXT:    v_mul_lo_u32 v14, v13, v3
 ; CGP-NEXT:    v_mul_lo_u32 v15, v12, v9
 ; CGP-NEXT:    v_mul_hi_u32 v17, v12, v3
 ; CGP-NEXT:    v_mul_lo_u32 v16, v12, v3
-; CGP-NEXT:    v_xor_b32_e32 v8, v8, v6
+; CGP-NEXT:    v_xor_b32_e32 v4, v4, v6
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
 ; CGP-NEXT:    v_mul_lo_u32 v15, v9, v16
 ; CGP-NEXT:    v_mul_lo_u32 v17, v3, v14
 ; CGP-NEXT:    v_mul_hi_u32 v18, v3, v16
 ; CGP-NEXT:    v_mul_hi_u32 v16, v9, v16
+; CGP-NEXT:    v_xor_b32_e32 v8, v8, v6
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
@@ -3048,12 +3048,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_lo_u32 v18, v9, v14
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
 ; CGP-NEXT:    v_mul_hi_u32 v17, v3, v14
-; CGP-NEXT:    v_mul_hi_u32 v14, v9, v14
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v18, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
+; CGP-NEXT:    v_mul_hi_u32 v14, v9, v14
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
@@ -3066,10 +3066,10 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_hi_u32 v12, v12, v3
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v14
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_mul_hi_u32 v14, v3, v17
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
 ; CGP-NEXT:    v_mul_lo_u32 v13, v15, v17
 ; CGP-NEXT:    v_mul_lo_u32 v16, v3, v12
+; CGP-NEXT:    v_mul_hi_u32 v14, v3, v17
 ; CGP-NEXT:    v_mul_hi_u32 v17, v15, v17
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
@@ -3078,12 +3078,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_lo_u32 v14, v15, v12
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v16, v13
 ; CGP-NEXT:    v_mul_hi_u32 v16, v3, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v15, v12
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v17, v16
+; CGP-NEXT:    v_mul_hi_u32 v12, v15, v12
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
@@ -3102,12 +3102,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_lo_u32 v14, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_mul_hi_u32 v13, v4, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v14, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
@@ -3197,29 +3197,29 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v8, v4
 ; CGP-NEXT:    v_ashrrev_i32_e32 v9, 31, v7
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
 ; CGP-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v8
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v10, vcc, 0, v3
-; CGP-NEXT:    v_subb_u32_e32 v11, vcc, 0, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v9
 ; CGP-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v6
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_xor_b32_e32 v7, v7, v9
+; CGP-NEXT:    v_subb_u32_e32 v11, vcc, 0, v4, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v12, v11, v6
 ; CGP-NEXT:    v_mul_lo_u32 v13, v10, v8
 ; CGP-NEXT:    v_mul_hi_u32 v15, v10, v6
 ; CGP-NEXT:    v_mul_lo_u32 v14, v10, v6
+; CGP-NEXT:    v_xor_b32_e32 v5, v5, v9
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; CGP-NEXT:    v_mul_lo_u32 v13, v8, v14
 ; CGP-NEXT:    v_mul_lo_u32 v15, v6, v12
 ; CGP-NEXT:    v_mul_hi_u32 v16, v6, v14
 ; CGP-NEXT:    v_mul_hi_u32 v14, v8, v14
+; CGP-NEXT:    v_xor_b32_e32 v7, v7, v9
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
@@ -3227,12 +3227,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_lo_u32 v16, v8, v12
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
 ; CGP-NEXT:    v_mul_hi_u32 v15, v6, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v8, v12
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT:    v_mul_hi_u32 v12, v8, v12
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
@@ -3245,10 +3245,10 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_hi_u32 v10, v10, v6
 ; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v12
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_mul_hi_u32 v12, v6, v15
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
 ; CGP-NEXT:    v_mul_lo_u32 v11, v13, v15
 ; CGP-NEXT:    v_mul_lo_u32 v14, v6, v10
+; CGP-NEXT:    v_mul_hi_u32 v12, v6, v15
 ; CGP-NEXT:    v_mul_hi_u32 v15, v13, v15
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
@@ -3257,12 +3257,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_lo_u32 v12, v13, v10
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
 ; CGP-NEXT:    v_mul_hi_u32 v14, v6, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v13, v10
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
+; CGP-NEXT:    v_mul_hi_u32 v10, v13, v10
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
@@ -3281,12 +3281,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_mul_hi_u32 v11, v5, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
@@ -3385,8 +3385,8 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
@@ -3431,9 +3431,9 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v1
 ; GISEL-NEXT:    v_subb_u32_e32 v8, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GISEL-NEXT:    v_and_b32_e32 v6, s6, v6
 ; GISEL-NEXT:    v_and_b32_e32 v2, s6, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
@@ -3461,12 +3461,12 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v11
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
@@ -3479,10 +3479,10 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v4
 ; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v12
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v10, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
@@ -3491,12 +3491,12 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v7
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v7
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
+; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v7
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
@@ -3515,12 +3515,12 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v13, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v13, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_mul_hi_u32 v5, v13, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -3592,12 +3592,12 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v11
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v3, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v9
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
+; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v9
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
@@ -3610,10 +3610,10 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v5, v5, v3
 ; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v9
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v12
 ; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v10, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v3, v5
+; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
@@ -3622,12 +3622,12 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v5
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v10, v5
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
+; GISEL-NEXT:    v_mul_hi_u32 v5, v10, v5
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
@@ -3648,12 +3648,12 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v3, v13, v3
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v13, v4
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_mul_hi_u32 v4, v13, v4
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 8d911d2843ab1..59db307e8a9bd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -85,8 +85,8 @@ define amdgpu_kernel void @sdivrem_i32(i32 addrspace(1)* %out0, i32 addrspace(1)
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, s8, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
@@ -117,8 +117,8 @@ define amdgpu_kernel void @sdivrem_i32(i32 addrspace(1)* %out0, i32 addrspace(1)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX10-NEXT:    s_xor_b32 s4, s8, s6
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s7, v1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
@@ -147,8 +147,8 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_ashr_i32 s12, s11, 31
 ; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
+; GFX8-NEXT:    s_ashr_i32 s12, s11, 31
 ; GFX8-NEXT:    s_add_u32 s0, s8, s2
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    s_and_b32 s1, s1, 1
@@ -176,12 +176,12 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
 ; GFX8-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX8-NEXT:    s_subb_u32 s15, 0, s9
-; GFX8-NEXT:    v_mul_lo_u32 v3, s15, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s14, v1
+; GFX8-NEXT:    v_mul_lo_u32 v3, s15, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v5, s14, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v4, s14, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s9
@@ -198,12 +198,12 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v0, v2
-; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
+; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
@@ -228,12 +228,12 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v4
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v8, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GFX8-NEXT:    v_mul_hi_u32 v3, v3, v4
 ; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v8
+; GFX8-NEXT:    v_mul_hi_u32 v3, v3, v4
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v5, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v7, v5
@@ -253,12 +253,12 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_mul_lo_u32 v5, s11, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v3, s10, v1
-; GFX8-NEXT:    v_mul_hi_u32 v1, s11, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
+; GFX8-NEXT:    v_mul_hi_u32 v1, s11, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
@@ -284,25 +284,25 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], 1, v0
 ; GFX8-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v8
-; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s8, v7
+; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v8
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s8, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v9
 ; GFX8-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[0:1]
 ; GFX8-NEXT:    s_xor_b64 s[0:1], s[2:3], s[12:13]
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
@@ -327,8 +327,8 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s12, s11, 31
 ; GFX9-NEXT:    s_ashr_i32 s2, s9, 31
+; GFX9-NEXT:    s_ashr_i32 s12, s11, 31
 ; GFX9-NEXT:    s_add_u32 s0, s8, s2
 ; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX9-NEXT:    s_and_b32 s1, s1, 1
@@ -356,12 +356,12 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
 ; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX9-NEXT:    s_subb_u32 s15, 0, s9
-; GFX9-NEXT:    v_mul_lo_u32 v3, s15, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v1
+; GFX9-NEXT:    v_mul_lo_u32 v3, s15, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s14, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s11
@@ -383,10 +383,10 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s15, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s14, v3
@@ -459,26 +459,26 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], 1, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v1, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v9
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v8
-; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s8, v8
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v9
+; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s8, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v10
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
 ; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v10, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v11, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[12:13]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
@@ -518,11 +518,11 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s8
 ; GFX10-NEXT:    s_sub_u32 s1, 0, s8
 ; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    s_and_b32 s0, s0, 1
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_subb_u32 s14, 0, s9
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
@@ -614,8 +614,8 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v4, v3
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v0, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v5, vcc_lo, s10, v5
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s11, v2
+; GFX10-NEXT:    v_sub_co_u32 v5, vcc_lo, s10, v5
 ; GFX10-NEXT:    v_sub_co_ci_u32_e64 v2, s0, s11, v2, vcc_lo
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s8, v5
@@ -637,13 +637,13 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, v7, s0
 ; GFX10-NEXT:    v_sub_co_u32 v10, s0, v8, s8
-; GFX10-NEXT:    s_xor_b64 s[8:9], s[2:3], s[12:13]
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v8, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
+; GFX10-NEXT:    s_xor_b64 s[8:9], s[2:3], s[12:13]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v5, v7, s0
@@ -784,8 +784,8 @@ define amdgpu_kernel void @sdivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v3
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s9, v1
@@ -810,12 +810,12 @@ define amdgpu_kernel void @sdivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
 ; GFX9-NEXT:    s_xor_b32 s4, s11, s5
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
+; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s11, v3
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
@@ -866,22 +866,21 @@ define amdgpu_kernel void @sdivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s0, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s1, v3
 ; GFX10-NEXT:    s_xor_b32 s1, s10, s2
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s8, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s9, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s8, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s9, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
@@ -890,6 +889,7 @@ define amdgpu_kernel void @sdivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 ; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v2, s10, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v3, s11, v3
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s0, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s10, v2
@@ -960,9 +960,9 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    s_add_i32 s0, s2, s5
 ; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s14, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v1, s16
+; GFX8-NEXT:    s_add_i32 s0, s2, s5
 ; GFX8-NEXT:    s_xor_b32 s2, s0, s5
 ; GFX8-NEXT:    s_ashr_i32 s12, s6, 31
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s1, v5
@@ -972,29 +972,29 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s2
 ; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 1, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 1, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s16, v2
 ; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v3
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s16, v2
 ; GFX8-NEXT:    s_sub_i32 s0, 0, s2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX8-NEXT:    s_add_i32 s1, s6, s12
 ; GFX8-NEXT:    v_mul_lo_u32 v6, s0, v5
+; GFX8-NEXT:    s_add_i32 s1, s6, s12
 ; GFX8-NEXT:    s_xor_b32 s1, s1, s12
 ; GFX8-NEXT:    s_xor_b32 s0, s4, s15
-; GFX8-NEXT:    v_xor_b32_e32 v2, s4, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v5, v6
+; GFX8-NEXT:    v_xor_b32_e32 v2, s4, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v6, s1, v5
 ; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s4, v2
 ; GFX8-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX8-NEXT:    s_add_i32 s0, s3, s4
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v6, s2
+; GFX8-NEXT:    s_add_i32 s0, s3, s4
 ; GFX8-NEXT:    s_xor_b32 s3, s0, s4
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s1, v7
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 1, v6
@@ -1003,10 +1003,9 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v7, s3
 ; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v6
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v7, v7
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v6
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
 ; GFX8-NEXT:    v_mul_f32_e32 v3, v7, v3
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX8-NEXT:    v_subrev_u32_e64 v7, s[0:1], s2, v2
@@ -1015,16 +1014,17 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v3
 ; GFX8-NEXT:    s_ashr_i32 s2, s7, 31
 ; GFX8-NEXT:    s_add_i32 s1, s7, s2
-; GFX8-NEXT:    s_xor_b32 s1, s1, s2
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v3, v2
+; GFX8-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX8-NEXT:    s_xor_b32 s0, s12, s5
 ; GFX8-NEXT:    v_xor_b32_e32 v6, s0, v6
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v3, s1, v2
 ; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s0, v6
 ; GFX8-NEXT:    v_xor_b32_e32 v6, s12, v7
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s12, v6
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v3, s3
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s12, v6
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v3
 ; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s1, v7
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v7
@@ -1097,8 +1097,8 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v5, s7, v4
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    s_xor_b32 s7, s9, s13
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s7, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
@@ -1136,15 +1136,14 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX9-NEXT:    s_add_i32 s8, s10, s5
 ; GFX9-NEXT:    s_xor_b32 s8, s8, s5
 ; GFX9-NEXT:    v_add_u32_e32 v5, v5, v6
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v8
 ; GFX9-NEXT:    v_mul_hi_u32 v6, s8, v5
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s13, v3
-; GFX9-NEXT:    v_mul_f32_e32 v2, v8, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v6, s7
+; GFX9-NEXT:    v_mul_f32_e32 v2, v8, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    v_subrev_u32_e32 v5, s13, v3
-; GFX9-NEXT:    s_xor_b32 s6, s5, s6
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v7
 ; GFX9-NEXT:    s_sub_i32 s8, 0, s9
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s8, v2
@@ -1164,12 +1163,12 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v8
 ; GFX9-NEXT:    v_mul_hi_u32 v8, s8, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX9-NEXT:    s_xor_b32 s6, s5, s6
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
-; GFX9-NEXT:    v_xor_b32_e32 v2, s6, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v8, s9
+; GFX9-NEXT:    v_xor_b32_e32 v2, s6, v6
 ; GFX9-NEXT:    v_subrev_u32_e32 v6, s5, v3
 ; GFX9-NEXT:    s_xor_b32 s4, s7, s4
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s6, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v7
 ; GFX9-NEXT:    v_add_u32_e32 v7, 1, v8
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
@@ -1182,8 +1181,9 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX9-NEXT:    v_subrev_u32_e32 v8, s9, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v3, v8, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s4, v7
-; GFX9-NEXT:    v_xor_b32_e32 v7, s7, v8
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s6, v2
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v3
+; GFX9-NEXT:    v_xor_b32_e32 v7, s7, v8
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_subrev_u32_e32 v7, s7, v7
 ; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -1294,23 +1294,23 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, s10, v4
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v5
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v2
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v5
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s8, v6
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v9, s11, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, s8, v6
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v12, s9, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX10-NEXT:    v_xor_b32_e32 v0, s12, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s0
+; GFX10-NEXT:    s_xor_b32 s0, s19, s15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc_lo
-; GFX10-NEXT:    s_xor_b32 s0, s19, s15
+; GFX10-NEXT:    v_xor_b32_e32 v0, s12, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, s13, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v2, s14, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v3, s0, v3
@@ -1373,12 +1373,12 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
 ; GFX8-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX8-NEXT:    s_subb_u32 s17, 0, s15
-; GFX8-NEXT:    v_mul_lo_u32 v3, s17, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s16, v1
+; GFX8-NEXT:    v_mul_lo_u32 v3, s17, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v5, s16, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v4, s16, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s15
@@ -1395,12 +1395,12 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v0, v2
-; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
+; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
@@ -1425,12 +1425,12 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v4
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v8, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GFX8-NEXT:    v_mul_hi_u32 v3, v3, v4
 ; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v8
+; GFX8-NEXT:    v_mul_hi_u32 v3, v3, v4
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v5, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v7, v5
@@ -1450,12 +1450,12 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v5, s9, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v3, s8, v1
-; GFX8-NEXT:    v_mul_hi_u32 v1, s9, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
+; GFX8-NEXT:    v_mul_hi_u32 v1, s9, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
@@ -1481,26 +1481,26 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], 1, v0
 ; GFX8-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v8
-; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s14, v7
+; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v8
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s14, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v9
 ; GFX8-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[0:1]
 ; GFX8-NEXT:    s_xor_b64 s[0:1], s[6:7], s[12:13]
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    s_ashr_i32 s8, s11, 31
@@ -1535,13 +1535,13 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GFX8-NEXT:    s_mov_b32 s9, s8
 ; GFX8-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_f32_e32 v6, 0xcf800000, v3
 ; GFX8-NEXT:    s_xor_b64 s[6:7], s[0:1], s[8:9]
+; GFX8-NEXT:    v_mul_f32_e32 v6, 0xcf800000, v3
 ; GFX8-NEXT:    v_add_f32_e32 v2, v6, v2
 ; GFX8-NEXT:    s_sub_u32 s10, 0, s2
-; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX8-NEXT:    s_and_b32 s0, s0, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX8-NEXT:    s_subb_u32 s11, 0, s3
@@ -1563,12 +1563,12 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v11, v3, v6
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
 ; GFX8-NEXT:    v_mul_hi_u32 v9, v2, v6
-; GFX8-NEXT:    v_mul_hi_u32 v6, v3, v6
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v11, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v11, v9
+; GFX8-NEXT:    v_mul_hi_u32 v6, v3, v6
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v8, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v9, v8
@@ -1593,12 +1593,12 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v9, v7, v8
 ; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v12, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v12, v2, v8
-; GFX8-NEXT:    v_mul_hi_u32 v7, v7, v8
 ; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], v9, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], v9, v12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v11, s[0:1], v11, v12
+; GFX8-NEXT:    v_mul_hi_u32 v7, v7, v8
 ; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v9, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v8, s[0:1], v11, v9
@@ -1618,12 +1618,12 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v9, s7, v3
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v7, s6, v3
-; GFX8-NEXT:    v_mul_hi_u32 v3, s7, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
+; GFX8-NEXT:    v_mul_hi_u32 v3, s7, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
@@ -1661,14 +1661,14 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_subrev_u32_e64 v13, s[0:1], s2, v11
 ; GFX8-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v14, v10, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v15, v16, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v13, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
 ; GFX8-NEXT:    s_xor_b64 s[0:1], s[8:9], s[12:13]
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s0, v2
@@ -1725,12 +1725,12 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
 ; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX9-NEXT:    s_subb_u32 s17, 0, s15
-; GFX9-NEXT:    v_mul_lo_u32 v3, s17, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s16, v1
+; GFX9-NEXT:    v_mul_lo_u32 v3, s17, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s16, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s16, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v3, v2, v4
@@ -1751,10 +1751,10 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s17, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s16, v3
@@ -1827,28 +1827,28 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], 1, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[0:1], 0, v1, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v8
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
-; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s14, v7
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v8
+; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s14, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v12, s[0:1], 1, v9
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
 ; GFX9-NEXT:    v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[0:1]
-; GFX9-NEXT:    s_ashr_i32 s8, s11, 31
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[0:1]
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[6:7], s[12:13]
+; GFX9-NEXT:    s_ashr_i32 s8, s11, 31
 ; GFX9-NEXT:    s_ashr_i32 s12, s3, 31
 ; GFX9-NEXT:    s_add_u32 s10, s10, s8
 ; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
@@ -1877,9 +1877,9 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
 ; GFX9-NEXT:    v_mul_f32_e32 v7, 0xcf800000, v6
 ; GFX9-NEXT:    v_add_f32_e32 v4, v7, v4
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    s_subb_u32 s14, 0, s3
@@ -1909,10 +1909,10 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v9, v10, v9
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v5, v9, v8, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v7
 ; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], v6, v5, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s14, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v7
@@ -1945,17 +1945,17 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v4, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s6, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v9, s10, v7
+; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s6, v3
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v2, v12, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v2, s10, v6
-; GFX9-NEXT:    v_mul_hi_u32 v6, s11, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s11, v7
+; GFX9-NEXT:    v_mul_hi_u32 v6, s11, v6
 ; GFX9-NEXT:    v_add_u32_e32 v2, v8, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v8, s10, v7
 ; GFX9-NEXT:    v_mul_hi_u32 v7, s11, v7
@@ -2003,12 +2003,12 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_subrev_co_u32_e64 v15, s[0:1], s2, v11
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v11, v15, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[8:9], s[12:13]
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s0, v2
@@ -2017,8 +2017,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v8, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v7, s8, v7
-; GFX9-NEXT:    v_xor_b32_e32 v8, s8, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v13, 0
+; GFX9-NEXT:    v_xor_b32_e32 v8, s8, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s8
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s8, v7
 ; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v8, v9, vcc
@@ -2058,32 +2058,32 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10-NEXT:    s_subb_u32 s23, 0, s9
 ; GFX10-NEXT:    s_ashr_i32 s16, s11, 31
-; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    s_ashr_i32 s18, s3, 31
 ; GFX10-NEXT:    s_xor_b64 s[20:21], s[12:13], s[6:7]
+; GFX10-NEXT:    s_ashr_i32 s18, s3, 31
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    s_add_u32 s0, s10, s16
 ; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
 ; GFX10-NEXT:    s_mov_b32 s19, s18
+; GFX10-NEXT:    s_and_b32 s1, s1, 1
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT:    s_mov_b32 s17, s16
 ; GFX10-NEXT:    s_addc_u32 s1, s11, s16
 ; GFX10-NEXT:    s_add_u32 s2, s2, s18
 ; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX10-NEXT:    s_and_b32 s6, s6, 1
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX10-NEXT:    s_addc_u32 s3, s3, s18
 ; GFX10-NEXT:    s_xor_b64 s[10:11], s[0:1], s[16:17]
 ; GFX10-NEXT:    s_xor_b64 s[2:3], s[2:3], s[18:19]
-; GFX10-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s2
 ; GFX10-NEXT:    s_sub_u32 s6, 0, s2
 ; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
+; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX10-NEXT:    s_and_b32 s0, s0, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
@@ -2112,19 +2112,19 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX10-NEXT:    v_add_f32_e32 v1, v9, v1
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v8
-; GFX10-NEXT:    v_mul_lo_u32 v9, s6, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v10, v6
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v7
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX10-NEXT:    v_mul_lo_u32 v9, s6, v4
+; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v11
 ; GFX10-NEXT:    v_mul_lo_u32 v12, s7, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v13, s6, v1
-; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v11
-; GFX10-NEXT:    v_mul_lo_u32 v11, s6, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v8, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
+; GFX10-NEXT:    v_mul_lo_u32 v11, s6, v1
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, v6, v5
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v10, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
@@ -2136,95 +2136,95 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_mul_lo_u32 v6, v1, v8
 ; GFX10-NEXT:    v_mul_lo_u32 v7, v4, v8
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
-; GFX10-NEXT:    v_mul_hi_u32 v5, v1, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v12, s0, v2, v3, vcc_lo
-; GFX10-NEXT:    v_mul_hi_u32 v8, v4, v8
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v3
+; GFX10-NEXT:    v_mul_hi_u32 v5, v1, v8
 ; GFX10-NEXT:    v_mul_lo_u32 v14, s23, v0
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v9, v6
-; GFX10-NEXT:    v_mul_hi_u32 v15, s22, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v7, s0, v7, v11
+; GFX10-NEXT:    v_mul_hi_u32 v15, s22, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v16, s22, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v10
-; GFX10-NEXT:    v_mul_lo_u32 v13, s22, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT:    v_mul_lo_u32 v13, s22, v0
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, v7, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
-; GFX10-NEXT:    v_add3_u32 v14, v14, v16, v15
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, v9, v6
+; GFX10-NEXT:    v_add3_u32 v14, v14, v16, v15
+; GFX10-NEXT:    v_mul_hi_u32 v8, v4, v8
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v10, v12, v13
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v11, v7
-; GFX10-NEXT:    v_mul_lo_u32 v11, v0, v14
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v6
+; GFX10-NEXT:    v_mul_lo_u32 v11, v0, v14
 ; GFX10-NEXT:    v_mul_hi_u32 v9, v0, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
 ; GFX10-NEXT:    v_mul_hi_u32 v13, v12, v13
 ; GFX10-NEXT:    v_mul_lo_u32 v15, v12, v14
 ; GFX10-NEXT:    v_add_co_u32 v1, s0, v1, v5
-; GFX10-NEXT:    v_mul_hi_u32 v16, v0, v14
 ; GFX10-NEXT:    v_add3_u32 v6, v7, v6, v8
 ; GFX10-NEXT:    v_add_co_u32 v5, s1, v10, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s1
-; GFX10-NEXT:    v_mul_lo_u32 v3, s6, v1
+; GFX10-NEXT:    v_mul_hi_u32 v16, v0, v14
 ; GFX10-NEXT:    v_add_co_u32 v8, s1, v15, v13
-; GFX10-NEXT:    v_mul_lo_u32 v13, s7, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v11, s1, v4, v6, s0
 ; GFX10-NEXT:    v_add_co_u32 v5, s1, v5, v9
-; GFX10-NEXT:    v_mul_hi_u32 v15, s6, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v8, s1, v8, v16
-; GFX10-NEXT:    v_mul_lo_u32 v9, s6, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, v4, v6
+; GFX10-NEXT:    v_mul_lo_u32 v13, s7, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v7, v5
+; GFX10-NEXT:    v_mul_hi_u32 v15, s6, v1
+; GFX10-NEXT:    v_mul_lo_u32 v9, s6, v11
 ; GFX10-NEXT:    v_mul_hi_u32 v7, v12, v14
-; GFX10-NEXT:    v_mul_lo_u32 v12, v11, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, v10, v16
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_add_co_u32 v5, s1, v8, v5
-; GFX10-NEXT:    v_add3_u32 v9, v13, v9, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
+; GFX10-NEXT:    v_mul_lo_u32 v3, s6, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v4, v6
+; GFX10-NEXT:    v_add3_u32 v9, v13, v9, v15
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT:    v_add3_u32 v7, v10, v8, v7
+; GFX10-NEXT:    v_mul_lo_u32 v14, v1, v9
+; GFX10-NEXT:    v_mul_lo_u32 v12, v11, v3
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v7, vcc_lo
 ; GFX10-NEXT:    v_mul_hi_u32 v13, v1, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v11, v3
-; GFX10-NEXT:    v_mul_lo_u32 v14, v1, v9
-; GFX10-NEXT:    v_add3_u32 v7, v10, v8, v7
 ; GFX10-NEXT:    v_mul_lo_u32 v8, v11, v9
-; GFX10-NEXT:    v_mul_hi_u32 v10, v1, v9
-; GFX10-NEXT:    v_mul_hi_u32 v9, v11, v9
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v7, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_mul_hi_u32 v10, v1, v9
 ; GFX10-NEXT:    v_add_co_u32 v7, s1, v12, v14
+; GFX10-NEXT:    v_mul_hi_u32 v9, v11, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v3, s1, v8, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v8, s15, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v7, s1, v7, v13
 ; GFX10-NEXT:    v_mul_lo_u32 v14, s14, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s1
 ; GFX10-NEXT:    v_mul_hi_u32 v12, s14, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s15, v0
+; GFX10-NEXT:    v_add_co_u32 v7, s1, v7, v13
 ; GFX10-NEXT:    v_mul_lo_u32 v13, s15, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v3, s1, v3, v10
-; GFX10-NEXT:    v_mul_hi_u32 v15, s14, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v8, s1, v8, v14
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v11, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v0, s1, v13, v0
-; GFX10-NEXT:    v_mul_hi_u32 v2, s15, v2
+; GFX10-NEXT:    v_mul_hi_u32 v15, s14, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v8, s1, v8, v12
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, v5, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v11, v7
+; GFX10-NEXT:    v_mul_hi_u32 v2, s15, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, v5, v10
 ; GFX10-NEXT:    v_add_co_u32 v0, s1, v0, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, v14, v8
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, v13, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v0, s1, v0, v8
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, v13, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v3, s1, v3, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s1
@@ -2242,21 +2242,21 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_mul_lo_u32 v6, s11, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v7, s11, v1
 ; GFX10-NEXT:    v_sub_co_u32 v5, vcc_lo, s14, v5
-; GFX10-NEXT:    v_mul_lo_u32 v14, s10, v3
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v8, s15, v4
 ; GFX10-NEXT:    v_sub_co_ci_u32_e64 v4, s0, s15, v4, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v15, s11, v3
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v5
-; GFX10-NEXT:    v_mul_hi_u32 v1, s10, v1
+; GFX10-NEXT:    v_mul_lo_u32 v14, s10, v3
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v4
-; GFX10-NEXT:    v_mul_hi_u32 v17, s10, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s0
-; GFX10-NEXT:    v_mul_hi_u32 v3, s11, v3
+; GFX10-NEXT:    v_mul_lo_u32 v15, s11, v3
+; GFX10-NEXT:    v_mul_hi_u32 v1, s10, v1
+; GFX10-NEXT:    v_mul_hi_u32 v17, s10, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v5, s8
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v13, s0, 0, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v4
+; GFX10-NEXT:    v_mul_hi_u32 v3, s11, v3
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, v11, v10, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v13
@@ -2268,13 +2268,13 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v7, s0, v15, v7
 ; GFX10-NEXT:    v_add_co_u32 v1, s1, v6, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v7, s0, v7, v17
 ; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v17, s0, v0, 1
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v14, v1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v18, s0, 0, v2, s0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, v14, v1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v13
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v16, s0
@@ -2295,18 +2295,18 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v6, v10, v11, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v13, v8, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v13, v8, s0
 ; GFX10-NEXT:    v_sub_co_u32 v8, s0, s10, v16
-; GFX10-NEXT:    v_xor_b32_e32 v0, s20, v0
 ; GFX10-NEXT:    v_sub_co_ci_u32_e64 v10, s1, s11, v6, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v4, s11, v6
-; GFX10-NEXT:    v_xor_b32_e32 v1, s21, v1
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v10
-; GFX10-NEXT:    v_xor_b32_e32 v2, s12, v2
+; GFX10-NEXT:    v_xor_b32_e32 v0, s20, v0
+; GFX10-NEXT:    v_xor_b32_e32 v1, s21, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v5, s12, v5
+; GFX10-NEXT:    v_xor_b32_e32 v2, s12, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc_lo
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v4, vcc_lo, s3, v4, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v8
@@ -2314,9 +2314,9 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v8, s2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v13, s0, 0, v4, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v0, s0, v0, s20
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v4, vcc_lo, s3, v4, vcc_lo
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v1, s0, s21, v1, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v10
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v4, vcc_lo, s3, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
@@ -2331,8 +2331,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
 ; GFX10-NEXT:    v_sub_co_u32 v11, s0, v12, s2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v4, s0, 0, v4, s0
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v14, v15, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v12, v11, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc_lo
@@ -2340,8 +2340,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v10, v4, s0
-; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v5, s12
 ; GFX10-NEXT:    s_xor_b64 s[0:1], s[16:17], s[18:19]
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v5, s12
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s12, v2, vcc_lo
 ; GFX10-NEXT:    v_xor_b32_e32 v2, s0, v7
 ; GFX10-NEXT:    v_xor_b32_e32 v3, s1, v3
@@ -2448,8 +2448,8 @@ define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out0, i8 addrspace(1)* %out
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, s8, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
@@ -2482,8 +2482,8 @@ define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out0, i8 addrspace(1)* %out
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX10-NEXT:    s_xor_b32 s4, s8, s6
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s7, v1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
@@ -2523,10 +2523,10 @@ define amdgpu_kernel void @sdivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_add_i32 s1, s1, s10
 ; GFX8-NEXT:    s_xor_b32 s11, s1, s10
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s11
+; GFX8-NEXT:    s_sext_i32_i8 s0, s2
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_sext_i32_i8 s0, s2
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s11
 ; GFX8-NEXT:    s_ashr_i32 s9, s0, 31
 ; GFX8-NEXT:    s_add_i32 s0, s0, s9
 ; GFX8-NEXT:    v_mul_lo_u32 v1, s6, v0
@@ -2563,8 +2563,8 @@ define amdgpu_kernel void @sdivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s9, v2
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s9, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s11
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s9, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v3
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
@@ -2586,8 +2586,8 @@ define amdgpu_kernel void @sdivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s2, v3
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_short v[0:1], v4
 ; GFX8-NEXT:    v_and_b32_e32 v0, s0, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
@@ -2667,12 +2667,12 @@ define amdgpu_kernel void @sdivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
+; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s11, v3
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s11, v3
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
@@ -2730,8 +2730,8 @@ define amdgpu_kernel void @sdivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s1, v2
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s2, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s0
@@ -2859,8 +2859,8 @@ define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out0, i16 addrspace(1)* %
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, s8, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v2, v0, s[0:1]
@@ -2893,8 +2893,8 @@ define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out0, i16 addrspace(1)* %
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX10-NEXT:    s_xor_b32 s4, s8, s6
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s7, v1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
@@ -2940,8 +2940,8 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_add_i32 s0, s0, s11
-; GFX8-NEXT:    s_xor_b32 s12, s0, s11
 ; GFX8-NEXT:    s_xor_b32 s1, s1, s10
+; GFX8-NEXT:    s_xor_b32 s12, s0, s11
 ; GFX8-NEXT:    v_mul_lo_u32 v1, s6, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s12
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
@@ -2952,13 +2952,13 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s3
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s1, v2
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s3, v2
@@ -2976,8 +2976,8 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s10, v2
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s10, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s12
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s10, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v3
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
@@ -2995,8 +2995,8 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s2, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v0, s0, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -3026,10 +3026,10 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_ashr_i32 s10, s6, 31
 ; GFX9-NEXT:    s_add_i32 s6, s6, s10
-; GFX9-NEXT:    s_xor_b32 s6, s6, s10
+; GFX9-NEXT:    s_sub_i32 s11, 0, s8
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s11, 0, s8
+; GFX9-NEXT:    s_xor_b32 s6, s6, s10
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s9
@@ -3085,8 +3085,8 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s7, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v2, s11, v2
 ; GFX9-NEXT:    v_sub_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX9-NEXT:    v_sub_u32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v1
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, v4, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -3142,22 +3142,22 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s6, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s0, v3
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s2, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s9, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s2, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s9, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    s_xor_b32 s2, s1, s3
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
@@ -3228,8 +3228,8 @@ define amdgpu_kernel void @sdivrem_i3(i3 addrspace(1)* %out0, i3 addrspace(1)* %
 ; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s4, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v3, s8, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s8, v3
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 7, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -3275,8 +3275,8 @@ define amdgpu_kernel void @sdivrem_i3(i3 addrspace(1)* %out0, i3 addrspace(1)* %
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, s8, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
 ; GFX9-NEXT:    v_and_b32_e32 v0, 7, v1
@@ -3307,8 +3307,8 @@ define amdgpu_kernel void @sdivrem_i3(i3 addrspace(1)* %out0, i3 addrspace(1)* %
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v0, s1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s1, v1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
@@ -3427,8 +3427,8 @@ define amdgpu_kernel void @sdivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s5, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s5, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, s8, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    v_and_b32_e32 v0, s4, v1
@@ -3459,8 +3459,8 @@ define amdgpu_kernel void @sdivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v0, s1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s1, v1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index d42c290d00bae..34db791ac9b5c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -313,8 +313,8 @@ define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 ad
 ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    flat_load_dword v4, v[1:2]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -404,8 +404,8 @@ define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) {
 ; GFX7-NEXT:    s_brev_b32 s4, -4
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
 ; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], 2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -416,8 +416,8 @@ define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) {
 ; GFX8-NEXT:    s_brev_b32 s4, -4
 ; GFX8-NEXT:    v_and_b32_e32 v2, s4, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -428,8 +428,8 @@ define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) {
 ; GFX9-NEXT:    s_brev_b32 s4, -4
 ; GFX9-NEXT:    v_and_b32_e32 v2, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -487,8 +487,8 @@ define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_brev_b32 s4, -8
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v1
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
@@ -499,8 +499,8 @@ define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_brev_b32 s4, -8
-; GFX8-NEXT:    v_and_b32_e32 v2, s4, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, s4, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
@@ -511,8 +511,8 @@ define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_brev_b32 s4, -8
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, s4, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
@@ -524,12 +524,12 @@ define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_brev_b32 s4, -8
-; GFX10-NEXT:    v_and_b32_e32 v2, s4, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, s4, v1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %and = and <2 x i32> %x, <i32 536870911, i32 536870911>
   %ext = sext <2 x i32> %and to <2 x i64>
@@ -622,8 +622,8 @@ define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
 ; GFX8-LABEL: s_shl_v2i32_zext_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_movk_i32 s2, 0x3fff
-; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s3, s2
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index 454f4cf98f614..83618c1c7ede2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -812,8 +812,8 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amoun
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s3
+; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s3
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, s4
@@ -939,9 +939,9 @@ define <2 x float> @v_shl_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 ; GFX6-NEXT:    v_and_b32_e32 v4, s4, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_and_b32_e32 v4, s4, v6
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_and_b32_e32 v4, s4, v7
+; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -991,9 +991,9 @@ define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ; GFX6-NEXT:    s_and_b32 s4, s5, s8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s4
 ; GFX6-NEXT:    s_and_b32 s4, s6, s8
-; GFX6-NEXT:    s_and_b32 s1, s1, s8
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, s4
 ; GFX6-NEXT:    s_and_b32 s4, s7, s8
+; GFX6-NEXT:    s_and_b32 s1, s1, s8
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, s4
 ; GFX6-NEXT:    s_and_b32 s0, s0, s8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
@@ -1008,15 +1008,15 @@ define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_mov_b32 s6, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s6
+; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
 ; GFX8-NEXT:    s_and_b32 s2, s2, s6
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX8-NEXT:    s_lshl_b32 s2, s4, s7
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s8, s3, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s6
+; GFX8-NEXT:    s_lshr_b32 s8, s3, 16
 ; GFX8-NEXT:    s_and_b32 s3, s3, s6
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    s_lshl_b32 s2, s4, s7
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s3
 ; GFX8-NEXT:    s_lshl_b32 s3, s5, s8
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
@@ -1096,12 +1096,12 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-NEXT:    v_and_b32_e32 v8, s4, v11
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v8, v3
 ; GFX6-NEXT:    v_and_b32_e32 v8, s4, v12
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_and_b32_e32 v8, s4, v13
+; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_mov_b32_e32 v16, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, v8, v5
 ; GFX6-NEXT:    v_and_b32_e32 v8, s4, v14
-; GFX6-NEXT:    v_mov_b32_e32 v16, 0xffff
 ; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, v8, v6
@@ -1109,13 +1109,13 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, v2, v16
 ; GFX6-NEXT:    v_and_b32_e32 v2, v3, v16
-; GFX6-NEXT:    v_and_b32_e32 v3, v5, v16
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, v8, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, v5, v16
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, v4, v16
-; GFX6-NEXT:    v_and_b32_e32 v4, v7, v16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_and_b32_e32 v4, v7, v16
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_and_b32_e32 v3, v6, v16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -1175,9 +1175,9 @@ define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX6-NEXT:    s_and_b32 s8, s11, s16
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, s8
 ; GFX6-NEXT:    s_and_b32 s8, s12, s16
-; GFX6-NEXT:    s_and_b32 s1, s1, s16
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, s8
 ; GFX6-NEXT:    s_and_b32 s8, s13, s16
+; GFX6-NEXT:    s_and_b32 s1, s1, s16
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, s8
 ; GFX6-NEXT:    s_and_b32 s8, s14, s16
 ; GFX6-NEXT:    s_and_b32 s0, s0, s16
@@ -1187,13 +1187,13 @@ define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, s16
 ; GFX6-NEXT:    s_and_b32 s2, s3, s16
-; GFX6-NEXT:    s_and_b32 s3, s5, s16
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, s8
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_and_b32 s3, s5, s16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, s16
-; GFX6-NEXT:    s_and_b32 s4, s7, s16
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX6-NEXT:    s_and_b32 s4, s7, s16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_and_b32 s3, s6, s16
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
@@ -1204,35 +1204,35 @@ define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_mov_b32 s12, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s13, s4, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s12
+; GFX8-NEXT:    s_lshr_b32 s13, s4, 16
 ; GFX8-NEXT:    s_and_b32 s4, s4, s12
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX8-NEXT:    s_lshl_b32 s4, s8, s13
 ; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s14, s5, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s12
+; GFX8-NEXT:    s_lshr_b32 s14, s5, 16
 ; GFX8-NEXT:    s_and_b32 s5, s5, s12
-; GFX8-NEXT:    s_lshl_b32 s1, s1, s5
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX8-NEXT:    s_lshl_b32 s4, s8, s13
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX8-NEXT:    s_lshr_b32 s15, s6, 16
 ; GFX8-NEXT:    s_and_b32 s2, s2, s12
+; GFX8-NEXT:    s_lshr_b32 s15, s6, 16
 ; GFX8-NEXT:    s_and_b32 s6, s6, s12
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s5
 ; GFX8-NEXT:    s_lshl_b32 s5, s9, s14
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s12
-; GFX8-NEXT:    s_lshl_b32 s2, s2, s6
 ; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
-; GFX8-NEXT:    s_lshr_b32 s16, s7, 16
-; GFX8-NEXT:    s_or_b32 s0, s4, s0
 ; GFX8-NEXT:    s_and_b32 s3, s3, s12
+; GFX8-NEXT:    s_lshr_b32 s16, s7, 16
 ; GFX8-NEXT:    s_and_b32 s7, s7, s12
+; GFX8-NEXT:    s_lshl_b32 s2, s2, s6
 ; GFX8-NEXT:    s_lshl_b32 s6, s10, s15
+; GFX8-NEXT:    s_or_b32 s0, s4, s0
 ; GFX8-NEXT:    s_lshl_b32 s4, s5, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s12
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, s7
-; GFX8-NEXT:    s_or_b32 s1, s4, s1
 ; GFX8-NEXT:    s_lshl_b32 s7, s11, s16
+; GFX8-NEXT:    s_or_b32 s1, s4, s1
 ; GFX8-NEXT:    s_lshl_b32 s4, s6, 16
 ; GFX8-NEXT:    s_and_b32 s2, s2, s12
 ; GFX8-NEXT:    s_or_b32 s2, s4, s2
@@ -1255,8 +1255,8 @@ define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s5, s6, 16
-; GFX9-NEXT:    s_lshl_b32 s4, s4, s5
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, s6
+; GFX9-NEXT:    s_lshl_b32 s4, s4, s5
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX9-NEXT:    s_lshr_b32 s5, s7, 16

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 03790c0a68498..a2efc7cec7a6c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -30,26 +30,26 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v6
 ; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v5, v6, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v1
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v2
 ; CHECK-NEXT:    v_trunc_f32_e32 v5, v5
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v5
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v1
 ; CHECK-NEXT:    v_subb_u32_e32 v8, vcc, 0, v0, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v6
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v8, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v7, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v2
-; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v6
+; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v6
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v11
 ; CHECK-NEXT:    v_mul_lo_u32 v12, v2, v9
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v2, v11
 ; CHECK-NEXT:    v_mul_hi_u32 v11, v5, v11
+; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v6
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
@@ -57,12 +57,12 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v9
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v2, v9
-; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
@@ -75,10 +75,10 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v7, v2
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v12
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v10, v12
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v2, v7
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v12
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
@@ -87,12 +87,12 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v10, v7
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
 ; CHECK-NEXT:    v_mul_hi_u32 v11, v2, v7
-; CHECK-NEXT:    v_mul_hi_u32 v7, v10, v7
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
+; CHECK-NEXT:    v_mul_hi_u32 v7, v10, v7
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
@@ -111,12 +111,12 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -203,8 +203,8 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_cmp_ne_u64_e64 vcc, s[6:7], 0
 ; CHECK-NEXT:    s_cbranch_vccz BB1_2
 ; CHECK-NEXT:  ; %bb.1:
-; CHECK-NEXT:    s_ashr_i32 s0, s5, 31
 ; CHECK-NEXT:    s_ashr_i32 s6, s3, 31
+; CHECK-NEXT:    s_ashr_i32 s0, s5, 31
 ; CHECK-NEXT:    s_add_u32 s8, s2, s6
 ; CHECK-NEXT:    s_cselect_b32 s7, 1, 0
 ; CHECK-NEXT:    s_and_b32 s7, s7, 1
@@ -231,12 +231,12 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; CHECK-NEXT:    v_trunc_f32_e32 v1, v1
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; CHECK-NEXT:    s_subb_u32 s5, 0, s11
 ; CHECK-NEXT:    v_mov_b32_e32 v6, s11
-; CHECK-NEXT:    v_mul_lo_u32 v2, s5, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v3, s3, v1
+; CHECK-NEXT:    v_mul_lo_u32 v2, s5, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v5, s3, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v4, s3, v0
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
@@ -252,12 +252,12 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v2
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v5, v0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
@@ -282,12 +282,12 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_mul_lo_u32 v5, v3, v4
 ; CHECK-NEXT:    v_add_i32_e64 v2, s[0:1], v8, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v4
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[0:1], v5, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[0:1], v5, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[0:1], v7, v8
+; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v4
 ; CHECK-NEXT:    v_add_i32_e64 v2, s[0:1], v5, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_add_i32_e64 v4, s[0:1], v7, v5
@@ -307,12 +307,12 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_mul_lo_u32 v5, s9, v1
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v3, s8, v1
-; CHECK-NEXT:    v_mul_hi_u32 v1, s9, v1
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT:    v_mul_hi_u32 v1, s9, v1
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
@@ -398,35 +398,35 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v5, v5, v8
 ; GISEL-NEXT:    v_xor_b32_e32 v4, v4, v8
+; GISEL-NEXT:    v_xor_b32_e32 v5, v5, v8
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v4
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v5
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v10, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v9
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v8
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v10, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v4
-; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v10
 ; GISEL-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
 ; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v8
 ; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
 ; GISEL-NEXT:    v_mac_f32_e32 v8, 0xcf800000, v9
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v10
+; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v12, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v11, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v11, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v8
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v10
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v15
 ; GISEL-NEXT:    v_mul_lo_u32 v16, v8, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v8, v15
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v15
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v10
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
@@ -434,12 +434,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v8, v13
-; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
@@ -452,10 +452,10 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v11, v8
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v16
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v14, v16
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v11
+; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v16
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v14, v16
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
@@ -464,12 +464,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v14, v11
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v14, v11
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
+; GISEL-NEXT:    v_mul_hi_u32 v11, v14, v11
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
@@ -488,12 +488,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v1, v9
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
@@ -520,9 +520,9 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v4
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v11, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v5
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v11, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
@@ -540,31 +540,29 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v4
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, 0, v5
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v4, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v10
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
 ; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
 ; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
 ; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v8
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v8
+; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v4, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v6
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v10
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v10
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v14
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v6, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v14
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v8
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
@@ -572,12 +570,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v16, v7, v12
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v12
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v12
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
@@ -590,11 +588,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v9, v6
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v12
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v15
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v15
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v6, v9
+; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v15
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v13, v15
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v8
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
@@ -602,12 +601,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v13, v9
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v13, v9
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
+; GISEL-NEXT:    v_mul_hi_u32 v9, v13, v9
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
@@ -615,25 +614,26 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
 ; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v2, v7
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v10, vcc
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v3, v7
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_mul_hi_u32 v7, v3, v7
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -704,26 +704,26 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v10, v4
 ; CGP-NEXT:    v_addc_u32_e32 v5, vcc, v11, v4, vcc
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CGP-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v2
 ; CGP-NEXT:    v_trunc_f32_e32 v10, v10
 ; CGP-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v10
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v10, v10
+; CGP-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
 ; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v4
 ; CGP-NEXT:    v_mul_lo_u32 v13, v12, v2
 ; CGP-NEXT:    v_mul_lo_u32 v14, v11, v10
 ; CGP-NEXT:    v_mul_hi_u32 v16, v11, v2
 ; CGP-NEXT:    v_mul_lo_u32 v15, v11, v2
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v4
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v4
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; CGP-NEXT:    v_mul_lo_u32 v14, v10, v15
 ; CGP-NEXT:    v_mul_lo_u32 v16, v2, v13
 ; CGP-NEXT:    v_mul_hi_u32 v17, v2, v15
 ; CGP-NEXT:    v_mul_hi_u32 v15, v10, v15
+; CGP-NEXT:    v_xor_b32_e32 v5, v5, v4
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
@@ -731,12 +731,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v17, v10, v13
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; CGP-NEXT:    v_mul_hi_u32 v16, v2, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
@@ -749,10 +749,10 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_hi_u32 v11, v11, v2
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT:    v_mul_hi_u32 v13, v2, v16
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; CGP-NEXT:    v_mul_lo_u32 v12, v14, v16
 ; CGP-NEXT:    v_mul_lo_u32 v15, v2, v11
+; CGP-NEXT:    v_mul_hi_u32 v13, v2, v16
 ; CGP-NEXT:    v_mul_hi_u32 v16, v14, v16
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
@@ -761,12 +761,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v13, v14, v11
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
 ; CGP-NEXT:    v_mul_hi_u32 v15, v2, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v14, v11
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
+; CGP-NEXT:    v_mul_hi_u32 v11, v14, v11
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
@@ -785,12 +785,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v13, v5, v10
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; CGP-NEXT:    v_mul_hi_u32 v12, v3, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v5, v10
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v13, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_hi_u32 v10, v5, v10
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
@@ -879,26 +879,26 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v6
 ; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v9, v6, vcc
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
 ; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v4
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
 ; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v2, vcc
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v6
 ; CGP-NEXT:    v_mul_lo_u32 v11, v10, v4
 ; CGP-NEXT:    v_mul_lo_u32 v12, v9, v8
 ; CGP-NEXT:    v_mul_hi_u32 v14, v9, v4
 ; CGP-NEXT:    v_mul_lo_u32 v13, v9, v4
-; CGP-NEXT:    v_xor_b32_e32 v7, v7, v6
+; CGP-NEXT:    v_xor_b32_e32 v5, v5, v6
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
 ; CGP-NEXT:    v_mul_lo_u32 v12, v8, v13
 ; CGP-NEXT:    v_mul_lo_u32 v14, v4, v11
 ; CGP-NEXT:    v_mul_hi_u32 v15, v4, v13
 ; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
+; CGP-NEXT:    v_xor_b32_e32 v7, v7, v6
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
@@ -906,12 +906,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v15, v8, v11
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
 ; CGP-NEXT:    v_mul_hi_u32 v14, v4, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
@@ -924,10 +924,10 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_hi_u32 v9, v9, v4
 ; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT:    v_mul_hi_u32 v11, v4, v14
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
 ; CGP-NEXT:    v_mul_lo_u32 v10, v12, v14
 ; CGP-NEXT:    v_mul_lo_u32 v13, v4, v9
+; CGP-NEXT:    v_mul_hi_u32 v11, v4, v14
 ; CGP-NEXT:    v_mul_hi_u32 v14, v12, v14
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
@@ -936,12 +936,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v11, v12, v9
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v13, v10
 ; CGP-NEXT:    v_mul_hi_u32 v13, v4, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v12, v9
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
+; CGP-NEXT:    v_mul_hi_u32 v9, v12, v9
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
@@ -960,12 +960,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v11, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CGP-NEXT:    v_mul_hi_u32 v10, v5, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -1051,19 +1051,19 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
 ; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
-; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v4
 ; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v2
+; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
@@ -1077,12 +1077,12 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -1108,12 +1108,12 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v7
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v7
-; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v7
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
+; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v7
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v9, v8
@@ -1132,12 +1132,12 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
@@ -1147,13 +1147,13 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v2, s6, v2
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT:    v_mov_b32_e32 v5, s7
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
 ; CHECK-NEXT:    v_subb_u32_e64 v4, s[4:5], v1, v2, vcc
 ; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v2
 ; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v5, s7
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[4:5]
@@ -1190,8 +1190,8 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_add_u32 s4, s10, 0
 ; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GISEL-NEXT:    s_and_b32 s5, s5, 1
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    s_mov_b32 s6, 0
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    s_mov_b32 s7, s6
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
@@ -1212,19 +1212,18 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s11, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v10, s11, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s11, v4
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
@@ -1232,12 +1231,12 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -1255,6 +1254,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
@@ -1262,12 +1262,12 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
@@ -1287,12 +1287,12 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -1326,14 +1326,14 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GISEL-NEXT:    v_subrev_i32_e32 v9, vcc, s8, v7
 ; GISEL-NEXT:    s_and_b32 s5, s5, 1
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
-; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s7
@@ -1376,12 +1376,12 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -1407,12 +1407,12 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
@@ -1432,12 +1432,12 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -1461,12 +1461,12 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_subrev_i32_e32 v7, vcc, s6, v2
 ; GISEL-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v8
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v7
-; GISEL-NEXT:    v_subrev_i32_e32 v9, vcc, s6, v7
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s7, v8
+; GISEL-NEXT:    v_subrev_i32_e32 v9, vcc, s6, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
@@ -1493,27 +1493,26 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
 ; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v5
 ; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v5
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT:    s_movk_i32 s7, 0x1000
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v7
 ; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, s6, v7
 ; CGP-NEXT:    v_mul_lo_u32 v11, s6, v7
-; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
+; CGP-NEXT:    s_movk_i32 s7, 0x1000
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_mul_lo_u32 v10, v8, v11
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v9
 ; CGP-NEXT:    v_mul_hi_u32 v13, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
@@ -1521,12 +1520,12 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v13, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CGP-NEXT:    v_mul_hi_u32 v12, v7, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
@@ -1544,7 +1543,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v14, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v9, v7, v13
 ; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
@@ -1552,12 +1551,12 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v12, v10, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v14, v9
 ; CGP-NEXT:    v_mul_hi_u32 v14, v7, v11
-; CGP-NEXT:    v_mul_hi_u32 v10, v10, v11
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
+; CGP-NEXT:    v_mul_hi_u32 v10, v10, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v12
@@ -1569,7 +1568,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v10, v0, v8
 ; CGP-NEXT:    v_mul_hi_u32 v11, v0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
@@ -1577,12 +1576,12 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v11, v1, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CGP-NEXT:    v_mul_hi_u32 v10, v0, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v1, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_mul_hi_u32 v8, v1, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -1591,15 +1590,15 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v8, s7, v8
 ; CGP-NEXT:    v_mul_lo_u32 v10, s7, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, s7, v7
-; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
+; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT:    v_mov_b32_e32 v9, s8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
 ; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v1, v7, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v7
 ; CGP-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CGP-NEXT:    v_mov_b32_e32 v9, s8
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
 ; CGP-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[4:5]
@@ -1627,6 +1626,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v8, -1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v9, s6, v7
 ; CGP-NEXT:    v_mul_hi_u32 v11, s6, v4
+; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
 ; CGP-NEXT:    v_mul_lo_u32 v10, s6, v4
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
@@ -1644,12 +1644,12 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; CGP-NEXT:    v_mul_hi_u32 v11, v4, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
@@ -1675,40 +1675,40 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v11, v9, v10
 ; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v13, v8
 ; CGP-NEXT:    v_mul_hi_u32 v13, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v9, v9, v10
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
+; CGP-NEXT:    v_mul_hi_u32 v9, v9, v10
 ; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
 ; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; CGP-NEXT:    v_mul_lo_u32 v8, v3, v4
 ; CGP-NEXT:    v_mul_lo_u32 v9, v2, v7
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
 ; CGP-NEXT:    v_mul_hi_u32 v5, v2, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v8, v3, v7
+; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
 ; CGP-NEXT:    v_mul_hi_u32 v9, v2, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
@@ -1719,13 +1719,13 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_hi_u32 v4, s7, v4
 ; CGP-NEXT:    s_bfe_i32 s6, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT:    v_mov_b32_e32 v7, s6
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
 ; CGP-NEXT:    v_subb_u32_e64 v5, s[4:5], v3, v4, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v4
 ; CGP-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; CGP-NEXT:    v_mov_b32_e32 v7, s6
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
 ; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[4:5]
@@ -1766,19 +1766,19 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
 ; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
-; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v4
 ; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v2
+; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
@@ -1792,12 +1792,12 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -1823,12 +1823,12 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v7
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v7
-; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v7
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
+; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v7
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v9, v8
@@ -1847,12 +1847,12 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
@@ -1862,13 +1862,13 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v2, s6, v2
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT:    v_mov_b32_e32 v5, s7
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
 ; CHECK-NEXT:    v_subb_u32_e64 v4, s[4:5], v1, v2, vcc
 ; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v2
 ; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v5, s7
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[4:5]
@@ -1905,8 +1905,8 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_add_u32 s4, s10, 0
 ; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GISEL-NEXT:    s_and_b32 s5, s5, 1
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    s_mov_b32 s6, 0
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    s_mov_b32 s7, s6
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
@@ -1927,19 +1927,18 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s11, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v10, s11, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s11, v4
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
@@ -1947,12 +1946,12 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -1970,6 +1969,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
@@ -1977,12 +1977,12 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
@@ -2002,12 +2002,12 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -2041,14 +2041,14 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GISEL-NEXT:    v_subrev_i32_e32 v9, vcc, s8, v7
 ; GISEL-NEXT:    s_and_b32 s5, s5, 1
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
-; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s7
@@ -2091,12 +2091,12 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -2122,12 +2122,12 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
@@ -2147,12 +2147,12 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -2176,12 +2176,12 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_subrev_i32_e32 v7, vcc, s6, v2
 ; GISEL-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v8
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v7
-; GISEL-NEXT:    v_subrev_i32_e32 v9, vcc, s6, v7
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s7, v8
+; GISEL-NEXT:    v_subrev_i32_e32 v9, vcc, s6, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
@@ -2208,27 +2208,26 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
 ; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v5
 ; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v5
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT:    s_mov_b32 s7, 0x12d8fb
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v7
 ; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, s6, v7
 ; CGP-NEXT:    v_mul_lo_u32 v11, s6, v7
-; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
+; CGP-NEXT:    s_mov_b32 s7, 0x12d8fb
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_mul_lo_u32 v10, v8, v11
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v9
 ; CGP-NEXT:    v_mul_hi_u32 v13, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
@@ -2236,12 +2235,12 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v13, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CGP-NEXT:    v_mul_hi_u32 v12, v7, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
@@ -2259,7 +2258,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v14, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v9, v7, v13
 ; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
@@ -2267,12 +2266,12 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v12, v10, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v14, v9
 ; CGP-NEXT:    v_mul_hi_u32 v14, v7, v11
-; CGP-NEXT:    v_mul_hi_u32 v10, v10, v11
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
+; CGP-NEXT:    v_mul_hi_u32 v10, v10, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v12
@@ -2284,7 +2283,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v10, v0, v8
 ; CGP-NEXT:    v_mul_hi_u32 v11, v0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
@@ -2292,12 +2291,12 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v11, v1, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CGP-NEXT:    v_mul_hi_u32 v10, v0, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v1, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_mul_hi_u32 v8, v1, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -2306,15 +2305,15 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v8, s7, v8
 ; CGP-NEXT:    v_mul_lo_u32 v10, s7, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, s7, v7
-; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
+; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT:    v_mov_b32_e32 v9, s8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
 ; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v1, v7, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v7
 ; CGP-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CGP-NEXT:    v_mov_b32_e32 v9, s8
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
 ; CGP-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[4:5]
@@ -2342,6 +2341,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v8, -1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v9, s6, v7
 ; CGP-NEXT:    v_mul_hi_u32 v11, s6, v4
+; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
 ; CGP-NEXT:    v_mul_lo_u32 v10, s6, v4
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
@@ -2359,12 +2359,12 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; CGP-NEXT:    v_mul_hi_u32 v11, v4, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
@@ -2390,40 +2390,40 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_lo_u32 v11, v9, v10
 ; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v13, v8
 ; CGP-NEXT:    v_mul_hi_u32 v13, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v9, v9, v10
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
+; CGP-NEXT:    v_mul_hi_u32 v9, v9, v10
 ; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
 ; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; CGP-NEXT:    v_mul_lo_u32 v8, v3, v4
 ; CGP-NEXT:    v_mul_lo_u32 v9, v2, v7
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
 ; CGP-NEXT:    v_mul_hi_u32 v5, v2, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v8, v3, v7
+; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
 ; CGP-NEXT:    v_mul_hi_u32 v9, v2, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
@@ -2434,13 +2434,13 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_hi_u32 v4, s7, v4
 ; CGP-NEXT:    s_bfe_i32 s6, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT:    v_mov_b32_e32 v7, s6
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
 ; CGP-NEXT:    v_subb_u32_e64 v5, s[4:5], v3, v4, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v4
 ; CGP-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; CGP-NEXT:    v_mov_b32_e32 v7, s6
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
 ; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[4:5]
@@ -2494,29 +2494,29 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v5, v0
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v6, 31, v4
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v5
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
 ; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v1
-; CHECK-NEXT:    v_subb_u32_e32 v8, vcc, 0, v0, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v6
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v2
 ; CHECK-NEXT:    v_trunc_f32_e32 v5, v5
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v5
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v6
+; CHECK-NEXT:    v_subb_u32_e32 v8, vcc, 0, v0, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v8, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v7, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v2
+; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v6
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v11
 ; CHECK-NEXT:    v_mul_lo_u32 v12, v2, v9
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v2, v11
 ; CHECK-NEXT:    v_mul_hi_u32 v11, v5, v11
+; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v6
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
@@ -2524,12 +2524,12 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v9
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v2, v9
-; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
@@ -2542,10 +2542,10 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v7, v2
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v12
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v10, v12
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v2, v7
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v12
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
@@ -2554,12 +2554,12 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v10, v7
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
 ; CHECK-NEXT:    v_mul_hi_u32 v11, v2, v7
-; CHECK-NEXT:    v_mul_hi_u32 v7, v10, v7
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
+; CHECK-NEXT:    v_mul_hi_u32 v7, v10, v7
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
@@ -2578,12 +2578,12 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -2668,28 +2668,28 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
 ; GISEL-NEXT:    v_xor_b32_e32 v4, v4, v7
+; GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v4
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v4
 ; GISEL-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v8
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; GISEL-NEXT:    v_xor_b32_e32 v8, v0, v9
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v16, v1, v9
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v7
 ; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v0
 ; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
 ; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v7
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v0
+; GISEL-NEXT:    v_xor_b32_e32 v16, v1, v9
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v14
@@ -2703,12 +2703,12 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v12
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v15, v1
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v0, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v12
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v12
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
@@ -2721,10 +2721,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v0
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v12
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v14
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v14
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v0, v10
+; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v14
 ; GISEL-NEXT:    v_mul_hi_u32 v14, v1, v14
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
@@ -2733,12 +2733,12 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v1, v10
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v10
-; GISEL-NEXT:    v_mul_hi_u32 v1, v1, v10
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
+; GISEL-NEXT:    v_mul_hi_u32 v1, v1, v10
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v13, v12
@@ -2758,12 +2758,12 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v16, v10
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v16, v10
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_mul_hi_u32 v10, v16, v10
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
@@ -2800,8 +2800,8 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v5, v12, v5, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v7, v0, v6
@@ -2815,8 +2815,8 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v2, v8
 ; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v3, v8, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v3, v1, v8
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GISEL-NEXT:    v_xor_b32_e32 v3, v1, v8
 ; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GISEL-NEXT:    v_trunc_f32_e32 v1, v1
 ; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
@@ -2842,12 +2842,12 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_lo_u32 v16, v1, v12
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v0, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v1, v12
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; GISEL-NEXT:    v_mul_hi_u32 v12, v1, v12
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
@@ -2860,10 +2860,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v0
 ; GISEL-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v12
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v15
 ; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v15
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v0, v10
+; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v15
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v13, v15
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
@@ -2872,12 +2872,12 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v13, v10
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v14, v0, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v13, v10
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
+; GISEL-NEXT:    v_mul_hi_u32 v10, v13, v10
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
@@ -2958,8 +2958,8 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mov_b32_e32 v8, v0
 ; CGP-NEXT:    v_or_b32_e32 v1, v9, v3
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
-; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    v_lshl_b64 v[10:11], s[4:5], v6
+; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
@@ -2977,26 +2977,26 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v8, v4
 ; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v9, v4, vcc
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v1
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v2
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v1
 ; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v4
 ; CGP-NEXT:    v_mul_lo_u32 v13, v12, v2
 ; CGP-NEXT:    v_mul_lo_u32 v14, v9, v8
 ; CGP-NEXT:    v_mul_hi_u32 v16, v9, v2
 ; CGP-NEXT:    v_mul_lo_u32 v15, v9, v2
-; CGP-NEXT:    v_xor_b32_e32 v6, v6, v4
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v4
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; CGP-NEXT:    v_mul_lo_u32 v14, v8, v15
 ; CGP-NEXT:    v_mul_lo_u32 v16, v2, v13
 ; CGP-NEXT:    v_mul_hi_u32 v17, v2, v15
 ; CGP-NEXT:    v_mul_hi_u32 v15, v8, v15
+; CGP-NEXT:    v_xor_b32_e32 v6, v6, v4
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
@@ -3004,12 +3004,12 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_lo_u32 v17, v8, v13
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; CGP-NEXT:    v_mul_hi_u32 v16, v2, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
@@ -3022,10 +3022,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_hi_u32 v9, v9, v2
 ; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v13
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT:    v_mul_hi_u32 v13, v2, v16
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
 ; CGP-NEXT:    v_mul_lo_u32 v12, v14, v16
 ; CGP-NEXT:    v_mul_lo_u32 v15, v2, v9
+; CGP-NEXT:    v_mul_hi_u32 v13, v2, v16
 ; CGP-NEXT:    v_mul_hi_u32 v16, v14, v16
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
@@ -3034,12 +3034,12 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_lo_u32 v13, v14, v9
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
 ; CGP-NEXT:    v_mul_hi_u32 v15, v2, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v14, v9
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
+; CGP-NEXT:    v_mul_hi_u32 v9, v14, v9
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
@@ -3058,12 +3058,12 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_lo_u32 v13, v6, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
 ; CGP-NEXT:    v_mul_hi_u32 v12, v3, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v6, v8
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v13, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_hi_u32 v8, v6, v8
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
@@ -3149,29 +3149,29 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v2
 ; CGP-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v8
 ; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v7, v8, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
-; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v2, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v4
 ; CGP-NEXT:    v_trunc_f32_e32 v7, v7
 ; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_xor_b32_e32 v6, v6, v8
+; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v2, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v11, v10, v4
 ; CGP-NEXT:    v_mul_lo_u32 v12, v9, v7
 ; CGP-NEXT:    v_mul_hi_u32 v14, v9, v4
 ; CGP-NEXT:    v_mul_lo_u32 v13, v9, v4
+; CGP-NEXT:    v_xor_b32_e32 v5, v5, v8
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v13
 ; CGP-NEXT:    v_mul_lo_u32 v14, v4, v11
 ; CGP-NEXT:    v_mul_hi_u32 v15, v4, v13
 ; CGP-NEXT:    v_mul_hi_u32 v13, v7, v13
+; CGP-NEXT:    v_xor_b32_e32 v6, v6, v8
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
@@ -3179,12 +3179,12 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_lo_u32 v15, v7, v11
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
 ; CGP-NEXT:    v_mul_hi_u32 v14, v4, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v7, v11
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_mul_hi_u32 v11, v7, v11
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
@@ -3197,10 +3197,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_hi_u32 v9, v9, v4
 ; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v11
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT:    v_mul_hi_u32 v11, v4, v14
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
 ; CGP-NEXT:    v_mul_lo_u32 v10, v12, v14
 ; CGP-NEXT:    v_mul_lo_u32 v13, v4, v9
+; CGP-NEXT:    v_mul_hi_u32 v11, v4, v14
 ; CGP-NEXT:    v_mul_hi_u32 v14, v12, v14
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
@@ -3209,12 +3209,12 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_lo_u32 v11, v12, v9
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v13, v10
 ; CGP-NEXT:    v_mul_hi_u32 v13, v4, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v12, v9
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
+; CGP-NEXT:    v_mul_hi_u32 v9, v12, v9
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
@@ -3233,12 +3233,12 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_lo_u32 v11, v6, v7
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CGP-NEXT:    v_mul_hi_u32 v10, v5, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -3271,8 +3271,8 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; CGP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
@@ -3379,9 +3379,9 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v1
 ; GISEL-NEXT:    v_subb_u32_e32 v8, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GISEL-NEXT:    v_and_b32_e32 v6, s6, v6
 ; GISEL-NEXT:    v_and_b32_e32 v2, s6, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
@@ -3409,12 +3409,12 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v11
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
@@ -3427,10 +3427,10 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v4
 ; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v12
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v10, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
@@ -3439,12 +3439,12 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v7
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v7
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
+; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v7
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
@@ -3463,12 +3463,12 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v13, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v13, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_mul_hi_u32 v5, v13, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -3539,12 +3539,12 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v11
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v3, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
@@ -3557,10 +3557,10 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v3
 ; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v12
 ; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v10, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v3, v7
+; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
@@ -3569,12 +3569,12 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v7
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v3, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v7
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
+; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v7
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
@@ -3595,12 +3595,12 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v3, v13, v3
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v13, v5
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_mul_hi_u32 v5, v13, v5
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -3627,9 +3627,9 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v4
 ; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v8, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v6
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v8, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
@@ -3665,13 +3665,13 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cvt_f32_i32_e32 v3, v2
 ; CGP-NEXT:    v_rcp_f32_e32 v5, v4
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; CGP-NEXT:    v_mul_f32_e32 v1, v3, v5
 ; CGP-NEXT:    v_trunc_f32_e32 v1, v1
 ; CGP-NEXT:    v_mad_f32 v3, -v1, v4, v3
 ; CGP-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; CGP-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v4|
 ; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
+; CGP-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; CGP-NEXT:    v_mul_lo_u32 v3, v1, v6
 ; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 77ef4c2918e6a..b47fa630fb5f5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -10,9 +10,9 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
-; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
+; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
@@ -25,9 +25,9 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
 ; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
-; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
 ; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x7fff, v2
+; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
 ; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x8000, v3
 ; GFX8-NEXT:    v_max_i16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
@@ -62,9 +62,9 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 25
 ; GFX6-NEXT:    s_max_i32 s2, s0, -1
-; GFX6-NEXT:    s_min_i32 s3, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 25
 ; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s3, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s1, s2, s1
 ; GFX6-NEXT:    s_min_i32 s1, s1, s3
@@ -124,9 +124,9 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
-; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
+; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
@@ -139,9 +139,9 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
-; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x7fff, v2
+; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
 ; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x8000, v3
 ; GFX8-NEXT:    v_max_i16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
@@ -176,9 +176,9 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
 ; GFX6-NEXT:    s_max_i32 s2, s0, -1
-; GFX6-NEXT:    s_min_i32 s3, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s3, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s1, s2, s1
 ; GFX6-NEXT:    s_min_i32 s1, s1, s3
@@ -242,8 +242,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
 ; GFX6-NEXT:    s_brev_b32 s5, 1
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v5
 ; GFX6-NEXT:    v_max_i32_e32 v1, v4, v1
@@ -252,8 +252,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v4
 ; GFX6-NEXT:    v_max_i32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
@@ -277,21 +277,21 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_max_i16_e32 v4, -1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v4, s4, v4
 ; GFX8-NEXT:    s_movk_i32 s5, 0x8000
+; GFX8-NEXT:    v_subrev_u16_e32 v4, s4, v4
 ; GFX8-NEXT:    v_min_i16_e32 v5, -1, v0
-; GFX8-NEXT:    v_max_i16_e32 v1, v4, v1
 ; GFX8-NEXT:    v_subrev_u16_e32 v5, s5, v5
+; GFX8-NEXT:    v_max_i16_e32 v1, v4, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v5
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
 ; GFX8-NEXT:    v_max_i16_e32 v1, -1, v3
 ; GFX8-NEXT:    v_subrev_u16_e32 v1, s4, v1
 ; GFX8-NEXT:    v_min_i16_e32 v4, -1, v3
-; GFX8-NEXT:    v_max_i16_e32 v1, v1, v2
 ; GFX8-NEXT:    v_subrev_u16_e32 v4, s5, v4
+; GFX8-NEXT:    v_max_i16_e32 v1, v1, v2
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -349,8 +349,8 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_max_i32 s6, s0, -1
 ; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_sub_i32 s6, s6, s4
 ; GFX6-NEXT:    s_brev_b32 s5, 1
+; GFX6-NEXT:    s_sub_i32 s6, s6, s4
 ; GFX6-NEXT:    s_min_i32 s7, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s5
 ; GFX6-NEXT:    s_max_i32 s1, s6, s1
@@ -381,11 +381,11 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s8, -1
-; GFX8-NEXT:    s_max_i32 s9, s7, s8
 ; GFX8-NEXT:    s_movk_i32 s5, 0x7fff
-; GFX8-NEXT:    s_sub_i32 s9, s9, s5
+; GFX8-NEXT:    s_max_i32 s9, s7, s8
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
+; GFX8-NEXT:    s_sub_i32 s9, s9, s5
 ; GFX8-NEXT:    s_movk_i32 s6, 0x8000
 ; GFX8-NEXT:    s_min_i32 s7, s7, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
@@ -427,8 +427,8 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX9-NEXT:    s_mov_b32 s2, 0x80008
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
@@ -451,8 +451,8 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX10-NEXT:    s_mov_b32 s2, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
@@ -488,8 +488,8 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s4, v8
 ; GFX6-NEXT:    s_brev_b32 s5, 1
+; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s4, v8
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, s5, v10
 ; GFX6-NEXT:    v_max_i32_e32 v1, v8, v1
@@ -508,8 +508,8 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
-; GFX6-NEXT:    v_min_i32_e32 v6, -1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
+; GFX6-NEXT:    v_min_i32_e32 v6, -1, v2
 ; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s5, v6
 ; GFX6-NEXT:    v_max_i32_e32 v3, v5, v3
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v6
@@ -517,24 +517,24 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v3
 ; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
-; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
-; GFX6-NEXT:    s_movk_i32 s4, 0xff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
+; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
+; GFX6-NEXT:    s_movk_i32 s4, 0xff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v11
 ; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
-; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
+; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
+; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
@@ -555,35 +555,35 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v8, s4, v8
 ; GFX8-NEXT:    s_movk_i32 s5, 0x8000
+; GFX8-NEXT:    v_subrev_u16_e32 v8, s4, v8
 ; GFX8-NEXT:    v_min_i16_e32 v10, -1, v0
-; GFX8-NEXT:    v_max_i16_e32 v1, v8, v1
 ; GFX8-NEXT:    v_subrev_u16_e32 v10, s5, v10
+; GFX8-NEXT:    v_max_i16_e32 v1, v8, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v10
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
 ; GFX8-NEXT:    v_max_i16_e32 v1, -1, v3
 ; GFX8-NEXT:    v_subrev_u16_e32 v1, s4, v1
 ; GFX8-NEXT:    v_min_i16_e32 v8, -1, v3
+; GFX8-NEXT:    v_subrev_u16_e32 v8, s5, v8
 ; GFX8-NEXT:    v_max_i16_e32 v1, v1, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
-; GFX8-NEXT:    v_subrev_u16_e32 v8, s5, v8
-; GFX8-NEXT:    v_min_i16_e32 v1, v1, v8
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0x7fff
+; GFX8-NEXT:    v_min_i16_e32 v1, v1, v8
 ; GFX8-NEXT:    v_max_i16_e32 v4, -1, v2
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v3, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v6
-; GFX8-NEXT:    v_min_i16_e32 v6, -1, v2
 ; GFX8-NEXT:    v_sub_u16_e32 v4, v4, v9
-; GFX8-NEXT:    v_max_i16_e32 v3, v4, v3
+; GFX8-NEXT:    v_min_i16_e32 v6, -1, v2
 ; GFX8-NEXT:    v_subrev_u16_e32 v6, s5, v6
+; GFX8-NEXT:    v_max_i16_e32 v3, v4, v3
 ; GFX8-NEXT:    v_min_i16_e32 v3, v3, v6
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v2, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v5
 ; GFX8-NEXT:    v_max_i16_e32 v5, -1, v3
-; GFX8-NEXT:    v_min_i16_e32 v6, -1, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
 ; GFX8-NEXT:    v_sub_u16_e32 v5, v5, v9
+; GFX8-NEXT:    v_min_i16_e32 v6, -1, v3
 ; GFX8-NEXT:    v_subrev_u16_e32 v6, 0x8000, v6
 ; GFX8-NEXT:    v_max_i16_e32 v4, v5, v4
 ; GFX8-NEXT:    v_min_i16_e32 v4, v4, v6
@@ -607,20 +607,20 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0xffff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v8, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX9-NEXT:    v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX9-NEXT:    v_and_or_b32 v2, v3, v8, v2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v8, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-NEXT:    v_and_or_b32 v2, v3, v8, v2
 ; GFX9-NEXT:    v_and_or_b32 v1, v1, v8, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX9-NEXT:    v_and_or_b32 v3, v6, v8, v3
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_and_or_b32 v3, v6, v8, v3
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v2, v3 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 8
@@ -691,8 +691,8 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
-; GFX6-NEXT:    s_sub_i32 s10, s10, s8
 ; GFX6-NEXT:    s_brev_b32 s9, 1
+; GFX6-NEXT:    s_sub_i32 s10, s10, s8
 ; GFX6-NEXT:    s_min_i32 s11, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s11, s11, s9
 ; GFX6-NEXT:    s_max_i32 s1, s10, s1
@@ -701,8 +701,8 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 24
 ; GFX6-NEXT:    s_max_i32 s5, s1, -1
-; GFX6-NEXT:    s_min_i32 s10, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s8
+; GFX6-NEXT:    s_min_i32 s10, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s9
 ; GFX6-NEXT:    s_max_i32 s2, s5, s2
 ; GFX6-NEXT:    s_min_i32 s2, s2, s10
@@ -710,22 +710,22 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
 ; GFX6-NEXT:    s_max_i32 s5, s2, -1
 ; GFX6-NEXT:    s_lshl_b32 s3, s6, 24
-; GFX6-NEXT:    s_min_i32 s6, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s8
+; GFX6-NEXT:    s_min_i32 s6, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s9
 ; GFX6-NEXT:    s_max_i32 s3, s5, s3
 ; GFX6-NEXT:    s_min_i32 s3, s3, s6
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s4, 24
 ; GFX6-NEXT:    s_max_i32 s5, s3, -1
-; GFX6-NEXT:    s_min_i32 s6, s3, -1
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s8
+; GFX6-NEXT:    s_min_i32 s6, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s9
 ; GFX6-NEXT:    s_max_i32 s4, s5, s4
 ; GFX6-NEXT:    s_min_i32 s4, s4, s6
-; GFX6-NEXT:    s_sub_i32 s3, s3, s4
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
+; GFX6-NEXT:    s_sub_i32 s3, s3, s4
 ; GFX6-NEXT:    s_movk_i32 s4, 0xff
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX6-NEXT:    s_and_b32 s1, s1, s4
@@ -734,8 +734,8 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, s4
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 24
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s3, s4
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
@@ -751,13 +751,13 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s11, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s12, -1
-; GFX8-NEXT:    s_max_i32 s13, s11, s12
 ; GFX8-NEXT:    s_movk_i32 s9, 0x7fff
-; GFX8-NEXT:    s_sub_i32 s13, s13, s9
+; GFX8-NEXT:    s_max_i32 s13, s11, s12
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s8
+; GFX8-NEXT:    s_sub_i32 s13, s13, s9
 ; GFX8-NEXT:    s_movk_i32 s10, 0x8000
 ; GFX8-NEXT:    s_min_i32 s11, s11, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s13, s13
@@ -810,9 +810,9 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
-; GFX8-NEXT:    s_sub_i32 s3, s3, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, s8
+; GFX8-NEXT:    s_sub_i32 s3, s3, s4
 ; GFX8-NEXT:    s_movk_i32 s4, 0xff
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s8
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
@@ -823,8 +823,8 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s2, s4
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, s8
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s3, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
@@ -838,19 +838,19 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX9-NEXT:    s_mov_b32 s4, 0x80008
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, s4
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s8, s9
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, s4
@@ -859,19 +859,19 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT:    s_lshr_b32 s7, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s4, s6, s4
 ; GFX9-NEXT:    s_lshl_b32 s6, s7, 8
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    v_pk_sub_i16 v1, s3, v1 clamp
 ; GFX9-NEXT:    s_mov_b32 s2, 8
+; GFX9-NEXT:    v_pk_sub_i16 v1, s3, v1 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    s_mov_b32 s5, 24
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s0, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, s0, v1
-; GFX9-NEXT:    s_mov_b32 s5, 24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
@@ -885,8 +885,8 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
-; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-NEXT:    s_mov_b32 s3, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s7, s1, 24
@@ -904,8 +904,8 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
 ; GFX10-NEXT:    s_lshl_b32 s4, s6, 8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s8
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_sub_i16 v1, s2, s3 clamp
@@ -935,9 +935,9 @@ define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
-; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
+; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
@@ -987,9 +987,9 @@ define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX6-NEXT:    s_max_i32 s2, s0, -1
-; GFX6-NEXT:    s_min_i32 s3, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s3, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s1, s2, s1
 ; GFX6-NEXT:    s_min_i32 s1, s1, s3
@@ -1042,8 +1042,8 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
-; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
+; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
@@ -1054,8 +1054,8 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i32_e32 v2, -1, v0
-; GFX8-NEXT:    v_min_i32_e32 v3, -1, v0
 ; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 0x7fffffff, v2
+; GFX8-NEXT:    v_min_i32_e32 v3, -1, v0
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 0x80000000, v3
 ; GFX8-NEXT:    v_max_i32_e32 v1, v2, v1
 ; GFX8-NEXT:    v_min_i32_e32 v1, v1, v3
@@ -1082,8 +1082,8 @@ define amdgpu_ps i32 @s_ssubsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_max_i32 s2, s0, -1
-; GFX6-NEXT:    s_min_i32 s3, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s3, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s1, s2, s1
 ; GFX6-NEXT:    s_min_i32 s1, s1, s3
@@ -1093,8 +1093,8 @@ define amdgpu_ps i32 @s_ssubsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
 ; GFX8-LABEL: s_ssubsat_i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_max_i32 s2, s0, -1
-; GFX8-NEXT:    s_min_i32 s3, s0, -1
 ; GFX8-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
+; GFX8-NEXT:    s_min_i32 s3, s0, -1
 ; GFX8-NEXT:    s_sub_i32 s3, s3, 0x80000000
 ; GFX8-NEXT:    s_max_i32 s1, s2, s1
 ; GFX8-NEXT:    s_min_i32 s1, s1, s3
@@ -1121,8 +1121,8 @@ define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
 ; GFX6-LABEL: ssubsat_i32_sv:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_max_i32 s1, s0, -1
-; GFX6-NEXT:    s_min_i32 s2, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s1, s1, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s2, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, 0x80000000
 ; GFX6-NEXT:    v_max_i32_e32 v0, s1, v0
 ; GFX6-NEXT:    v_min_i32_e32 v0, s2, v0
@@ -1132,8 +1132,8 @@ define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
 ; GFX8-LABEL: ssubsat_i32_sv:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_max_i32 s1, s0, -1
-; GFX8-NEXT:    s_min_i32 s2, s0, -1
 ; GFX8-NEXT:    s_sub_i32 s1, s1, 0x7fffffff
+; GFX8-NEXT:    s_min_i32 s2, s0, -1
 ; GFX8-NEXT:    s_sub_i32 s2, s2, 0x80000000
 ; GFX8-NEXT:    v_max_i32_e32 v0, s1, v0
 ; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
@@ -1197,11 +1197,11 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
 ; GFX6-NEXT:    s_brev_b32 s5, 1
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
-; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v5
+; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v1
@@ -1218,11 +1218,11 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_max_i32_e32 v4, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s4, v4
 ; GFX8-NEXT:    s_brev_b32 s5, 1
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s4, v4
 ; GFX8-NEXT:    v_min_i32_e32 v5, -1, v0
-; GFX8-NEXT:    v_max_i32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s5, v5
+; GFX8-NEXT:    v_max_i32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_min_i32_e32 v2, v2, v5
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_max_i32_e32 v2, -1, v1
@@ -1257,8 +1257,8 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    s_max_i32 s6, s0, -1
-; GFX6-NEXT:    s_sub_i32 s6, s6, s4
 ; GFX6-NEXT:    s_brev_b32 s5, 1
+; GFX6-NEXT:    s_sub_i32 s6, s6, s4
 ; GFX6-NEXT:    s_min_i32 s7, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s5
 ; GFX6-NEXT:    s_max_i32 s2, s6, s2
@@ -1277,8 +1277,8 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    s_max_i32 s6, s0, -1
-; GFX8-NEXT:    s_sub_i32 s6, s6, s4
 ; GFX8-NEXT:    s_brev_b32 s5, 1
+; GFX8-NEXT:    s_sub_i32 s6, s6, s4
 ; GFX8-NEXT:    s_min_i32 s7, s0, -1
 ; GFX8-NEXT:    s_sub_i32 s7, s7, s5
 ; GFX8-NEXT:    s_max_i32 s2, s6, s2
@@ -1320,18 +1320,18 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v6, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s4, v6
 ; GFX6-NEXT:    s_brev_b32 s5, 1
+; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s4, v6
 ; GFX6-NEXT:    v_min_i32_e32 v7, -1, v0
-; GFX6-NEXT:    v_max_i32_e32 v3, v6, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v7, vcc, s5, v7
+; GFX6-NEXT:    v_max_i32_e32 v3, v6, v3
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v3
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v1
-; GFX6-NEXT:    v_max_i32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s5, v6
+; GFX6-NEXT:    v_max_i32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, -1, v2
@@ -1348,18 +1348,18 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_max_i32_e32 v6, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s4, v6
 ; GFX8-NEXT:    s_brev_b32 s5, 1
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s4, v6
 ; GFX8-NEXT:    v_min_i32_e32 v7, -1, v0
-; GFX8-NEXT:    v_max_i32_e32 v3, v6, v3
 ; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, s5, v7
+; GFX8-NEXT:    v_max_i32_e32 v3, v6, v3
 ; GFX8-NEXT:    v_min_i32_e32 v3, v3, v7
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT:    v_max_i32_e32 v3, -1, v1
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s4, v3
 ; GFX8-NEXT:    v_min_i32_e32 v6, -1, v1
-; GFX8-NEXT:    v_max_i32_e32 v3, v3, v4
 ; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s5, v6
+; GFX8-NEXT:    v_max_i32_e32 v3, v3, v4
 ; GFX8-NEXT:    v_min_i32_e32 v3, v3, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_max_i32_e32 v3, -1, v2
@@ -1396,18 +1396,18 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_brev_b32 s6, -2
 ; GFX6-NEXT:    s_max_i32 s8, s0, -1
-; GFX6-NEXT:    s_sub_i32 s8, s8, s6
 ; GFX6-NEXT:    s_brev_b32 s7, 1
+; GFX6-NEXT:    s_sub_i32 s8, s8, s6
 ; GFX6-NEXT:    s_min_i32 s9, s0, -1
-; GFX6-NEXT:    s_max_i32 s3, s8, s3
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s7
+; GFX6-NEXT:    s_max_i32 s3, s8, s3
 ; GFX6-NEXT:    s_min_i32 s3, s3, s9
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s3
 ; GFX6-NEXT:    s_max_i32 s3, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s6
 ; GFX6-NEXT:    s_min_i32 s8, s1, -1
-; GFX6-NEXT:    s_max_i32 s3, s3, s4
 ; GFX6-NEXT:    s_sub_i32 s8, s8, s7
+; GFX6-NEXT:    s_max_i32 s3, s3, s4
 ; GFX6-NEXT:    s_min_i32 s3, s3, s8
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX6-NEXT:    s_max_i32 s3, s2, -1
@@ -1423,18 +1423,18 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_brev_b32 s6, -2
 ; GFX8-NEXT:    s_max_i32 s8, s0, -1
-; GFX8-NEXT:    s_sub_i32 s8, s8, s6
 ; GFX8-NEXT:    s_brev_b32 s7, 1
+; GFX8-NEXT:    s_sub_i32 s8, s8, s6
 ; GFX8-NEXT:    s_min_i32 s9, s0, -1
-; GFX8-NEXT:    s_max_i32 s3, s8, s3
 ; GFX8-NEXT:    s_sub_i32 s9, s9, s7
+; GFX8-NEXT:    s_max_i32 s3, s8, s3
 ; GFX8-NEXT:    s_min_i32 s3, s3, s9
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s3
 ; GFX8-NEXT:    s_max_i32 s3, s1, -1
 ; GFX8-NEXT:    s_sub_i32 s3, s3, s6
 ; GFX8-NEXT:    s_min_i32 s8, s1, -1
-; GFX8-NEXT:    s_max_i32 s3, s3, s4
 ; GFX8-NEXT:    s_sub_i32 s8, s8, s7
+; GFX8-NEXT:    s_max_i32 s3, s3, s4
 ; GFX8-NEXT:    s_min_i32 s3, s3, s8
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX8-NEXT:    s_max_i32 s3, s2, -1
@@ -1478,18 +1478,18 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v8, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s4, v8
 ; GFX6-NEXT:    s_brev_b32 s5, 1
+; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s4, v8
 ; GFX6-NEXT:    v_min_i32_e32 v9, -1, v0
-; GFX6-NEXT:    v_max_i32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_subrev_i32_e32 v9, vcc, s5, v9
+; GFX6-NEXT:    v_max_i32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v9
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
 ; GFX6-NEXT:    v_min_i32_e32 v8, -1, v1
-; GFX6-NEXT:    v_max_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s5, v8
+; GFX6-NEXT:    v_max_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v2
@@ -1513,18 +1513,18 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_max_i32_e32 v8, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, s4, v8
 ; GFX8-NEXT:    s_brev_b32 s5, 1
+; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, s4, v8
 ; GFX8-NEXT:    v_min_i32_e32 v9, -1, v0
-; GFX8-NEXT:    v_max_i32_e32 v4, v8, v4
 ; GFX8-NEXT:    v_subrev_u32_e32 v9, vcc, s5, v9
+; GFX8-NEXT:    v_max_i32_e32 v4, v8, v4
 ; GFX8-NEXT:    v_min_i32_e32 v4, v4, v9
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
 ; GFX8-NEXT:    v_max_i32_e32 v4, -1, v1
 ; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s4, v4
 ; GFX8-NEXT:    v_min_i32_e32 v8, -1, v1
-; GFX8-NEXT:    v_max_i32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, s5, v8
+; GFX8-NEXT:    v_max_i32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v4
 ; GFX8-NEXT:    v_max_i32_e32 v4, -1, v2
@@ -1570,18 +1570,18 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_brev_b32 s8, -2
 ; GFX6-NEXT:    s_max_i32 s10, s0, -1
-; GFX6-NEXT:    s_sub_i32 s10, s10, s8
 ; GFX6-NEXT:    s_brev_b32 s9, 1
+; GFX6-NEXT:    s_sub_i32 s10, s10, s8
 ; GFX6-NEXT:    s_min_i32 s11, s0, -1
-; GFX6-NEXT:    s_max_i32 s4, s10, s4
 ; GFX6-NEXT:    s_sub_i32 s11, s11, s9
+; GFX6-NEXT:    s_max_i32 s4, s10, s4
 ; GFX6-NEXT:    s_min_i32 s4, s4, s11
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s4
 ; GFX6-NEXT:    s_max_i32 s4, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s8
 ; GFX6-NEXT:    s_min_i32 s10, s1, -1
-; GFX6-NEXT:    s_max_i32 s4, s4, s5
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s9
+; GFX6-NEXT:    s_max_i32 s4, s4, s5
 ; GFX6-NEXT:    s_min_i32 s4, s4, s10
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s4
 ; GFX6-NEXT:    s_max_i32 s4, s2, -1
@@ -1604,18 +1604,18 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_brev_b32 s8, -2
 ; GFX8-NEXT:    s_max_i32 s10, s0, -1
-; GFX8-NEXT:    s_sub_i32 s10, s10, s8
 ; GFX8-NEXT:    s_brev_b32 s9, 1
+; GFX8-NEXT:    s_sub_i32 s10, s10, s8
 ; GFX8-NEXT:    s_min_i32 s11, s0, -1
-; GFX8-NEXT:    s_max_i32 s4, s10, s4
 ; GFX8-NEXT:    s_sub_i32 s11, s11, s9
+; GFX8-NEXT:    s_max_i32 s4, s10, s4
 ; GFX8-NEXT:    s_min_i32 s4, s4, s11
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s4
 ; GFX8-NEXT:    s_max_i32 s4, s1, -1
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s8
 ; GFX8-NEXT:    s_min_i32 s10, s1, -1
-; GFX8-NEXT:    s_max_i32 s4, s4, s5
 ; GFX8-NEXT:    s_sub_i32 s10, s10, s9
+; GFX8-NEXT:    s_max_i32 s4, s4, s5
 ; GFX8-NEXT:    s_min_i32 s4, s4, s10
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s4
 ; GFX8-NEXT:    s_max_i32 s4, s2, -1
@@ -1671,18 +1671,18 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v10, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, s4, v10
 ; GFX6-NEXT:    s_brev_b32 s5, 1
+; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, s4, v10
 ; GFX6-NEXT:    v_min_i32_e32 v12, -1, v0
-; GFX6-NEXT:    v_max_i32_e32 v5, v10, v5
 ; GFX6-NEXT:    v_subrev_i32_e32 v12, vcc, s5, v12
+; GFX6-NEXT:    v_max_i32_e32 v5, v10, v5
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v12
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v5
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v1
-; GFX6-NEXT:    v_max_i32_e32 v5, v5, v6
 ; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, s5, v10
+; GFX6-NEXT:    v_max_i32_e32 v5, v5, v6
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v2
@@ -1691,11 +1691,11 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s5, v6
 ; GFX6-NEXT:    v_max_i32_e32 v5, v5, v7
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v6
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_bfrev_b32_e32 v11, -2
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v3
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v11
 ; GFX6-NEXT:    v_bfrev_b32_e32 v13, 1
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v11
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v13
 ; GFX6-NEXT:    v_max_i32_e32 v5, v5, v8
@@ -1715,18 +1715,18 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_max_i32_e32 v10, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v10, vcc, s4, v10
 ; GFX8-NEXT:    s_brev_b32 s5, 1
+; GFX8-NEXT:    v_subrev_u32_e32 v10, vcc, s4, v10
 ; GFX8-NEXT:    v_min_i32_e32 v12, -1, v0
-; GFX8-NEXT:    v_max_i32_e32 v5, v10, v5
 ; GFX8-NEXT:    v_subrev_u32_e32 v12, vcc, s5, v12
+; GFX8-NEXT:    v_max_i32_e32 v5, v10, v5
 ; GFX8-NEXT:    v_min_i32_e32 v5, v5, v12
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, -1, v1
 ; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT:    v_min_i32_e32 v10, -1, v1
-; GFX8-NEXT:    v_max_i32_e32 v5, v5, v6
 ; GFX8-NEXT:    v_subrev_u32_e32 v10, vcc, s5, v10
+; GFX8-NEXT:    v_max_i32_e32 v5, v5, v6
 ; GFX8-NEXT:    v_min_i32_e32 v5, v5, v10
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, -1, v2
@@ -1735,11 +1735,11 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s5, v6
 ; GFX8-NEXT:    v_max_i32_e32 v5, v5, v7
 ; GFX8-NEXT:    v_min_i32_e32 v5, v5, v6
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_bfrev_b32_e32 v11, -2
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, -1, v3
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v5, v11
 ; GFX8-NEXT:    v_bfrev_b32_e32 v13, 1
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v5, v11
 ; GFX8-NEXT:    v_min_i32_e32 v6, -1, v3
 ; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v6, v13
 ; GFX8-NEXT:    v_max_i32_e32 v5, v5, v8
@@ -1783,18 +1783,18 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_brev_b32 s10, -2
 ; GFX6-NEXT:    s_max_i32 s12, s0, -1
-; GFX6-NEXT:    s_sub_i32 s12, s12, s10
 ; GFX6-NEXT:    s_brev_b32 s11, 1
+; GFX6-NEXT:    s_sub_i32 s12, s12, s10
 ; GFX6-NEXT:    s_min_i32 s13, s0, -1
-; GFX6-NEXT:    s_max_i32 s5, s12, s5
 ; GFX6-NEXT:    s_sub_i32 s13, s13, s11
+; GFX6-NEXT:    s_max_i32 s5, s12, s5
 ; GFX6-NEXT:    s_min_i32 s5, s5, s13
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s5
 ; GFX6-NEXT:    s_max_i32 s5, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s10
 ; GFX6-NEXT:    s_min_i32 s12, s1, -1
-; GFX6-NEXT:    s_max_i32 s5, s5, s6
 ; GFX6-NEXT:    s_sub_i32 s12, s12, s11
+; GFX6-NEXT:    s_max_i32 s5, s5, s6
 ; GFX6-NEXT:    s_min_i32 s5, s5, s12
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s5
 ; GFX6-NEXT:    s_max_i32 s5, s2, -1
@@ -1824,18 +1824,18 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_brev_b32 s10, -2
 ; GFX8-NEXT:    s_max_i32 s12, s0, -1
-; GFX8-NEXT:    s_sub_i32 s12, s12, s10
 ; GFX8-NEXT:    s_brev_b32 s11, 1
+; GFX8-NEXT:    s_sub_i32 s12, s12, s10
 ; GFX8-NEXT:    s_min_i32 s13, s0, -1
-; GFX8-NEXT:    s_max_i32 s5, s12, s5
 ; GFX8-NEXT:    s_sub_i32 s13, s13, s11
+; GFX8-NEXT:    s_max_i32 s5, s12, s5
 ; GFX8-NEXT:    s_min_i32 s5, s5, s13
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s5
 ; GFX8-NEXT:    s_max_i32 s5, s1, -1
 ; GFX8-NEXT:    s_sub_i32 s5, s5, s10
 ; GFX8-NEXT:    s_min_i32 s12, s1, -1
-; GFX8-NEXT:    s_max_i32 s5, s5, s6
 ; GFX8-NEXT:    s_sub_i32 s12, s12, s11
+; GFX8-NEXT:    s_max_i32 s5, s5, s6
 ; GFX8-NEXT:    s_min_i32 s5, s5, s12
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s5
 ; GFX8-NEXT:    s_max_i32 s5, s2, -1
@@ -2191,18 +2191,18 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_brev_b32 s32, -2
 ; GFX6-NEXT:    s_max_i32 s34, s0, -1
-; GFX6-NEXT:    s_sub_i32 s34, s34, s32
 ; GFX6-NEXT:    s_brev_b32 s33, 1
+; GFX6-NEXT:    s_sub_i32 s34, s34, s32
 ; GFX6-NEXT:    s_min_i32 s35, s0, -1
-; GFX6-NEXT:    s_max_i32 s16, s34, s16
 ; GFX6-NEXT:    s_sub_i32 s35, s35, s33
+; GFX6-NEXT:    s_max_i32 s16, s34, s16
 ; GFX6-NEXT:    s_min_i32 s16, s16, s35
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s16
 ; GFX6-NEXT:    s_max_i32 s16, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s16, s16, s32
 ; GFX6-NEXT:    s_min_i32 s34, s1, -1
-; GFX6-NEXT:    s_max_i32 s16, s16, s17
 ; GFX6-NEXT:    s_sub_i32 s34, s34, s33
+; GFX6-NEXT:    s_max_i32 s16, s16, s17
 ; GFX6-NEXT:    s_min_i32 s16, s16, s34
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s16
 ; GFX6-NEXT:    s_max_i32 s16, s2, -1
@@ -2309,18 +2309,18 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_brev_b32 s32, -2
 ; GFX8-NEXT:    s_max_i32 s34, s0, -1
-; GFX8-NEXT:    s_sub_i32 s34, s34, s32
 ; GFX8-NEXT:    s_brev_b32 s33, 1
+; GFX8-NEXT:    s_sub_i32 s34, s34, s32
 ; GFX8-NEXT:    s_min_i32 s35, s0, -1
-; GFX8-NEXT:    s_max_i32 s16, s34, s16
 ; GFX8-NEXT:    s_sub_i32 s35, s35, s33
+; GFX8-NEXT:    s_max_i32 s16, s34, s16
 ; GFX8-NEXT:    s_min_i32 s16, s16, s35
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s16
 ; GFX8-NEXT:    s_max_i32 s16, s1, -1
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s32
 ; GFX8-NEXT:    s_min_i32 s34, s1, -1
-; GFX8-NEXT:    s_max_i32 s16, s16, s17
 ; GFX8-NEXT:    s_sub_i32 s34, s34, s33
+; GFX8-NEXT:    s_max_i32 s16, s16, s17
 ; GFX8-NEXT:    s_min_i32 s16, s16, s34
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s16
 ; GFX8-NEXT:    s_max_i32 s16, s2, -1
@@ -2520,9 +2520,9 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
-; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
+; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
@@ -2534,8 +2534,8 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
-; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
 ; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x7fff, v2
+; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
 ; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x8000, v3
 ; GFX8-NEXT:    v_max_i16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
@@ -2563,9 +2563,9 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_max_i32 s2, s0, -1
-; GFX6-NEXT:    s_min_i32 s3, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s3, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s1, s2, s1
 ; GFX6-NEXT:    s_min_i32 s1, s1, s3
@@ -2611,9 +2611,9 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_max_i32 s1, s0, -1
-; GFX6-NEXT:    s_min_i32 s2, s0, -1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_sub_i32 s1, s1, 0x7fffffff
+; GFX6-NEXT:    s_min_i32 s2, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, 0x80000000
 ; GFX6-NEXT:    v_max_i32_e32 v0, s1, v0
 ; GFX6-NEXT:    v_min_i32_e32 v0, s2, v0
@@ -2626,8 +2626,8 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s2, -1
 ; GFX8-NEXT:    s_max_i32 s3, s1, s2
-; GFX8-NEXT:    s_min_i32 s1, s1, s2
 ; GFX8-NEXT:    s_sub_i32 s3, s3, 0x7fff
+; GFX8-NEXT:    s_min_i32 s1, s1, s2
 ; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffff8000
 ; GFX8-NEXT:    v_max_i16_e32 v0, s3, v0
 ; GFX8-NEXT:    v_min_i16_e32 v0, s1, v0
@@ -2653,9 +2653,9 @@ define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_max_i32_e32 v1, -1, v0
-; GFX6-NEXT:    v_min_i32_e32 v2, -1, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 0x7fffffff, v1
+; GFX6-NEXT:    v_min_i32_e32 v2, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x80000000, v2
 ; GFX6-NEXT:    v_max_i32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
@@ -2696,8 +2696,8 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
 ; GFX6-NEXT:    s_brev_b32 s5, 1
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v5
 ; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
@@ -2706,8 +2706,8 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v4
 ; GFX6-NEXT:    v_max_i32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
@@ -2721,16 +2721,16 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v3, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v3, s4, v3
 ; GFX8-NEXT:    s_movk_i32 s5, 0x8000
+; GFX8-NEXT:    v_subrev_u16_e32 v3, s4, v3
 ; GFX8-NEXT:    v_min_i16_e32 v4, -1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    v_subrev_u16_e32 v4, s5, v4
 ; GFX8-NEXT:    v_max_i16_e32 v3, v3, v1
 ; GFX8-NEXT:    v_min_i16_e32 v3, v3, v4
 ; GFX8-NEXT:    v_max_i16_e32 v4, -1, v2
-; GFX8-NEXT:    v_min_i16_e32 v5, -1, v2
 ; GFX8-NEXT:    v_subrev_u16_e32 v4, s4, v4
+; GFX8-NEXT:    v_min_i16_e32 v5, -1, v2
 ; GFX8-NEXT:    v_subrev_u16_e32 v5, s5, v5
 ; GFX8-NEXT:    v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v5
@@ -2762,8 +2762,8 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    s_max_i32 s6, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_sub_i32 s6, s6, s4
 ; GFX6-NEXT:    s_brev_b32 s5, 1
+; GFX6-NEXT:    s_sub_i32 s6, s6, s4
 ; GFX6-NEXT:    s_min_i32 s7, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s5
 ; GFX6-NEXT:    s_max_i32 s2, s6, s2
@@ -2791,8 +2791,8 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s7, -1
-; GFX8-NEXT:    s_max_i32 s8, s6, s7
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    s_max_i32 s8, s6, s7
 ; GFX8-NEXT:    s_sub_i32 s8, s8, s4
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    s_movk_i32 s5, 0x8000
@@ -2848,8 +2848,8 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    s_brev_b32 s2, -2
 ; GFX6-NEXT:    s_max_i32 s4, s0, -1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    s_sub_i32 s4, s4, s2
 ; GFX6-NEXT:    s_brev_b32 s3, 1
+; GFX6-NEXT:    s_sub_i32 s4, s4, s2
 ; GFX6-NEXT:    s_min_i32 s5, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s3
 ; GFX6-NEXT:    v_max_i32_e32 v0, s4, v0
@@ -2857,8 +2857,8 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
 ; GFX6-NEXT:    s_max_i32 s1, s0, -1
-; GFX6-NEXT:    s_sub_i32 s1, s1, s2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    s_sub_i32 s1, s1, s2
 ; GFX6-NEXT:    s_min_i32 s2, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s3
 ; GFX6-NEXT:    v_max_i32_e32 v1, s1, v1
@@ -2877,10 +2877,10 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s5, -1
-; GFX8-NEXT:    s_max_i32 s6, s4, s5
 ; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX8-NEXT:    s_sub_i32 s6, s6, s2
+; GFX8-NEXT:    s_max_i32 s6, s4, s5
 ; GFX8-NEXT:    s_movk_i32 s3, 0x8000
+; GFX8-NEXT:    s_sub_i32 s6, s6, s2
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s3
@@ -2891,8 +2891,8 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    s_sub_i32 s2, s6, s2
 ; GFX8-NEXT:    s_min_i32 s4, s4, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    s_sub_i32 s3, s4, s3
+; GFX8-NEXT:    v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v0, s3, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_sub_u16_e32 v1, s0, v1
@@ -2921,20 +2921,20 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-NEXT:    s_brev_b32 s2, -2
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v2
 ; GFX6-NEXT:    s_brev_b32 s3, 1
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v2
 ; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
-; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v3
+; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v1
-; GFX6-NEXT:    v_min_i32_e32 v3, -1, v1
 ; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v2
-; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
+; GFX6-NEXT:    v_min_i32_e32 v3, -1, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v3
+; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
@@ -2950,17 +2950,17 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v2, s2, v2
 ; GFX8-NEXT:    s_movk_i32 s3, 0x8000
+; GFX8-NEXT:    v_subrev_u16_e32 v2, s2, v2
 ; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    v_subrev_u16_e32 v3, s3, v3
 ; GFX8-NEXT:    v_max_i16_e32 v2, s0, v2
 ; GFX8-NEXT:    v_min_i16_e32 v2, v2, v3
 ; GFX8-NEXT:    v_max_i16_e32 v3, -1, v1
-; GFX8-NEXT:    v_min_i16_e32 v4, -1, v1
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    v_subrev_u16_e32 v3, s2, v3
+; GFX8-NEXT:    v_min_i16_e32 v4, -1, v1
 ; GFX8-NEXT:    v_subrev_u16_e32 v4, s3, v4
 ; GFX8-NEXT:    v_max_i16_e32 v3, s1, v3
 ; GFX8-NEXT:    v_min_i16_e32 v3, v3, v4
@@ -3002,8 +3002,8 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v8, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s4, v8
 ; GFX6-NEXT:    s_brev_b32 s5, 1
+; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s4, v8
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, s5, v10
 ; GFX6-NEXT:    v_max_i32_e32 v4, v8, v4
@@ -3017,27 +3017,27 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s5, v8
 ; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
+; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX6-NEXT:    v_min_i32_e32 v6, -1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
+; GFX6-NEXT:    v_min_i32_e32 v6, -1, v2
 ; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s5, v6
 ; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v3
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
-; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
+; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v11
 ; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
-; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
+; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
@@ -3058,19 +3058,19 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v6, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v6, s4, v6
 ; GFX8-NEXT:    s_movk_i32 s5, 0x8000
+; GFX8-NEXT:    v_subrev_u16_e32 v6, s4, v6
 ; GFX8-NEXT:    v_min_i16_e32 v7, -1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_subrev_u16_e32 v7, s5, v7
 ; GFX8-NEXT:    v_max_i16_e32 v6, v6, v2
 ; GFX8-NEXT:    v_min_i16_e32 v6, v6, v7
 ; GFX8-NEXT:    v_max_i16_e32 v7, -1, v4
-; GFX8-NEXT:    v_min_i16_e32 v8, -1, v4
 ; GFX8-NEXT:    v_subrev_u16_e32 v7, s4, v7
+; GFX8-NEXT:    v_min_i16_e32 v8, -1, v4
+; GFX8-NEXT:    v_subrev_u16_e32 v8, s5, v8
 ; GFX8-NEXT:    v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_i16_e32 v7, -1, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v8, s5, v8
 ; GFX8-NEXT:    v_min_i16_e32 v2, v2, v8
 ; GFX8-NEXT:    v_subrev_u16_e32 v7, s4, v7
 ; GFX8-NEXT:    v_min_i16_e32 v8, -1, v1
@@ -3079,8 +3079,8 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_max_i16_e32 v7, v7, v3
 ; GFX8-NEXT:    v_min_i16_e32 v7, v7, v8
 ; GFX8-NEXT:    v_max_i16_e32 v8, -1, v5
-; GFX8-NEXT:    v_min_i16_e32 v9, -1, v5
 ; GFX8-NEXT:    v_subrev_u16_e32 v8, s4, v8
+; GFX8-NEXT:    v_min_i16_e32 v9, -1, v5
 ; GFX8-NEXT:    v_subrev_u16_e32 v9, s5, v9
 ; GFX8-NEXT:    v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_i16_e32 v3, v3, v9
@@ -3118,8 +3118,8 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-NEXT:    s_brev_b32 s8, -2
 ; GFX6-NEXT:    s_max_i32 s10, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX6-NEXT:    s_sub_i32 s10, s10, s8
 ; GFX6-NEXT:    s_brev_b32 s9, 1
+; GFX6-NEXT:    s_sub_i32 s10, s10, s8
 ; GFX6-NEXT:    s_min_i32 s11, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s11, s11, s9
 ; GFX6-NEXT:    s_max_i32 s4, s10, s4
@@ -3128,31 +3128,31 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s5, 16
 ; GFX6-NEXT:    s_max_i32 s5, s1, -1
-; GFX6-NEXT:    s_min_i32 s10, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s8
+; GFX6-NEXT:    s_min_i32 s10, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s9
 ; GFX6-NEXT:    s_max_i32 s4, s5, s4
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_max_i32 s5, s2, -1
 ; GFX6-NEXT:    s_min_i32 s4, s4, s10
+; GFX6-NEXT:    s_max_i32 s5, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s4
 ; GFX6-NEXT:    s_lshl_b32 s4, s6, 16
-; GFX6-NEXT:    s_min_i32 s6, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s8
+; GFX6-NEXT:    s_min_i32 s6, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s9
 ; GFX6-NEXT:    s_max_i32 s4, s5, s4
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_min_i32 s4, s4, s6
 ; GFX6-NEXT:    s_max_i32 s5, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s4
-; GFX6-NEXT:    s_min_i32 s6, s3, -1
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s8
+; GFX6-NEXT:    s_min_i32 s6, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s9
 ; GFX6-NEXT:    s_max_i32 s4, s5, s4
 ; GFX6-NEXT:    s_min_i32 s4, s4, s6
-; GFX6-NEXT:    s_sub_i32 s3, s3, s4
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX6-NEXT:    s_sub_i32 s3, s3, s4
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    s_and_b32 s1, s1, s4
@@ -3171,8 +3171,8 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sext_i32_i16 s10, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s11, -1
-; GFX8-NEXT:    s_max_i32 s12, s10, s11
 ; GFX8-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX8-NEXT:    s_max_i32 s12, s10, s11
 ; GFX8-NEXT:    s_sub_i32 s12, s12, s8
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX8-NEXT:    s_movk_i32 s9, 0x8000
@@ -3201,12 +3201,12 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s1
 ; GFX8-NEXT:    s_max_i32 s6, s4, s11
 ; GFX8-NEXT:    s_sub_i32 s6, s6, s8
-; GFX8-NEXT:    s_min_i32 s4, s4, s11
 ; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
+; GFX8-NEXT:    s_min_i32 s4, s4, s11
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_max_i32 s3, s6, s3
 ; GFX8-NEXT:    s_sub_i32 s4, s4, s9
+; GFX8-NEXT:    s_max_i32 s3, s6, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
@@ -3275,8 +3275,8 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v12, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX6-NEXT:    v_subrev_i32_e32 v12, vcc, s4, v12
 ; GFX6-NEXT:    s_brev_b32 s5, 1
+; GFX6-NEXT:    v_subrev_i32_e32 v12, vcc, s4, v12
 ; GFX6-NEXT:    v_min_i32_e32 v14, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v14, vcc, s5, v14
 ; GFX6-NEXT:    v_max_i32_e32 v6, v12, v6
@@ -3290,60 +3290,60 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_subrev_i32_e32 v12, vcc, s5, v12
 ; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_min_i32_e32 v6, v6, v12
 ; GFX6-NEXT:    v_bfrev_b32_e32 v13, -2
+; GFX6-NEXT:    v_min_i32_e32 v6, v6, v12
 ; GFX6-NEXT:    v_max_i32_e32 v7, -1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
-; GFX6-NEXT:    v_min_i32_e32 v8, -1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
+; GFX6-NEXT:    v_min_i32_e32 v8, -1, v2
 ; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s5, v8
 ; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
 ; GFX6-NEXT:    v_max_i32_e32 v7, -1, v3
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
 ; GFX6-NEXT:    v_bfrev_b32_e32 v15, 1
-; GFX6-NEXT:    v_min_i32_e32 v8, -1, v3
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
+; GFX6-NEXT:    v_min_i32_e32 v8, -1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v15
 ; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
 ; GFX6-NEXT:    v_max_i32_e32 v7, -1, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
-; GFX6-NEXT:    v_min_i32_e32 v8, -1, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
+; GFX6-NEXT:    v_min_i32_e32 v8, -1, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v15
 ; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
 ; GFX6-NEXT:    v_max_i32_e32 v7, -1, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
-; GFX6-NEXT:    v_min_i32_e32 v8, -1, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
+; GFX6-NEXT:    v_min_i32_e32 v8, -1, v5
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v15
 ; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
+; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
+; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
 ; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, s4, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -3355,19 +3355,19 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v9, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v9, s4, v9
 ; GFX8-NEXT:    s_movk_i32 s5, 0x8000
+; GFX8-NEXT:    v_subrev_u16_e32 v9, s4, v9
 ; GFX8-NEXT:    v_min_i16_e32 v11, -1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_subrev_u16_e32 v11, s5, v11
 ; GFX8-NEXT:    v_max_i16_e32 v9, v9, v3
 ; GFX8-NEXT:    v_min_i16_e32 v9, v9, v11
 ; GFX8-NEXT:    v_max_i16_e32 v11, -1, v6
-; GFX8-NEXT:    v_min_i16_e32 v13, -1, v6
 ; GFX8-NEXT:    v_subrev_u16_e32 v11, s4, v11
+; GFX8-NEXT:    v_min_i16_e32 v13, -1, v6
+; GFX8-NEXT:    v_subrev_u16_e32 v13, s5, v13
 ; GFX8-NEXT:    v_max_i16_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_i16_e32 v11, -1, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v13, s5, v13
 ; GFX8-NEXT:    v_min_i16_e32 v3, v3, v13
 ; GFX8-NEXT:    v_subrev_u16_e32 v11, s4, v11
 ; GFX8-NEXT:    v_min_i16_e32 v13, -1, v1
@@ -3376,15 +3376,15 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8-NEXT:    v_max_i16_e32 v11, v11, v4
 ; GFX8-NEXT:    v_min_i16_e32 v11, v11, v13
 ; GFX8-NEXT:    v_max_i16_e32 v13, -1, v7
-; GFX8-NEXT:    v_min_i16_e32 v14, -1, v7
 ; GFX8-NEXT:    v_subrev_u16_e32 v13, s4, v13
-; GFX8-NEXT:    v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_subrev_u16_e32 v14, s5, v14
+; GFX8-NEXT:    v_min_i16_e32 v14, -1, v7
 ; GFX8-NEXT:    v_mov_b32_e32 v10, 0x7fff
+; GFX8-NEXT:    v_subrev_u16_e32 v14, s5, v14
+; GFX8-NEXT:    v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_i16_e32 v13, -1, v2
-; GFX8-NEXT:    v_sub_u16_e32 v13, v13, v10
-; GFX8-NEXT:    v_min_i16_e32 v4, v4, v14
 ; GFX8-NEXT:    v_mov_b32_e32 v12, 0xffff8000
+; GFX8-NEXT:    v_min_i16_e32 v4, v4, v14
+; GFX8-NEXT:    v_sub_u16_e32 v13, v13, v10
 ; GFX8-NEXT:    v_min_i16_e32 v14, -1, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX8-NEXT:    v_sub_u16_e32 v14, v14, v12
@@ -3397,8 +3397,8 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8-NEXT:    v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v9
 ; GFX8-NEXT:    v_sub_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX8-NEXT:    v_min_i16_e32 v5, v5, v12
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v1, v11
 ; GFX8-NEXT:    v_sub_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -3435,8 +3435,8 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_brev_b32 s12, -2
 ; GFX6-NEXT:    s_max_i32 s14, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_sub_i32 s14, s14, s12
 ; GFX6-NEXT:    s_brev_b32 s13, 1
+; GFX6-NEXT:    s_sub_i32 s14, s14, s12
 ; GFX6-NEXT:    s_min_i32 s15, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s15, s15, s13
 ; GFX6-NEXT:    s_max_i32 s6, s14, s6
@@ -3445,49 +3445,49 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s7, 16
 ; GFX6-NEXT:    s_max_i32 s7, s1, -1
-; GFX6-NEXT:    s_min_i32 s14, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s12
+; GFX6-NEXT:    s_min_i32 s14, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s14, s14, s13
 ; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_max_i32 s7, s2, -1
 ; GFX6-NEXT:    s_min_i32 s6, s6, s14
+; GFX6-NEXT:    s_max_i32 s7, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s6
 ; GFX6-NEXT:    s_lshl_b32 s6, s8, 16
-; GFX6-NEXT:    s_min_i32 s8, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s12
+; GFX6-NEXT:    s_min_i32 s8, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s8, s8, s13
 ; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s8
 ; GFX6-NEXT:    s_max_i32 s7, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s6
-; GFX6-NEXT:    s_min_i32 s8, s3, -1
 ; GFX6-NEXT:    s_lshl_b32 s6, s9, 16
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s12
+; GFX6-NEXT:    s_min_i32 s8, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s8, s8, s13
 ; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s8
 ; GFX6-NEXT:    s_max_i32 s7, s4, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s6
-; GFX6-NEXT:    s_min_i32 s8, s4, -1
 ; GFX6-NEXT:    s_lshl_b32 s6, s10, 16
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s12
+; GFX6-NEXT:    s_min_i32 s8, s4, -1
 ; GFX6-NEXT:    s_sub_i32 s8, s8, s13
 ; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s8
 ; GFX6-NEXT:    s_max_i32 s7, s5, -1
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s6
-; GFX6-NEXT:    s_min_i32 s8, s5, -1
 ; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s12
+; GFX6-NEXT:    s_min_i32 s8, s5, -1
 ; GFX6-NEXT:    s_sub_i32 s8, s8, s13
 ; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_min_i32 s6, s6, s8
-; GFX6-NEXT:    s_sub_i32 s5, s5, s6
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX6-NEXT:    s_sub_i32 s5, s5, s6
 ; GFX6-NEXT:    s_mov_b32 s6, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    s_and_b32 s1, s1, s6
@@ -3495,13 +3495,13 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, s6
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, s6
 ; GFX6-NEXT:    s_and_b32 s2, s3, s6
-; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, s6
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_and_b32 s3, s5, s6
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, s6
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
@@ -3512,8 +3512,8 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sext_i32_i16 s14, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s15, -1
-; GFX8-NEXT:    s_max_i32 s16, s14, s15
 ; GFX8-NEXT:    s_movk_i32 s12, 0x7fff
+; GFX8-NEXT:    s_max_i32 s16, s14, s15
 ; GFX8-NEXT:    s_sub_i32 s16, s16, s12
 ; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
 ; GFX8-NEXT:    s_movk_i32 s13, 0x8000
@@ -3542,12 +3542,12 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s1
 ; GFX8-NEXT:    s_max_i32 s9, s6, s15
 ; GFX8-NEXT:    s_sub_i32 s9, s9, s12
-; GFX8-NEXT:    s_min_i32 s6, s6, s15
 ; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
+; GFX8-NEXT:    s_min_i32 s6, s6, s15
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
-; GFX8-NEXT:    s_max_i32 s4, s9, s4
 ; GFX8-NEXT:    s_sub_i32 s6, s6, s13
+; GFX8-NEXT:    s_max_i32 s4, s9, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
@@ -3568,12 +3568,12 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s4, s7, s4
 ; GFX8-NEXT:    s_max_i32 s7, s6, s15
 ; GFX8-NEXT:    s_sub_i32 s7, s7, s12
-; GFX8-NEXT:    s_min_i32 s6, s6, s15
 ; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
+; GFX8-NEXT:    s_min_i32 s6, s6, s15
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_max_i32 s5, s7, s5
 ; GFX8-NEXT:    s_sub_i32 s6, s6, s13
+; GFX8-NEXT:    s_max_i32 s5, s7, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
@@ -3640,8 +3640,8 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v16, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, s4, v16
 ; GFX6-NEXT:    s_brev_b32 s5, 1
+; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, s4, v16
 ; GFX6-NEXT:    v_min_i32_e32 v18, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v18, vcc, s5, v18
 ; GFX6-NEXT:    v_max_i32_e32 v8, v16, v8
@@ -3655,84 +3655,84 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, s5, v16
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_min_i32_e32 v8, v8, v16
 ; GFX6-NEXT:    v_bfrev_b32_e32 v17, -2
+; GFX6-NEXT:    v_min_i32_e32 v8, v8, v16
 ; GFX6-NEXT:    v_max_i32_e32 v9, -1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX6-NEXT:    v_min_i32_e32 v10, -1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
+; GFX6-NEXT:    v_min_i32_e32 v10, -1, v2
 ; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, s5, v10
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_max_i32_e32 v9, -1, v3
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
 ; GFX6-NEXT:    v_bfrev_b32_e32 v19, 1
-; GFX6-NEXT:    v_min_i32_e32 v10, -1, v3
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
+; GFX6-NEXT:    v_min_i32_e32 v10, -1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_max_i32_e32 v9, -1, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v8
-; GFX6-NEXT:    v_min_i32_e32 v10, -1, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
+; GFX6-NEXT:    v_min_i32_e32 v10, -1, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_max_i32_e32 v9, -1, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v8
-; GFX6-NEXT:    v_min_i32_e32 v10, -1, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
+; GFX6-NEXT:    v_min_i32_e32 v10, -1, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_max_i32_e32 v9, -1, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v8
-; GFX6-NEXT:    v_min_i32_e32 v10, -1, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
+; GFX6-NEXT:    v_min_i32_e32 v10, -1, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_max_i32_e32 v9, -1, v7
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v8
-; GFX6-NEXT:    v_min_i32_e32 v10, -1, v7
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
+; GFX6-NEXT:    v_min_i32_e32 v10, -1, v7
+; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
-; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
+; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
+; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v8
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 16, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
+; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 16, v6
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 16, v6
+; GFX6-NEXT:    v_and_b32_e32 v4, s4, v7
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_and_b32_e32 v3, s4, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -3744,19 +3744,19 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_max_i16_e32 v12, -1, v0
-; GFX8-NEXT:    v_subrev_u16_e32 v12, s4, v12
 ; GFX8-NEXT:    s_movk_i32 s5, 0x8000
+; GFX8-NEXT:    v_subrev_u16_e32 v12, s4, v12
 ; GFX8-NEXT:    v_min_i16_e32 v14, -1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
 ; GFX8-NEXT:    v_subrev_u16_e32 v14, s5, v14
 ; GFX8-NEXT:    v_max_i16_e32 v12, v12, v4
 ; GFX8-NEXT:    v_min_i16_e32 v12, v12, v14
 ; GFX8-NEXT:    v_max_i16_e32 v14, -1, v8
-; GFX8-NEXT:    v_min_i16_e32 v16, -1, v8
 ; GFX8-NEXT:    v_subrev_u16_e32 v14, s4, v14
+; GFX8-NEXT:    v_min_i16_e32 v16, -1, v8
+; GFX8-NEXT:    v_subrev_u16_e32 v16, s5, v16
 ; GFX8-NEXT:    v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_i16_e32 v14, -1, v1
-; GFX8-NEXT:    v_subrev_u16_e32 v16, s5, v16
 ; GFX8-NEXT:    v_min_i16_e32 v4, v4, v16
 ; GFX8-NEXT:    v_subrev_u16_e32 v14, s4, v14
 ; GFX8-NEXT:    v_min_i16_e32 v16, -1, v1
@@ -3765,28 +3765,28 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX8-NEXT:    v_max_i16_e32 v14, v14, v5
 ; GFX8-NEXT:    v_min_i16_e32 v14, v14, v16
 ; GFX8-NEXT:    v_max_i16_e32 v16, -1, v9
-; GFX8-NEXT:    v_min_i16_e32 v17, -1, v9
 ; GFX8-NEXT:    v_subrev_u16_e32 v16, s4, v16
-; GFX8-NEXT:    v_max_i16_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_subrev_u16_e32 v17, s5, v17
+; GFX8-NEXT:    v_min_i16_e32 v17, -1, v9
 ; GFX8-NEXT:    v_mov_b32_e32 v13, 0x7fff
+; GFX8-NEXT:    v_subrev_u16_e32 v17, s5, v17
+; GFX8-NEXT:    v_max_i16_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_i16_e32 v16, -1, v2
-; GFX8-NEXT:    v_sub_u16_e32 v16, v16, v13
-; GFX8-NEXT:    v_min_i16_e32 v5, v5, v17
 ; GFX8-NEXT:    v_mov_b32_e32 v15, 0xffff8000
+; GFX8-NEXT:    v_min_i16_e32 v5, v5, v17
+; GFX8-NEXT:    v_sub_u16_e32 v16, v16, v13
 ; GFX8-NEXT:    v_min_i16_e32 v17, -1, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
 ; GFX8-NEXT:    v_sub_u16_e32 v17, v17, v15
 ; GFX8-NEXT:    v_max_i16_e32 v16, v16, v6
 ; GFX8-NEXT:    v_min_i16_e32 v16, v16, v17
 ; GFX8-NEXT:    v_max_i16_e32 v17, -1, v10
-; GFX8-NEXT:    v_min_i16_e32 v18, -1, v10
 ; GFX8-NEXT:    v_sub_u16_e32 v17, v17, v13
+; GFX8-NEXT:    v_min_i16_e32 v18, -1, v10
+; GFX8-NEXT:    v_sub_u16_e32 v18, v18, v15
 ; GFX8-NEXT:    v_max_i16_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_i16_e32 v17, -1, v3
-; GFX8-NEXT:    v_sub_u16_e32 v18, v18, v15
-; GFX8-NEXT:    v_sub_u16_e32 v17, v17, v13
 ; GFX8-NEXT:    v_min_i16_e32 v6, v6, v18
+; GFX8-NEXT:    v_sub_u16_e32 v17, v17, v13
 ; GFX8-NEXT:    v_min_i16_e32 v18, -1, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v18, v18, v15
@@ -3797,13 +3797,13 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX8-NEXT:    v_min_i16_e32 v18, -1, v11
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v12
 ; GFX8-NEXT:    v_sub_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_sub_u16_e32 v15, v18, v15
 ; GFX8-NEXT:    v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v1, v14
 ; GFX8-NEXT:    v_sub_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    v_min_i16_e32 v7, v7, v15
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v2, v16
 ; GFX8-NEXT:    v_sub_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
@@ -3842,8 +3842,8 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_brev_b32 s16, -2
 ; GFX6-NEXT:    s_max_i32 s18, s0, -1
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX6-NEXT:    s_sub_i32 s18, s18, s16
 ; GFX6-NEXT:    s_brev_b32 s17, 1
+; GFX6-NEXT:    s_sub_i32 s18, s18, s16
 ; GFX6-NEXT:    s_min_i32 s19, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s19, s19, s17
 ; GFX6-NEXT:    s_max_i32 s8, s18, s8
@@ -3852,67 +3852,67 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_sub_i32 s0, s0, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s9, 16
 ; GFX6-NEXT:    s_max_i32 s9, s1, -1
-; GFX6-NEXT:    s_min_i32 s18, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
+; GFX6-NEXT:    s_min_i32 s18, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s18, s18, s17
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_max_i32 s9, s2, -1
 ; GFX6-NEXT:    s_min_i32 s8, s8, s18
+; GFX6-NEXT:    s_max_i32 s9, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s10, 16
-; GFX6-NEXT:    s_min_i32 s10, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
+; GFX6-NEXT:    s_min_i32 s10, s2, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s17
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_max_i32 s9, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s2, s2, s8
-; GFX6-NEXT:    s_min_i32 s10, s3, -1
 ; GFX6-NEXT:    s_lshl_b32 s8, s11, 16
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
+; GFX6-NEXT:    s_min_i32 s10, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s17
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_max_i32 s9, s4, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s8
-; GFX6-NEXT:    s_min_i32 s10, s4, -1
 ; GFX6-NEXT:    s_lshl_b32 s8, s12, 16
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
+; GFX6-NEXT:    s_min_i32 s10, s4, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s17
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_max_i32 s9, s5, -1
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s8
-; GFX6-NEXT:    s_min_i32 s10, s5, -1
 ; GFX6-NEXT:    s_lshl_b32 s8, s13, 16
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
+; GFX6-NEXT:    s_min_i32 s10, s5, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s17
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_max_i32 s9, s6, -1
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s8
-; GFX6-NEXT:    s_min_i32 s10, s6, -1
 ; GFX6-NEXT:    s_lshl_b32 s8, s14, 16
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
+; GFX6-NEXT:    s_min_i32 s10, s6, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s17
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_max_i32 s9, s7, -1
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s8
-; GFX6-NEXT:    s_min_i32 s10, s7, -1
 ; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
 ; GFX6-NEXT:    s_sub_i32 s9, s9, s16
+; GFX6-NEXT:    s_min_i32 s10, s7, -1
 ; GFX6-NEXT:    s_sub_i32 s10, s10, s17
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
-; GFX6-NEXT:    s_sub_i32 s7, s7, s8
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX6-NEXT:    s_sub_i32 s7, s7, s8
 ; GFX6-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX6-NEXT:    s_and_b32 s1, s1, s8
@@ -3920,19 +3920,19 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, s8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, s8
 ; GFX6-NEXT:    s_and_b32 s2, s3, s8
-; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, s8
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    s_ashr_i32 s7, s7, 16
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_and_b32 s3, s5, s8
+; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, s8
-; GFX6-NEXT:    s_and_b32 s4, s7, s8
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
+; GFX6-NEXT:    s_and_b32 s4, s7, s8
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_and_b32 s3, s6, s8
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
@@ -3943,8 +3943,8 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sext_i32_i16 s18, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s19, -1
-; GFX8-NEXT:    s_max_i32 s20, s18, s19
 ; GFX8-NEXT:    s_movk_i32 s16, 0x7fff
+; GFX8-NEXT:    s_max_i32 s20, s18, s19
 ; GFX8-NEXT:    s_sub_i32 s20, s20, s16
 ; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
 ; GFX8-NEXT:    s_movk_i32 s17, 0x8000
@@ -3973,12 +3973,12 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s1
 ; GFX8-NEXT:    s_max_i32 s12, s8, s19
 ; GFX8-NEXT:    s_sub_i32 s12, s12, s16
-; GFX8-NEXT:    s_min_i32 s8, s8, s19
 ; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
+; GFX8-NEXT:    s_min_i32 s8, s8, s19
 ; GFX8-NEXT:    s_sext_i32_i16 s12, s12
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_max_i32 s5, s12, s5
 ; GFX8-NEXT:    s_sub_i32 s8, s8, s17
+; GFX8-NEXT:    s_max_i32 s5, s12, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
@@ -3999,12 +3999,12 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_sub_i32 s5, s9, s5
 ; GFX8-NEXT:    s_max_i32 s9, s8, s19
 ; GFX8-NEXT:    s_sub_i32 s9, s9, s16
-; GFX8-NEXT:    s_min_i32 s8, s8, s19
 ; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
+; GFX8-NEXT:    s_min_i32 s8, s8, s19
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_max_i32 s6, s9, s6
 ; GFX8-NEXT:    s_sub_i32 s8, s8, s17
+; GFX8-NEXT:    s_max_i32 s6, s9, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
@@ -4024,12 +4024,12 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s3
 ; GFX8-NEXT:    s_max_i32 s9, s8, s19
 ; GFX8-NEXT:    s_sub_i32 s9, s9, s16
-; GFX8-NEXT:    s_min_i32 s8, s8, s19
 ; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
+; GFX8-NEXT:    s_min_i32 s8, s8, s19
 ; GFX8-NEXT:    s_sext_i32_i16 s9, s9
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s7
-; GFX8-NEXT:    s_max_i32 s7, s9, s7
 ; GFX8-NEXT:    s_sub_i32 s8, s8, s17
+; GFX8-NEXT:    s_max_i32 s7, s9, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX8-NEXT:    s_sext_i32_i16 s8, s8
 ; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
@@ -4448,8 +4448,8 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
 ; GFX6-NEXT:    v_bfrev_b32_e32 v10, 1
 ; GFX6-NEXT:    v_add_i32_e64 v1, s[6:7], 0, v0
-; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX6-NEXT:    v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7]
+; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v2, v6
@@ -4474,8 +4474,8 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
 ; GFX8-NEXT:    v_bfrev_b32_e32 v10, 1
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[6:7], 0, v0
-; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX8-NEXT:    v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7]
+; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v2, v6
@@ -4500,8 +4500,8 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
 ; GFX9-NEXT:    v_bfrev_b32_e32 v10, 1
 ; GFX9-NEXT:    v_add_co_u32_e64 v1, s[6:7], 0, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[6:7], v0, v10, s[6:7]
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v2, v6
@@ -4521,20 +4521,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v0, v4
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[4:5]
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v10, vcc_lo, v2, v6
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, 0, v[6:7]
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 31, v9
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[4:5]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v11
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, 0, v[6:7]
 ; GFX10-NEXT:    v_add_co_u32 v1, s5, v12, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
 ; GFX10-NEXT:    v_add_co_u32 v2, s7, v0, 0
-; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7
+; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc_lo
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s6, s5
@@ -4555,8 +4555,8 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX6-NEXT:    s_add_u32 s0, s4, 0
@@ -4569,13 +4569,13 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s8
-; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX6-NEXT:    s_and_b32 s1, s1, 1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
+; GFX6-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
@@ -4589,8 +4589,8 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_addc_u32 s3, s4, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX6-NEXT:    v_readfirstlane_b32 s0, v4
@@ -4608,8 +4608,8 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX8-NEXT:    s_add_u32 s0, s4, 0
@@ -4622,13 +4622,13 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s8
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
+; GFX8-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
@@ -4642,8 +4642,8 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_addc_u32 s3, s4, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v4
@@ -4661,8 +4661,8 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    s_add_u32 s0, s4, 0
@@ -4675,13 +4675,13 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
@@ -4695,8 +4695,8 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_addc_u32 s3, s4, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v4
@@ -4770,8 +4770,8 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    s_and_b32 s11, s11, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
 ; GFX6-NEXT:    s_subb_u32 s11, s3, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -4791,11 +4791,11 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    s_addc_u32 s1, s3, 0
 ; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    s_and_b32 s2, s2, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[6:7], 0
+; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_addc_u32 s2, s3, 0
-; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_and_b32 s4, s4, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
@@ -4804,13 +4804,13 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s8
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s9
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s2
-; GFX6-NEXT:    v_mov_b32_e32 v4, s10
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    v_mov_b32_e32 v4, s10
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
@@ -4837,8 +4837,8 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_subb_u32 s11, s3, s7
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    s_cmp_eq_u64 s[10:11], s[2:3]
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
@@ -4866,8 +4866,8 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    s_and_b32 s2, s2, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_addc_u32 s2, s3, 0
-; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_and_b32 s4, s4, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
@@ -4876,13 +4876,13 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s8
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s9
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v4, s10
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s10
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
@@ -4909,8 +4909,8 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_subb_u32 s11, s3, s7
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    s_cmp_eq_u64 s[10:11], s[2:3]
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
@@ -4938,8 +4938,8 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    s_and_b32 s2, s2, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_addc_u32 s2, s3, 0
-; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_and_b32 s4, s4, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
@@ -4948,13 +4948,13 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s8
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s9
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v4, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_mov_b32_e32 v4, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
@@ -5294,21 +5294,21 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX10-LABEL: ssubsat_i128_vs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, s0
-; GFX10-NEXT:    v_cmp_gt_u64_e64 s0, s[0:1], 0
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
-; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], 0
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
-; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[2:3], 0
+; GFX10-NEXT:    v_cmp_gt_u64_e64 s0, s[0:1], 0
+; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
-; GFX10-NEXT:    s_and_b32 s0, 1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[2:3], 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
+; GFX10-NEXT:    s_and_b32 s0, 1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
@@ -5540,19 +5540,19 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v4, v12
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v10, vcc_lo, v6, v14, vcc_lo
-; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v11, vcc_lo, v7, v15, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, v[8:9], v[4:5]
+; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v19
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cmp_eq_u64_e64 s5, v[10:11], v[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[10:11], v[6:7]
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v1, 0
-; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v11
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, 0, v[12:13]
+; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s4
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[14:15]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s4
@@ -5599,8 +5599,8 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_and_b32 s19, s19, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3]
 ; GFX6-NEXT:    s_subb_u32 s19, s3, s11
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -5623,8 +5623,8 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_addc_u32 s2, s3, 0
 ; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX6-NEXT:    s_and_b32 s9, s9, 1
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[10:11], 0
+; GFX6-NEXT:    s_and_b32 s9, s9, 1
 ; GFX6-NEXT:    s_brev_b32 s8, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
@@ -5642,24 +5642,24 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    s_and_b32 s2, s2, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX6-NEXT:    v_mov_b32_e32 v4, s17
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s16
+; GFX6-NEXT:    v_mov_b32_e32 v4, s17
+; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s19
 ; GFX6-NEXT:    s_subb_u32 s2, s6, s14
-; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
+; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    s_and_b32 s3, s3, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    s_subb_u32 s3, s7, s15
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -5679,27 +5679,27 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_addc_u32 s5, s7, 0
 ; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX6-NEXT:    s_and_b32 s6, s6, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[14:15], 0
+; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX6-NEXT:    s_addc_u32 s6, s7, 0
-; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_and_b32 s9, s9, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_mov_b32_e32 v3, s0
-; GFX6-NEXT:    v_mov_b32_e32 v8, s1
 ; GFX6-NEXT:    s_addc_u32 s7, s7, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    v_mov_b32_e32 v3, s0
+; GFX6-NEXT:    v_mov_b32_e32 v8, s1
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v8, v2, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s7
+; GFX6-NEXT:    v_mov_b32_e32 v8, s2
+; GFX6-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
 ; GFX6-NEXT:    v_readfirstlane_b32 s0, v5
@@ -5729,8 +5729,8 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_subb_u32 s19, s3, s11
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    s_cmp_eq_u64 s[18:19], s[2:3]
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
@@ -5778,24 +5778,24 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_and_b32 s2, s2, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX8-NEXT:    v_mov_b32_e32 v4, s17
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s16
+; GFX8-NEXT:    v_mov_b32_e32 v4, s17
 ; GFX8-NEXT:    s_subb_u32 s2, s6, s14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s19
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    s_subb_u32 s3, s7, s15
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
 ; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
@@ -5823,24 +5823,24 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_and_b32 s6, s6, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX8-NEXT:    s_addc_u32 s6, s7, 0
-; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_and_b32 s9, s9, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    v_mov_b32_e32 v8, s1
 ; GFX8-NEXT:    s_addc_u32 s7, s7, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NEXT:    v_mov_b32_e32 v8, s1
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v2, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8-NEXT:    v_mov_b32_e32 v8, s2
+; GFX8-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v5
@@ -5870,8 +5870,8 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_subb_u32 s19, s3, s11
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    s_cmp_eq_u64 s[18:19], s[2:3]
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
@@ -5919,24 +5919,24 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_and_b32 s2, s2, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v4, s17
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s16
+; GFX9-NEXT:    v_mov_b32_e32 v4, s17
 ; GFX9-NEXT:    s_subb_u32 s2, s6, s14
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s19
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    s_subb_u32 s3, s7, s15
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
 ; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
@@ -5964,24 +5964,24 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_and_b32 s6, s6, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX9-NEXT:    s_addc_u32 s6, s7, 0
-; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_and_b32 s9, s9, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-NEXT:    v_mov_b32_e32 v8, s1
 ; GFX9-NEXT:    s_addc_u32 s7, s7, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    v_mov_b32_e32 v8, s1
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v8, v2, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    v_mov_b32_e32 v8, s2
+; GFX9-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v5

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index 8c1bc5fb57ca7..12821847dc287 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -58,47 +58,47 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_lshr_b32 s1, s4, 16
 ; GFX9-NEXT:    ds_write_b8 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_lshr_b32 s1, s4, 16
+; GFX9-NEXT:    s_lshr_b32 s3, s4, 24
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    s_lshr_b32 s3, s4, 24
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:3
-; GFX9-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    s_lshr_b32 s1, s5, 16
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_lshr_b32 s1, s5, 16
+; GFX9-NEXT:    s_lshr_b32 s2, s5, 24
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:5
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    s_lshr_b32 s2, s5, 24
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:6
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:7
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    s_lshr_b32 s0, s6, 8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    s_lshr_b32 s1, s6, 16
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_lshr_b32 s1, s6, 16
+; GFX9-NEXT:    s_lshr_b32 s2, s6, 24
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:9
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    s_lshr_b32 s2, s6, 24
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:10
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:11
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX9-NEXT:    s_lshr_b32 s0, s7, 8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    s_lshr_b32 s1, s7, 16
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:12
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_lshr_b32 s1, s7, 16
+; GFX9-NEXT:    s_lshr_b32 s2, s7, 24
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:13
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    s_lshr_b32 s2, s7, 24
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:14
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:15
@@ -111,47 +111,47 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s4
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    s_lshr_b32 s5, s0, 8
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
+; GFX7-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX7-NEXT:    ds_write_b8 v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s5
-; GFX7-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX7-NEXT:    s_lshr_b32 s7, s0, 24
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:3
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    s_lshr_b32 s0, s1, 8
+; GFX7-NEXT:    v_mov_b32_e32 v0, s1
+; GFX7-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:4
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX7-NEXT:    s_lshr_b32 s5, s1, 24
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:5
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:6
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:7
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    s_lshr_b32 s0, s2, 8
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    s_lshr_b32 s1, s2, 16
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:8
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    s_lshr_b32 s1, s2, 16
 ; GFX7-NEXT:    s_lshr_b32 s4, s2, 24
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:9
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:10
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:11
-; GFX7-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX7-NEXT:    s_lshr_b32 s0, s3, 8
+; GFX7-NEXT:    v_mov_b32_e32 v0, s3
+; GFX7-NEXT:    s_lshr_b32 s1, s3, 16
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:12
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    s_lshr_b32 s1, s3, 16
+; GFX7-NEXT:    s_lshr_b32 s2, s3, 24
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:13
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    s_lshr_b32 s2, s3, 24
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:14
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:15
@@ -164,8 +164,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX10-NEXT:    s_lshr_b32 s1, s4, 16
 ; GFX10-NEXT:    s_lshr_b32 s3, s4, 24
 ; GFX10-NEXT:    s_lshr_b32 s2, s5, 8
@@ -174,8 +174,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-NEXT:    s_lshr_b32 s5, s6, 8
 ; GFX10-NEXT:    s_lshr_b32 s9, s6, 16
-; GFX10-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v10, s5
 ; GFX10-NEXT:    s_lshr_b32 s0, s6, 24
@@ -220,23 +220,23 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    ds_write_b16 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    ds_write_b16 v1, v0 offset:2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NEXT:    s_lshr_b32 s0, s5, 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NEXT:    ds_write_b16 v1, v0 offset:4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    ds_write_b16 v1, v0 offset:6
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    s_lshr_b32 s0, s6, 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    ds_write_b16 v1, v0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    ds_write_b16 v1, v0 offset:10
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX9-NEXT:    s_lshr_b32 s0, s7, 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX9-NEXT:    ds_write_b16 v1, v0 offset:12
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    ds_write_b16 v1, v0 offset:14
@@ -249,23 +249,23 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s4
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    s_lshr_b32 s5, s0, 16
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    ds_write_b16 v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX7-NEXT:    ds_write_b16 v1, v0 offset:2
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    ds_write_b16 v1, v0 offset:4
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    ds_write_b16 v1, v0 offset:6
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    ds_write_b16 v1, v0 offset:8
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    ds_write_b16 v1, v0 offset:10
-; GFX7-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX7-NEXT:    s_lshr_b32 s0, s3, 16
+; GFX7-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX7-NEXT:    ds_write_b16 v1, v0 offset:12
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    ds_write_b16 v1, v0 offset:14
@@ -312,8 +312,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out,
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NEXT:    ds_write2_b32 v1, v0, v2 offset1:1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-NEXT:    ds_write2_b32 v1, v0, v2 offset1:1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX9-NEXT:    ds_write2_b32 v1, v3, v0 offset0:2 offset1:3
 ; GFX9-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
index c96a98fe631f5..b4b9660172091 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
@@ -55,36 +55,36 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX9-NEXT:    s_lshr_b32 s0, s12, 8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    s_lshr_b32 s1, s12, 16
 ; GFX9-NEXT:    ds_write_b8 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_lshr_b32 s1, s12, 16
+; GFX9-NEXT:    s_lshr_b32 s3, s12, 24
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    s_lshr_b32 s3, s12, 24
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:3
-; GFX9-NEXT:    v_mov_b32_e32 v0, s13
 ; GFX9-NEXT:    s_lshr_b32 s0, s13, 8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s13
+; GFX9-NEXT:    s_lshr_b32 s1, s13, 16
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_lshr_b32 s1, s13, 16
+; GFX9-NEXT:    s_lshr_b32 s2, s13, 24
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:5
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    s_lshr_b32 s2, s13, 24
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:6
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:7
-; GFX9-NEXT:    v_mov_b32_e32 v0, s14
 ; GFX9-NEXT:    s_lshr_b32 s0, s14, 8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s14
+; GFX9-NEXT:    s_lshr_b32 s1, s14, 16
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_lshr_b32 s1, s14, 16
+; GFX9-NEXT:    s_lshr_b32 s2, s14, 24
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:9
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    s_lshr_b32 s2, s14, 24
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:10
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:11
@@ -97,36 +97,36 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s4
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
+; GFX7-NEXT:    s_lshr_b32 s5, s0, 16
 ; GFX7-NEXT:    ds_write_b8 v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s3
-; GFX7-NEXT:    s_lshr_b32 s5, s0, 16
 ; GFX7-NEXT:    s_lshr_b32 s6, s0, 24
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:3
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    s_lshr_b32 s0, s1, 8
+; GFX7-NEXT:    v_mov_b32_e32 v0, s1
+; GFX7-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:4
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX7-NEXT:    s_lshr_b32 s4, s1, 24
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:5
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:6
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:7
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    s_lshr_b32 s0, s2, 8
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    s_lshr_b32 s1, s2, 16
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:8
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    s_lshr_b32 s1, s2, 16
+; GFX7-NEXT:    s_lshr_b32 s3, s2, 24
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:9
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    s_lshr_b32 s3, s2, 24
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:10
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX7-NEXT:    ds_write_b8 v1, v0 offset:11
@@ -139,9 +139,9 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_lshr_b32 s0, s12, 8
+; GFX10-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX10-NEXT:    s_lshr_b32 s5, s13, 24
-; GFX10-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX10-NEXT:    s_lshr_b32 s1, s12, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s13
 ; GFX10-NEXT:    s_lshr_b32 s3, s12, 24
@@ -184,18 +184,18 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
 ; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX9-NEXT:    s_lshr_b32 s0, s12, 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX9-NEXT:    ds_write_b16 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    ds_write_b16 v1, v0 offset:2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s13
 ; GFX9-NEXT:    s_lshr_b32 s0, s13, 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, s13
 ; GFX9-NEXT:    ds_write_b16 v1, v0 offset:4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    ds_write_b16 v1, v0 offset:6
-; GFX9-NEXT:    v_mov_b32_e32 v0, s14
 ; GFX9-NEXT:    s_lshr_b32 s0, s14, 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, s14
 ; GFX9-NEXT:    ds_write_b16 v1, v0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    ds_write_b16 v1, v0 offset:10
@@ -208,18 +208,18 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s4
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    ds_write_b16 v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX7-NEXT:    ds_write_b16 v1, v0 offset:2
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX7-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX7-NEXT:    ds_write_b16 v1, v0 offset:4
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    ds_write_b16 v1, v0 offset:6
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    ds_write_b16 v1, v0 offset:8
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    ds_write_b16 v1, v0 offset:10

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 53a6250892432..f69e522cfdecc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -289,8 +289,8 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
-; GFX8-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s3, s4
 ; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
 ; GFX8-NEXT:    s_lshl_b32 s0, s2, s4
@@ -308,8 +308,8 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX9-NEXT:    s_mov_b32 s2, 0x80008
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
@@ -332,8 +332,8 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX10-NEXT:    s_mov_b32 s2, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
@@ -385,13 +385,13 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_min_u32_e32 v4, v5, v4
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -436,20 +436,20 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0xffff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v8, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX9-NEXT:    v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX9-NEXT:    v_and_or_b32 v2, v3, v8, v2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v8, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-NEXT:    v_and_or_b32 v2, v3, v8, v2
 ; GFX9-NEXT:    v_and_or_b32 v1, v1, v8, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX9-NEXT:    v_and_or_b32 v3, v6, v8, v3
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_and_or_b32 v3, v6, v8, v3
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_add_u16 v1, v2, v3 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 8
@@ -536,13 +536,13 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_not_b32 s5, s3
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX6-NEXT:    s_min_u32 s4, s5, s4
-; GFX6-NEXT:    s_add_i32 s3, s3, s4
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 24
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX6-NEXT:    s_add_i32 s3, s3, s4
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX6-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -555,30 +555,30 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s8
-; GFX8-NEXT:    v_mov_b32_e32 v0, s1
-; GFX8-NEXT:    s_lshl_b32 s1, s5, s8
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s8
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    s_lshl_b32 s1, s5, s8
 ; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    s_lshl_b32 s0, s2, s8
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u16_e64 v1, s0, v1 clamp
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s6, s8
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s0, s3, s8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s7, s8
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_add_u16_e64 v2, s0, v2 clamp
 ; GFX8-NEXT:    s_lshl_b32 s0, s4, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX8-NEXT:    v_add_u16_e64 v3, s0, v3 clamp
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_add_u16_e64 v3, s0, v3 clamp
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -592,19 +592,19 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX9-NEXT:    s_mov_b32 s4, 0x80008
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, s4
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s8, s9
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, s4
@@ -613,19 +613,19 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT:    s_lshr_b32 s7, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s4, s6, s4
 ; GFX9-NEXT:    s_lshl_b32 s6, s7, 8
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_add_u16 v0, s0, v0 clamp
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    v_pk_add_u16 v1, s3, v1 clamp
 ; GFX9-NEXT:    s_mov_b32 s2, 8
+; GFX9-NEXT:    v_pk_add_u16 v1, s3, v1 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    s_mov_b32 s5, 24
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s0, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, s0, v1
-; GFX9-NEXT:    s_mov_b32 s5, 24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
@@ -639,8 +639,8 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
-; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-NEXT:    s_mov_b32 s3, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s7, s1, 24
@@ -658,8 +658,8 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
 ; GFX10-NEXT:    s_lshl_b32 s4, s6, 8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s8
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX10-NEXT:    v_pk_add_u16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_add_u16 v1, s2, s3 clamp
@@ -1967,10 +1967,10 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -2033,10 +2033,10 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX6-NEXT:    s_add_i32 s3, s3, s4
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_or_b32 s1, s2, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -2044,20 +2044,20 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX8-LABEL: s_uaddsat_v4i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
-; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX8-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_add_u16_e64 v1, s4, v1 clamp
+; GFX8-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX8-NEXT:    v_add_u16_e64 v3, s5, v3 clamp
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_add_u16_e64 v2, s1, v2 clamp
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
@@ -2131,16 +2131,16 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v7, -1, v5
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_u32_e32 v6, v7, v6
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -2218,16 +2218,16 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_not_b32 s7, s5
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX6-NEXT:    s_min_u32 s6, s7, s6
-; GFX6-NEXT:    s_add_i32 s5, s5, s6
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_add_i32 s5, s5, s6
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_lshr_b32 s5, s5, 16
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX6-NEXT:    s_or_b32 s1, s2, s1
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
 ; GFX6-NEXT:    s_or_b32 s2, s4, s2
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -2235,28 +2235,28 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-LABEL: s_uaddsat_v6i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
-; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s9
-; GFX8-NEXT:    v_mov_b32_e32 v0, s3
-; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
-; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX8-NEXT:    v_add_u16_e64 v1, s6, v1 clamp
+; GFX8-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX8-NEXT:    v_mov_b32_e32 v6, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
-; GFX8-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_add_u16_e64 v3, s7, v3 clamp
+; GFX8-NEXT:    v_mov_b32_e32 v5, s11
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_add_u16_e64 v2, s1, v2 clamp
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s5
+; GFX8-NEXT:    v_add_u16_e64 v5, s8, v5 clamp
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_add_u16_e64 v2, s1, v2 clamp
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_add_u16_e64 v5, s8, v5 clamp
-; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_add_u16_e64 v4, s2, v4 clamp
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
@@ -2335,19 +2335,19 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v9, -1, v7
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_u32_e32 v8, v9, v8
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v6, v3
@@ -2442,19 +2442,19 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_not_b32 s9, s7
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX6-NEXT:    s_min_u32 s8, s9, s8
-; GFX6-NEXT:    s_add_i32 s7, s7, s8
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_add_i32 s7, s7, s8
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
-; GFX6-NEXT:    s_lshr_b32 s7, s7, 16
 ; GFX6-NEXT:    s_lshr_b32 s5, s5, 16
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_lshr_b32 s7, s7, 16
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
 ; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
 ; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
 ; GFX6-NEXT:    s_or_b32 s2, s4, s2
 ; GFX6-NEXT:    s_or_b32 s3, s6, s3
@@ -2463,35 +2463,35 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-LABEL: s_uaddsat_v8i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s12
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
 ; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
-; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX8-NEXT:    v_mov_b32_e32 v3, s13
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_add_u16_e64 v1, s8, v1 clamp
+; GFX8-NEXT:    v_mov_b32_e32 v3, s13
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX8-NEXT:    v_mov_b32_e32 v5, s14
 ; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
-; GFX8-NEXT:    v_mov_b32_e32 v7, s15
 ; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:    v_add_u16_e64 v3, s9, v3 clamp
+; GFX8-NEXT:    v_mov_b32_e32 v5, s14
+; GFX8-NEXT:    v_mov_b32_e32 v7, s15
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_add_u16_e64 v2, s1, v2 clamp
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_add_u16_e64 v5, s10, v5 clamp
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s7
+; GFX8-NEXT:    v_add_u16_e64 v7, s11, v7 clamp
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_add_u16_e64 v2, s1, v2 clamp
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_add_u16_e64 v7, s11, v7 clamp
-; GFX8-NEXT:    v_add_u16_e64 v5, s10, v5 clamp
-; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_add_u16_e64 v4, s2, v4 clamp
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_add_u16_e64 v6, s3, v6 clamp
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -2838,8 +2838,8 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
-; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
 ; GFX6-NEXT:    s_add_u32 s0, s2, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
@@ -2870,8 +2870,8 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
 ; GFX8-NEXT:    s_add_u32 s0, s2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
@@ -2902,8 +2902,8 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
@@ -2942,9 +2942,9 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s5, s[2:3], s[6:7]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, -1, s4
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, s2, -1, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, s3, -1, s5
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -2969,8 +2969,8 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    s_and_b32 s8, s8, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -2980,8 +2980,8 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s2
@@ -3011,8 +3011,8 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    s_addc_u32 s3, s3, s7
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
 ; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
@@ -3024,8 +3024,8 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
@@ -3055,8 +3055,8 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    s_addc_u32 s3, s3, s7
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
 ; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
@@ -3068,8 +3068,8 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -3479,8 +3479,8 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_and_b32 s16, s16, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s10
+; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s11
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -3503,17 +3503,17 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v1, -1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, v2, -1, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, v2, -1, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX6-NEXT:    s_and_b32 s3, s3, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s13
-; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v0, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, v1, -1, vcc
-; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
+; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    s_addc_u32 s3, s7, s15
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -3523,12 +3523,12 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s2
+; GFX6-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
 ; GFX6-NEXT:    v_readfirstlane_b32 s0, v4
@@ -3558,8 +3558,8 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8-NEXT:    s_addc_u32 s3, s3, s11
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], s[10:11]
 ; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
@@ -3586,18 +3586,18 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v1, -1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v2, -1, vcc
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s13
-; GFX8-NEXT:    s_addc_u32 s3, s7, s15
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v0, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v1, -1, vcc
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
+; GFX8-NEXT:    s_addc_u32 s3, s7, s15
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s14
-; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], s[14:15]
+; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s15
+; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], s[14:15]
 ; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
@@ -3607,12 +3607,12 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v4
@@ -3642,8 +3642,8 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-NEXT:    s_addc_u32 s3, s3, s11
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s10
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], s[10:11]
 ; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
@@ -3670,18 +3670,18 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v1, -1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v2, -1, vcc
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s13
-; GFX9-NEXT:    s_addc_u32 s3, s7, s15
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v0, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v1, -1, vcc
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT:    s_addc_u32 s3, s7, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s14
-; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], s[14:15]
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s15
+; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], s[14:15]
 ; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
@@ -3691,12 +3691,12 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v4
@@ -3746,8 +3746,8 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s9
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_addc_u32 s7, s7, s15
 ; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], s[14:15]
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s9, s[6:7], s[14:15]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 11d115fcc5c05..133a224b7437c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -88,8 +88,8 @@ define amdgpu_kernel void @udivrem_i32(i32 addrspace(1)* %out0, i32 addrspace(1)
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v0, s7
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s6, v1
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s7, v1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
@@ -129,11 +129,11 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
 ; GFX8-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s11
-; GFX8-NEXT:    v_mul_lo_u32 v3, s3, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX8-NEXT:    v_mul_lo_u32 v3, s3, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v5, s2, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v4, s2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
@@ -149,12 +149,12 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v0, v2
-; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
+; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
@@ -179,12 +179,12 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v4
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v8, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GFX8-NEXT:    v_mul_hi_u32 v3, v3, v4
 ; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v8
+; GFX8-NEXT:    v_mul_hi_u32 v3, v3, v4
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v5, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v7, v5
@@ -204,12 +204,12 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_mul_lo_u32 v5, s9, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v3, s8, v1
-; GFX8-NEXT:    v_mul_hi_u32 v1, s9, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
+; GFX8-NEXT:    v_mul_hi_u32 v1, s9, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
@@ -235,12 +235,12 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], 1, v0
 ; GFX8-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v8
-; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v7
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s10, v7
+; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v8
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s10, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v9
 ; GFX8-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
@@ -282,10 +282,10 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
 ; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v3, v2, v4
@@ -301,15 +301,15 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v7, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, v6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
-; GFX9-NEXT:    v_add_u32_e32 v3, v6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v5, v5, v6
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v3
@@ -383,20 +383,20 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], 1, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v1, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v9
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v8
-; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s10, v8
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v9
+; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s10, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v10
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v10, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v11, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
@@ -415,11 +415,11 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s10
 ; GFX10-NEXT:    s_sub_u32 s1, 0, s10
 ; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    s_and_b32 s0, s0, 1
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_subb_u32 s2, 0, s11
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
@@ -511,8 +511,8 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v4, v3
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v0, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v5, vcc_lo, s8, v5
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s9, v2
+; GFX10-NEXT:    v_sub_co_u32 v5, vcc_lo, s8, v5
 ; GFX10-NEXT:    v_sub_co_ci_u32_e64 v7, s0, s9, v2, vcc_lo
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v2, vcc_lo, s11, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s10, v5
@@ -536,8 +536,8 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX10-NEXT:    v_sub_co_u32 v10, s0, v8, s10
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s0, 0, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v9, v2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 0
@@ -646,15 +646,15 @@ define amdgpu_kernel void @udivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s2, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
 ; GFX9-NEXT:    v_subrev_u32_e32 v5, s3, v3
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s2, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
@@ -683,12 +683,10 @@ define amdgpu_kernel void @udivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s0, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, s1, v1
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
-; GFX10-NEXT:    s_nop 0
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -700,17 +698,17 @@ define amdgpu_kernel void @udivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s0, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s1, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s2, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s2, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
@@ -772,10 +770,10 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v5, v6
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 1, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v2
-; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s9, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s9, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 1, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
@@ -900,8 +898,8 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX9-NEXT:    v_mul_hi_u32 v7, s11, v2
 ; GFX9-NEXT:    v_add_u32_e32 v8, 1, v3
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v6
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v7, s3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v2, 1, v3
@@ -999,25 +997,25 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v12, s2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s8, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v5
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s8, v4
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v5
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s10, v6
 ; GFX10-NEXT:    v_add_nc_u32_e32 v12, 1, v3
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s11, v7
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v13, s11, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc_lo
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v9, s8, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, s9, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v11, s1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v11, s10, v6
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v13, s11, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v12, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v10, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s2
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
 ; GFX10-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
@@ -1051,11 +1049,11 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
 ; GFX8-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s9
-; GFX8-NEXT:    v_mul_lo_u32 v3, s3, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX8-NEXT:    v_mul_lo_u32 v3, s3, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v5, s2, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v4, s2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
@@ -1071,12 +1069,12 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v0, v2
-; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
+; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
@@ -1102,12 +1100,12 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v4
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v8, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GFX8-NEXT:    v_mul_hi_u32 v3, v3, v4
 ; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v8
+; GFX8-NEXT:    v_mul_hi_u32 v3, v3, v4
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v5, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v7, v5
@@ -1127,12 +1125,12 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v5, s13, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v3, s12, v1
-; GFX8-NEXT:    v_mul_hi_u32 v1, s13, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
+; GFX8-NEXT:    v_mul_hi_u32 v1, s13, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
@@ -1158,24 +1156,24 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], 1, v0
 ; GFX8-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v8
-; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s8, v7
+; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v8
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s8, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v9
 ; GFX8-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
 ; GFX8-NEXT:    v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, s11
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[0:1]
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v9, s10
-; GFX8-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX8-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v4, v4, v9
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v7, v4
@@ -1188,9 +1186,9 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_f32_e32 v6, 0xcf800000, v3
 ; GFX8-NEXT:    v_add_f32_e32 v2, v6, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[0:1]
-; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX8-NEXT:    s_and_b32 s0, s0, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX8-NEXT:    s_subb_u32 s3, 0, s11
@@ -1212,12 +1210,12 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v11, v3, v6
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
 ; GFX8-NEXT:    v_mul_hi_u32 v9, v2, v6
-; GFX8-NEXT:    v_mul_hi_u32 v6, v3, v6
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v11, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v11, v9
+; GFX8-NEXT:    v_mul_hi_u32 v6, v3, v6
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v8, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v9, v8
@@ -1242,12 +1240,12 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v9, v7, v8
 ; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v12, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v12, v2, v8
-; GFX8-NEXT:    v_mul_hi_u32 v7, v7, v8
 ; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], v9, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], v9, v12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v11, s[0:1], v11, v12
+; GFX8-NEXT:    v_mul_hi_u32 v7, v7, v8
 ; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v9, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v8, s[0:1], v11, v9
@@ -1267,12 +1265,12 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v9, s15, v3
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v7, s14, v3
-; GFX8-NEXT:    v_mul_hi_u32 v3, s15, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
+; GFX8-NEXT:    v_mul_hi_u32 v3, s15, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
@@ -1347,11 +1345,11 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
 ; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v14, s11
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v14, 0x4f800000, v14
@@ -1368,15 +1366,15 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v7, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, v6, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v5, v5, v6
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v3
@@ -1450,14 +1448,14 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s10
 ; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], 1, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[0:1], 0, v1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v8
 ; GFX9-NEXT:    v_add_f32_e32 v5, v14, v5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX9-NEXT:    v_subrev_co_u32_e32 v15, vcc, s8, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v8
+; GFX9-NEXT:    v_subrev_co_u32_e32 v15, vcc, s8, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
@@ -1470,9 +1468,9 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1]
 ; GFX9-NEXT:    v_add_f32_e32 v5, v12, v5
 ; GFX9-NEXT:    s_sub_u32 s8, 0, s10
-; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    s_and_b32 s0, s0, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX9-NEXT:    s_subb_u32 s9, 0, s11
@@ -1503,10 +1501,10 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_co_u32_e64 v10, s[2:3], v12, v10
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[2:3]
 ; GFX9-NEXT:    v_add_co_u32_e64 v9, s[2:3], v10, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[2:3]
 ; GFX9-NEXT:    v_add_u32_e32 v12, v13, v12
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[2:3], v5, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[2:3]
 ; GFX9-NEXT:    v_add3_u32 v10, v12, v10, v4
+; GFX9-NEXT:    v_add_co_u32_e64 v5, s[2:3], v5, v9
 ; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[6:7], v11, v10, s[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v12, s9, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v13, s8, v9
@@ -1597,8 +1595,8 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_subrev_co_u32_e64 v15, s[0:1], s10, v11
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v13, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v11, v15, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v12, v6, vcc
@@ -1674,64 +1672,63 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_mul_hi_u32 v14, v0, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v7, s0, v13, v7
-; GFX10-NEXT:    v_mul_hi_u32 v17, v1, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v9, s0, v9, v15
-; GFX10-NEXT:    v_mul_hi_u32 v4, v2, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v11, s0, v16, v11
-; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v6
+; GFX10-NEXT:    v_mul_hi_u32 v17, v1, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v7, v14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v9, s0, v9, v10
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, v12, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, v12, v5
 ; GFX10-NEXT:    v_add_co_u32 v10, s0, v11, v17
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v13, v7
+; GFX10-NEXT:    v_mul_hi_u32 v4, v2, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, v15, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, v6, v5
-; GFX10-NEXT:    v_add_nc_u32_e32 v9, v15, v9
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v13, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_add_nc_u32_e32 v11, v16, v11
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
+; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v8
 ; GFX10-NEXT:    v_add_co_u32 v9, s0, v10, v9
-; GFX10-NEXT:    v_add3_u32 v4, v7, v6, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, v16, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX10-NEXT:    v_mul_lo_u32 v7, s2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, v2, v4, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v4, v7, v6, v4
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
 ; GFX10-NEXT:    v_add3_u32 v5, v11, v10, v8
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v1, s0, v1, v9
 ; GFX10-NEXT:    v_mul_lo_u32 v8, s1, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v10, s1, v3, v5, s0
 ; GFX10-NEXT:    v_mul_hi_u32 v9, s2, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v11, s2, v6
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v10, s1, v3, v5, s0
 ; GFX10-NEXT:    v_mul_lo_u32 v13, s6, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v14, s3, v1
-; GFX10-NEXT:    v_mul_lo_u32 v12, s3, v1
-; GFX10-NEXT:    v_mul_lo_u32 v16, v6, v7
 ; GFX10-NEXT:    v_mul_lo_u32 v15, s3, v10
-; GFX10-NEXT:    v_mul_hi_u32 v17, v0, v7
-; GFX10-NEXT:    v_add3_u32 v8, v8, v11, v9
-; GFX10-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX10-NEXT:    v_mul_lo_u32 v7, s2, v0
+; GFX10-NEXT:    v_mul_lo_u32 v12, s3, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v5
-; GFX10-NEXT:    v_mul_lo_u32 v9, v10, v12
-; GFX10-NEXT:    v_mul_hi_u32 v18, v0, v8
+; GFX10-NEXT:    v_add3_u32 v8, v8, v11, v9
 ; GFX10-NEXT:    v_add3_u32 v13, v13, v15, v14
+; GFX10-NEXT:    v_mul_lo_u32 v16, v6, v7
 ; GFX10-NEXT:    v_mul_lo_u32 v14, v0, v8
+; GFX10-NEXT:    v_mul_hi_u32 v17, v0, v7
+; GFX10-NEXT:    v_mul_hi_u32 v7, v6, v7
 ; GFX10-NEXT:    v_mul_lo_u32 v15, v6, v8
+; GFX10-NEXT:    v_mul_lo_u32 v9, v10, v12
+; GFX10-NEXT:    v_mul_hi_u32 v18, v0, v8
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v6, v8
-; GFX10-NEXT:    v_mul_hi_u32 v11, v1, v12
 ; GFX10-NEXT:    v_mul_lo_u32 v8, v1, v13
-; GFX10-NEXT:    v_mul_hi_u32 v12, v10, v12
-; GFX10-NEXT:    v_mul_lo_u32 v19, v10, v13
-; GFX10-NEXT:    v_mul_hi_u32 v20, v1, v13
 ; GFX10-NEXT:    v_add_co_u32 v14, s1, v16, v14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v7, s1, v15, v7
+; GFX10-NEXT:    v_mul_hi_u32 v11, v1, v12
+; GFX10-NEXT:    v_mul_hi_u32 v12, v10, v12
+; GFX10-NEXT:    v_mul_lo_u32 v19, v10, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v8, s1, v9, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s1
@@ -1745,20 +1742,21 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_add_co_u32 v8, s1, v8, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v7, s1, v7, v14
+; GFX10-NEXT:    v_mul_hi_u32 v20, v1, v13
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, v15, v18
 ; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v12, s1, v12, v20
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, v9, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s1
 ; GFX10-NEXT:    v_add3_u32 v4, v11, v14, v6
+; GFX10-NEXT:    v_add_co_u32 v12, s1, v12, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s1
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v10, v13
-; GFX10-NEXT:    v_mov_b32_e32 v10, 0
-; GFX10-NEXT:    v_add_nc_u32_e32 v9, v17, v15
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v4, s1, v12, v8
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, v17, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX10-NEXT:    v_add3_u32 v5, v9, v8, v6
 ; GFX10-NEXT:    v_mul_lo_u32 v6, s17, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v7, s16, v2
@@ -1768,47 +1766,47 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, vcc_lo, v3, v5, s0
 ; GFX10-NEXT:    v_mul_hi_u32 v5, s16, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v2, s17, v2
-; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v4
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v7
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v8, s0, v9, v8
 ; GFX10-NEXT:    v_add_co_u32 v0, s1, v6, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, v8, v5
-; GFX10-NEXT:    v_mul_hi_u32 v12, s18, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v7, v0
-; GFX10-NEXT:    v_mul_hi_u32 v7, s18, v1
+; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v4
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, s0, v5, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, v6, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
 ; GFX10-NEXT:    v_mul_lo_u32 v6, s19, v1
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v5, v0
+; GFX10-NEXT:    v_mul_hi_u32 v7, s18, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s19, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
 ; GFX10-NEXT:    v_mul_lo_u32 v8, s9, v0
-; GFX10-NEXT:    v_mul_hi_u32 v9, s8, v0
 ; GFX10-NEXT:    v_add3_u32 v2, v4, v5, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v4, s18, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v5, s19, v3
+; GFX10-NEXT:    v_mul_hi_u32 v9, s8, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v13, s8, v0
-; GFX10-NEXT:    v_mul_hi_u32 v3, s19, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v11, s8, v2
+; GFX10-NEXT:    v_mul_hi_u32 v12, s18, v3
+; GFX10-NEXT:    v_mul_hi_u32 v3, s19, v3
 ; GFX10-NEXT:    v_add_co_u32 v4, s0, v6, v4
 ; GFX10-NEXT:    v_add_co_u32 v1, s1, v5, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s1
 ; GFX10-NEXT:    v_add3_u32 v5, v8, v11, v9
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
 ; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, s16, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v7
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v7, s17, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
 ; GFX10-NEXT:    v_sub_co_ci_u32_e64 v5, s0, s17, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v8
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, v6, v4
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s9, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s0
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v6, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v13, vcc_lo, v8, s8
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v14, s0, 0, v7, vcc_lo
@@ -1829,35 +1827,35 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v4, s0, v17, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v15, s0, 0, v18, s0
-; GFX10-NEXT:    v_sub_co_u32 v19, s0, v13, s8
 ; GFX10-NEXT:    v_add3_u32 v3, v11, v1, v3
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s9, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
 ; GFX10-NEXT:    v_mul_lo_u32 v6, s11, v12
-; GFX10-NEXT:    v_mul_hi_u32 v11, s10, v12
 ; GFX10-NEXT:    v_mul_lo_u32 v7, s10, v3
+; GFX10-NEXT:    v_mul_hi_u32 v11, s10, v12
+; GFX10-NEXT:    v_sub_co_u32 v19, s0, v13, s8
 ; GFX10-NEXT:    v_mul_lo_u32 v16, s10, v12
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v20, s0, 0, v1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v18, v15, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v20, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v6, v6, v7, v11
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v20, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v7, s1, s18, v16
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v9, s2, s19, v6, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, v4, s0
-; GFX10-NEXT:    v_sub_co_u32 v7, s1, s18, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v13, v19, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s19, v6
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v9, s2, s19, v6, s1
+; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s11, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v13, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s11, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v14, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s0
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s1, s11, v2, s1
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s10, v7
-; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s11, v9
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s11, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s1
 ; GFX10-NEXT:    v_sub_co_u32 v13, s1, v7, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v15, s2, 0, v2, s1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s11, v15
@@ -1875,8 +1873,8 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_sub_co_u32 v8, s1, v13, s10
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s1, 0, v2, s1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v13, v8, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v15, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v12, v11, s1
@@ -1984,8 +1982,8 @@ define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out0, i8 addrspace(1)* %out
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s6, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s6, v1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s6, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
@@ -2046,8 +2044,8 @@ define amdgpu_kernel void @udivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s3
 ; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s2, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
@@ -2110,9 +2108,9 @@ define amdgpu_kernel void @udivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
+; GFX9-NEXT:    v_sub_u32_e32 v2, s9, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
-; GFX9-NEXT:    v_sub_u32_e32 v2, s9, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
@@ -2125,9 +2123,9 @@ define amdgpu_kernel void @udivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
 ; GFX9-NEXT:    v_and_b32_e32 v0, s4, v2
@@ -2138,10 +2136,7 @@ define amdgpu_kernel void @udivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ;
 ; GFX10-LABEL: udivrem_v2i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x10
-; GFX10-NEXT:    s_nop 0
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v0, s0
 ; GFX10-NEXT:    s_bfe_u32 s1, s0, 0x80010
@@ -2149,6 +2144,7 @@ define amdgpu_kernel void @udivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, s1
 ; GFX10-NEXT:    s_sub_i32 s3, 0, s2
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
@@ -2171,8 +2167,8 @@ define amdgpu_kernel void @udivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s3, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s0, v3
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s2, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s1, v3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
@@ -2195,6 +2191,7 @@ define amdgpu_kernel void @udivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> a
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_store_short v1, v0, s[4:5]
 ; GFX10-NEXT:    global_store_short v1, v2, s[6:7]
 ; GFX10-NEXT:    s_endpgm
@@ -2296,8 +2293,8 @@ define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out0, i16 addrspace(1)* %
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s6, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s6, v1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s6, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
@@ -2361,8 +2358,8 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s3
 ; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s6, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
@@ -2426,11 +2423,11 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
+; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v2
-; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -2487,8 +2484,8 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s6, v2
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s0, v3
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s1, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
@@ -2554,8 +2551,8 @@ define amdgpu_kernel void @udivrem_i3(i3 addrspace(1)* %out0, i3 addrspace(1)* %
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s6, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 7, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -2590,8 +2587,8 @@ define amdgpu_kernel void @udivrem_i3(i3 addrspace(1)* %out0, i3 addrspace(1)* %
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s6, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
 ; GFX9-NEXT:    v_and_b32_e32 v0, 7, v1
@@ -2617,8 +2614,8 @@ define amdgpu_kernel void @udivrem_i3(i3 addrspace(1)* %out0, i3 addrspace(1)* %
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s6, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s6, v1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s6, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
@@ -2673,8 +2670,8 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s7, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, s6, v2
-; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_and_b32_e32 v2, s6, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -2710,8 +2707,8 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    v_and_b32_e32 v0, s6, v1
@@ -2738,8 +2735,8 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s7, v1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 1775b535ec0b0..502a952df4e11 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -281,8 +281,8 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
-; GFX8-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s3, s4
 ; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
 ; GFX8-NEXT:    s_lshl_b32 s0, s2, s4
@@ -300,8 +300,8 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX9-NEXT:    s_mov_b32 s2, 0x80008
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
@@ -324,8 +324,8 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshr_b32 s3, s1, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX10-NEXT:    s_mov_b32 s2, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
@@ -373,13 +373,13 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_min_u32_e32 v4, v3, v4
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -424,20 +424,20 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0xffff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v8, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX9-NEXT:    v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX9-NEXT:    v_and_or_b32 v2, v3, v8, v2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v8, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-NEXT:    v_and_or_b32 v2, v3, v8, v2
 ; GFX9-NEXT:    v_and_or_b32 v1, v1, v8, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX9-NEXT:    v_and_or_b32 v3, v6, v8, v3
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_and_or_b32 v3, v6, v8, v3
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v2, v3 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 8
@@ -520,13 +520,13 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX6-NEXT:    s_min_u32 s4, s3, s4
-; GFX6-NEXT:    s_sub_i32 s3, s3, s4
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 24
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX6-NEXT:    s_sub_i32 s3, s3, s4
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX6-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -539,30 +539,30 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s8
-; GFX8-NEXT:    v_mov_b32_e32 v0, s1
-; GFX8-NEXT:    s_lshl_b32 s1, s5, s8
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s8
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    s_lshl_b32 s1, s5, s8
 ; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    s_lshl_b32 s0, s2, s8
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_sub_u16_e64 v1, s0, v1 clamp
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s6, s8
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s0, s3, s8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s7, s8
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_sub_u16_e64 v2, s0, v2 clamp
 ; GFX8-NEXT:    s_lshl_b32 s0, s4, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX8-NEXT:    v_sub_u16_e64 v3, s0, v3 clamp
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_sub_u16_e64 v3, s0, v3 clamp
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -576,19 +576,19 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX9-NEXT:    s_mov_b32 s4, 0x80008
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, s4
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s8, s9
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, s4
@@ -597,19 +597,19 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT:    s_lshr_b32 s7, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s4, s6, s4
 ; GFX9-NEXT:    s_lshl_b32 s6, s7, 8
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_sub_u16 v0, s0, v0 clamp
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    v_pk_sub_u16 v1, s3, v1 clamp
 ; GFX9-NEXT:    s_mov_b32 s2, 8
+; GFX9-NEXT:    v_pk_sub_u16 v1, s3, v1 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    s_mov_b32 s5, 24
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s0, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, s0, v1
-; GFX9-NEXT:    s_mov_b32 s5, 24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
@@ -623,8 +623,8 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
-; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-NEXT:    s_mov_b32 s3, 0x80008
+; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s7, s1, 24
@@ -642,8 +642,8 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
 ; GFX10-NEXT:    s_lshl_b32 s4, s6, 8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s8
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX10-NEXT:    v_pk_sub_u16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_sub_u16 v1, s2, s3 clamp
@@ -1869,10 +1869,10 @@ define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -1931,10 +1931,10 @@ define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s4
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_or_b32 s1, s2, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -1942,20 +1942,20 @@ define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX8-LABEL: s_usubsat_v4i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
-; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX8-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_sub_u16_e64 v1, s4, v1 clamp
+; GFX8-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX8-NEXT:    v_sub_u16_e64 v3, s5, v3 clamp
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_sub_u16_e64 v2, s1, v2 clamp
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
@@ -2023,16 +2023,16 @@ define <3 x float> @v_usubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_u32_e32 v6, v5, v6
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -2104,16 +2104,16 @@ define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX6-NEXT:    s_min_u32 s6, s5, s6
-; GFX6-NEXT:    s_sub_i32 s5, s5, s6
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_sub_i32 s5, s5, s6
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_lshr_b32 s5, s5, 16
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX6-NEXT:    s_or_b32 s1, s2, s1
 ; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
 ; GFX6-NEXT:    s_or_b32 s2, s4, s2
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -2121,28 +2121,28 @@ define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-LABEL: s_usubsat_v6i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
-; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s9
-; GFX8-NEXT:    v_mov_b32_e32 v0, s3
-; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
-; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX8-NEXT:    v_sub_u16_e64 v1, s6, v1 clamp
+; GFX8-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX8-NEXT:    v_mov_b32_e32 v6, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
-; GFX8-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_sub_u16_e64 v3, s7, v3 clamp
+; GFX8-NEXT:    v_mov_b32_e32 v5, s11
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_sub_u16_e64 v2, s1, v2 clamp
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s5
+; GFX8-NEXT:    v_sub_u16_e64 v5, s8, v5 clamp
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_sub_u16_e64 v2, s1, v2 clamp
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_sub_u16_e64 v5, s8, v5 clamp
-; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_sub_u16_e64 v4, s2, v4 clamp
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
@@ -2213,19 +2213,19 @@ define <4 x float> @v_usubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_u32_e32 v8, v7, v8
-; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v8
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v8
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v6, v3
@@ -2312,19 +2312,19 @@ define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX6-NEXT:    s_min_u32 s8, s7, s8
-; GFX6-NEXT:    s_sub_i32 s7, s7, s8
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_sub_i32 s7, s7, s8
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
-; GFX6-NEXT:    s_lshr_b32 s7, s7, 16
 ; GFX6-NEXT:    s_lshr_b32 s5, s5, 16
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_lshr_b32 s7, s7, 16
+; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
 ; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
 ; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
 ; GFX6-NEXT:    s_or_b32 s2, s4, s2
 ; GFX6-NEXT:    s_or_b32 s3, s6, s3
@@ -2333,35 +2333,35 @@ define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-LABEL: s_usubsat_v8i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s12
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
 ; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
-; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX8-NEXT:    v_mov_b32_e32 v3, s13
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_sub_u16_e64 v1, s8, v1 clamp
+; GFX8-NEXT:    v_mov_b32_e32 v3, s13
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX8-NEXT:    v_mov_b32_e32 v5, s14
 ; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
-; GFX8-NEXT:    v_mov_b32_e32 v7, s15
 ; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:    v_sub_u16_e64 v3, s9, v3 clamp
+; GFX8-NEXT:    v_mov_b32_e32 v5, s14
+; GFX8-NEXT:    v_mov_b32_e32 v7, s15
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_sub_u16_e64 v2, s1, v2 clamp
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_sub_u16_e64 v5, s10, v5 clamp
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s7
+; GFX8-NEXT:    v_sub_u16_e64 v7, s11, v7 clamp
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_sub_u16_e64 v2, s1, v2 clamp
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_sub_u16_e64 v7, s11, v7 clamp
-; GFX8-NEXT:    v_sub_u16_e64 v5, s10, v5 clamp
-; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_sub_u16_e64 v4, s2, v4 clamp
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_sub_u16_e64 v6, s3, v6 clamp
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -2713,13 +2713,13 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX6-NEXT:    s_and_b32 s1, s1, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
+; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s7
-; GFX6-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX6-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s1
@@ -2745,13 +2745,13 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX8-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s1
@@ -2777,13 +2777,13 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX9-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s1
@@ -2831,9 +2831,9 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
-; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX6-NEXT:    s_and_b32 s10, s10, 1
@@ -2844,14 +2844,14 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
 ; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT:    s_and_b32 s11, s11, 1
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    s_subb_u32 s11, s3, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s8
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s9
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s10
@@ -2881,8 +2881,8 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    s_subb_u32 s11, s3, s7
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
 ; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
@@ -2894,8 +2894,8 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s8
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s9
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s10
@@ -2925,8 +2925,8 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    s_subb_u32 s11, s3, s7
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
 ; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
@@ -2938,8 +2938,8 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s10
@@ -2979,13 +2979,13 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, s8, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, s9, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, s10, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, s1, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, s9, 0, vcc_lo
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
   ret i128 %result
@@ -3312,16 +3312,16 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v18, v17, s5
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s4
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v4, v12
 ; GFX10-NEXT:    v_and_b32_e32 v8, 1, v8
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s4
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s4
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, v8
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s5
@@ -3344,8 +3344,8 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX6-NEXT:    s_and_b32 s18, s18, 1
-; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s10
+; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX6-NEXT:    s_subb_u32 s18, s2, s10
@@ -3356,27 +3356,27 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
 ; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT:    s_subb_u32 s19, s3, s11
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    s_sub_u32 s0, s4, s12
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s17
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, v2, 0, vcc
+; GFX6-NEXT:    s_and_b32 s1, s1, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s12
-; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v1, 0, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s18
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s19
+; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s13
-; GFX6-NEXT:    s_subb_u32 s1, s5, s13
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v0, 0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, v1, 0, vcc
-; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
+; GFX6-NEXT:    s_subb_u32 s1, s5, s13
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
 ; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX6-NEXT:    s_and_b32 s2, s2, 1
@@ -3387,18 +3387,18 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
 ; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT:    s_and_b32 s3, s3, 1
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    s_subb_u32 s3, s7, s15
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s2
+; GFX6-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
 ; GFX6-NEXT:    v_readfirstlane_b32 s0, v4
@@ -3428,8 +3428,8 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8-NEXT:    s_subb_u32 s19, s3, s11
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], s[10:11]
 ; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
@@ -3449,25 +3449,25 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    s_subb_u32 s2, s6, s14
-; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s17
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v2, 0, vcc
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s12
-; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v1, 0, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s18
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s19
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s13
-; GFX8-NEXT:    s_subb_u32 s3, s7, s15
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v0, 0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v1, 0, vcc
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
+; GFX8-NEXT:    s_subb_u32 s3, s7, s15
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s14
-; GFX8-NEXT:    s_cmp_eq_u64 s[6:7], s[14:15]
+; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s15
+; GFX8-NEXT:    s_cmp_eq_u64 s[6:7], s[14:15]
 ; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
@@ -3477,12 +3477,12 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v4
@@ -3512,8 +3512,8 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-NEXT:    s_subb_u32 s19, s3, s11
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s10
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], s[10:11]
 ; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
@@ -3533,25 +3533,25 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_subb_u32 s2, s6, s14
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s17
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v2, 0, vcc
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s12
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v1, 0, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s18
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s19
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s13
-; GFX9-NEXT:    s_subb_u32 s3, s7, s15
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v0, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v1, 0, vcc
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
+; GFX9-NEXT:    s_subb_u32 s3, s7, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s14
-; GFX9-NEXT:    s_cmp_eq_u64 s[6:7], s[14:15]
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s15
+; GFX9-NEXT:    s_cmp_eq_u64 s[6:7], s[14:15]
 ; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
@@ -3561,12 +3561,12 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v4
@@ -3616,15 +3616,15 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_and_b32 s0, s0, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s1, s[6:7], s[14:15]
 ; GFX10-NEXT:    s_subb_u32 s9, s7, s15
 ; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], s[14:15]
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s1, s[6:7], s[14:15]
 ; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX10-NEXT:    s_and_b32 s0, 1, s0
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v1
@@ -3636,13 +3636,13 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v2
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, s3, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, s8, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, s3, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, s10, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, s9, 0, vcc_lo
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v4
-; GFX10-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX10-NEXT:    v_readfirstlane_b32 s7, v3
 ; GFX10-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
index 42a321345acfc..f65d72a560dd3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -38,10 +38,10 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in
 ;
 ; GFX8-LABEL: scalar_xnor_v2i16_one_use:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_xor_b32 s0, s0, s1
 ; GFX8-NEXT:    s_mov_b32 s2, 0xffff
-; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX8-NEXT:    s_xor_b32 s0, s0, s1
 ; GFX8-NEXT:    s_mov_b32 s3, s2
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s2
 ; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
@@ -124,8 +124,8 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in
 ; GFX7-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX7-NEXT:    s_and_b32 s2, s2, s8
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
-; GFX7-NEXT:    s_and_b32 s3, s4, s8
 ; GFX7-NEXT:    s_lshl_b32 s2, s5, 16
+; GFX7-NEXT:    s_and_b32 s3, s4, s8
 ; GFX7-NEXT:    s_or_b32 s2, s2, s3
 ; GFX7-NEXT:    s_lshl_b32 s3, s7, 16
 ; GFX7-NEXT:    s_and_b32 s4, s6, s8
@@ -138,8 +138,8 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in
 ;
 ; GFX8-LABEL: scalar_xnor_v4i16_one_use:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX8-NEXT:    s_mov_b32 s4, 0xffff
+; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX8-NEXT:    s_and_b32 s2, s0, s4
 ; GFX8-NEXT:    s_mov_b32 s5, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index 72a6522ab14a1..3e96cb20dc393 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -22,13 +22,13 @@ define void @parent_func_missing_inputs() #0 {
 ; VARABI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VARABI-NEXT:    s_mov_b64 exec, s[4:5]
 ; VARABI-NEXT:    v_writelane_b32 v40, s33, 2
-; VARABI-NEXT:    v_writelane_b32 v40, s30, 0
 ; VARABI-NEXT:    s_mov_b32 s33, s32
 ; VARABI-NEXT:    s_addk_i32 s32, 0x400
+; VARABI-NEXT:    v_writelane_b32 v40, s30, 0
+; VARABI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VARABI-NEXT:    s_getpc_b64 s[4:5]
 ; VARABI-NEXT:    s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
 ; VARABI-NEXT:    s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
-; VARABI-NEXT:    v_writelane_b32 v40, s31, 1
 ; VARABI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; VARABI-NEXT:    v_readlane_b32 s4, v40, 0
 ; VARABI-NEXT:    v_readlane_b32 s5, v40, 1
@@ -47,13 +47,13 @@ define void @parent_func_missing_inputs() #0 {
 ; FIXEDABI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; FIXEDABI-NEXT:    s_mov_b64 exec, s[16:17]
 ; FIXEDABI-NEXT:    v_writelane_b32 v40, s33, 2
-; FIXEDABI-NEXT:    v_writelane_b32 v40, s30, 0
 ; FIXEDABI-NEXT:    s_mov_b32 s33, s32
 ; FIXEDABI-NEXT:    s_addk_i32 s32, 0x400
+; FIXEDABI-NEXT:    v_writelane_b32 v40, s30, 0
+; FIXEDABI-NEXT:    v_writelane_b32 v40, s31, 1
 ; FIXEDABI-NEXT:    s_getpc_b64 s[16:17]
 ; FIXEDABI-NEXT:    s_add_u32 s16, s16, requires_all_inputs@rel32@lo+4
 ; FIXEDABI-NEXT:    s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12
-; FIXEDABI-NEXT:    v_writelane_b32 v40, s31, 1
 ; FIXEDABI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; FIXEDABI-NEXT:    v_readlane_b32 s4, v40, 0
 ; FIXEDABI-NEXT:    v_readlane_b32 s5, v40, 1
@@ -75,11 +75,11 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
 ; VARABI-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; VARABI-NEXT:    s_add_u32 s0, s0, s9
 ; VARABI-NEXT:    s_addc_u32 s1, s1, 0
+; VARABI-NEXT:    s_mov_b32 s32, 0
 ; VARABI-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; VARABI-NEXT:    s_getpc_b64 s[4:5]
 ; VARABI-NEXT:    s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
 ; VARABI-NEXT:    s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
-; VARABI-NEXT:    s_mov_b32 s32, 0
 ; VARABI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; VARABI-NEXT:    s_endpgm
 ;
@@ -96,10 +96,10 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
 ; FIXEDABI-SDAG-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; FIXEDABI-SDAG-NEXT:    v_or_b32_e32 v31, v0, v2
 ; FIXEDABI-SDAG-NEXT:    s_mov_b64 s[8:9], 0
+; FIXEDABI-SDAG-NEXT:    s_mov_b32 s32, 0
 ; FIXEDABI-SDAG-NEXT:    s_getpc_b64 s[16:17]
 ; FIXEDABI-SDAG-NEXT:    s_add_u32 s16, s16, requires_all_inputs@rel32@lo+4
 ; FIXEDABI-SDAG-NEXT:    s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12
-; FIXEDABI-SDAG-NEXT:    s_mov_b32 s32, 0
 ; FIXEDABI-SDAG-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; FIXEDABI-SDAG-NEXT:    s_endpgm
 ;
@@ -116,10 +116,10 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
 ; FIXEDABI-GISEL-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; FIXEDABI-GISEL-NEXT:    v_or_b32_e32 v31, v0, v1
 ; FIXEDABI-GISEL-NEXT:    s_mov_b64 s[8:9], 0
+; FIXEDABI-GISEL-NEXT:    s_mov_b32 s32, 0
 ; FIXEDABI-GISEL-NEXT:    s_getpc_b64 s[16:17]
 ; FIXEDABI-GISEL-NEXT:    s_add_u32 s16, s16, requires_all_inputs@rel32@lo+4
 ; FIXEDABI-GISEL-NEXT:    s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12
-; FIXEDABI-GISEL-NEXT:    s_mov_b32 s32, 0
 ; FIXEDABI-GISEL-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; FIXEDABI-GISEL-NEXT:    s_endpgm
   call void @requires_all_inputs()
@@ -401,14 +401,14 @@ define void @addrspacecast_requires_queue_ptr(i32 addrspace(5)* %ptr.private, i3
 ; VARABI:       ; %bb.0:
 ; VARABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VARABI-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v0
+; VARABI-NEXT:    v_mov_b32_e32 v3, 0
 ; VARABI-NEXT:    v_cndmask_b32_e32 v2, 0, v0, vcc
 ; VARABI-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v1
-; VARABI-NEXT:    v_mov_b32_e32 v3, 0
 ; VARABI-NEXT:    v_mov_b32_e32 v4, 1
 ; VARABI-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; VARABI-NEXT:    v_mov_b32_e32 v1, v3
 ; VARABI-NEXT:    flat_store_dword v[2:3], v4
 ; VARABI-NEXT:    s_waitcnt vmcnt(0)
-; VARABI-NEXT:    v_mov_b32_e32 v1, v3
 ; VARABI-NEXT:    v_mov_b32_e32 v2, 2
 ; VARABI-NEXT:    flat_store_dword v[0:1], v2
 ; VARABI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -424,13 +424,13 @@ define void @addrspacecast_requires_queue_ptr(i32 addrspace(5)* %ptr.private, i3
 ; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v2, s5
 ; FIXEDABI-SDAG-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
 ; FIXEDABI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v0, vcc
-; FIXEDABI-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v1
 ; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; FIXEDABI-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v1
 ; FIXEDABI-SDAG-NEXT:    v_cndmask_b32_e32 v5, 0, v0, vcc
 ; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; FIXEDABI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v1, vcc
 ; FIXEDABI-SDAG-NEXT:    flat_store_dword v[2:3], v0
 ; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v1, vcc
 ; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v0, 2
 ; FIXEDABI-SDAG-NEXT:    flat_store_dword v[4:5], v0
 ; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -446,8 +446,8 @@ define void @addrspacecast_requires_queue_ptr(i32 addrspace(5)* %ptr.private, i3
 ; FIXEDABI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v3, s4
 ; FIXEDABI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
-; FIXEDABI-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v1
 ; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v4, s5
+; FIXEDABI-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v1
 ; FIXEDABI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; FIXEDABI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
 ; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v4, 1

diff  --git a/llvm/test/CodeGen/AMDGPU/add3.ll b/llvm/test/CodeGen/AMDGPU/add3.ll
index 7c4ea82432af9..4610abd74fdd4 100644
--- a/llvm/test/CodeGen/AMDGPU/add3.ll
+++ b/llvm/test/CodeGen/AMDGPU/add3.ll
@@ -217,9 +217,9 @@ define amdgpu_ps <2 x float> @add3_multiuse_inner(i32 %a, i32 %b, i32 %c) {
 define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float inreg %c) {
 ; VI-LABEL: add3_uniform_vgpr:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_mov_b32_e32 v2, 0x40400000
 ; VI-NEXT:    v_add_f32_e64 v0, s2, 1.0
 ; VI-NEXT:    v_add_f32_e64 v1, s3, 2.0
+; VI-NEXT:    v_mov_b32_e32 v2, 0x40400000
 ; VI-NEXT:    v_add_f32_e32 v2, s4, v2
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
@@ -227,9 +227,9 @@ define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float
 ;
 ; GFX9-LABEL: add3_uniform_vgpr:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x40400000
 ; GFX9-NEXT:    v_add_f32_e64 v0, s2, 1.0
 ; GFX9-NEXT:    v_add_f32_e64 v1, s3, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x40400000
 ; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
index 2ec1cee766622..9a535e441b79a 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
@@ -33,9 +33,9 @@ define void @remat_regcopy_avoids_spill(i32 %v0, i32 %v1, i32 %v2, i32 %v3, i32
 ; GFX908-NEXT:    v_accvgpr_write_b32 a2, v1
 ; GFX908-NEXT:    v_accvgpr_write_b32 a3, v2
 ; GFX908-NEXT:    v_accvgpr_write_b32 a4, v3
+; GFX908-NEXT:    v_accvgpr_write_b32 a0, v8
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ;;#ASMEND
-; GFX908-NEXT:    v_accvgpr_write_b32 a0, v8
 ; GFX908-NEXT:    v_accvgpr_write_b32 a1, v4
 ; GFX908-NEXT:    v_accvgpr_write_b32 a2, v5
 ; GFX908-NEXT:    v_accvgpr_write_b32 a3, v6

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index d3c9ef10d5446..bfc1717e8275c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
 ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
@@ -191,11 +191,10 @@ define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX9-LABEL: urem_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -212,6 +211,7 @@ define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -483,14 +483,13 @@ define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX9-LABEL: srem_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    s_add_i32 s3, s3, s4
 ; GFX9-NEXT:    s_xor_b32 s3, s3, s4
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -512,6 +511,7 @@ define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -690,13 +690,12 @@ define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX9-LABEL: urem_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s4
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
@@ -707,6 +706,7 @@ define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -901,8 +901,6 @@ define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX9-LABEL: srem_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s5, s4, 16
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s5
@@ -912,6 +910,7 @@ define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
 ; GFX9-NEXT:    s_or_b32 s6, s2, 1
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
@@ -923,6 +922,7 @@ define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1087,13 +1087,12 @@ define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX9-LABEL: urem_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, s2
 ; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v1
@@ -1103,6 +1102,7 @@ define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1297,8 +1297,6 @@ define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX9-LABEL: srem_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bfe_i32 s2, s4, 0x80008
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
@@ -1318,8 +1316,10 @@ define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
 ; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1530,17 +1530,17 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s5, v5
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, s3, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; GFX6-NEXT:    v_mul_hi_u32 v4, v2, v4
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v1
 ; GFX6-NEXT:    s_sub_i32 s0, 0, s11
-; GFX6-NEXT:    v_mul_hi_u32 v4, v2, v4
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v6
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
@@ -1556,13 +1556,13 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s10, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, s7, v4
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
+; GFX6-NEXT:    v_mul_lo_u32 v6, v4, s11
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v6, v4, s11
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v6
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v3
@@ -1612,17 +1612,17 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v7, s8, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v1, s9
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v7, 1, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v7, s2, v3
 ; GFX9-NEXT:    v_sub_u32_e32 v6, s5, v6
-; GFX9-NEXT:    v_mul_f32_e32 v2, s12, v2
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v6
+; GFX9-NEXT:    v_mul_f32_e32 v2, s12, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v7
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
@@ -1646,8 +1646,8 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s10, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v5, s11
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
 ; GFX9-NEXT:    v_add_u32_e32 v7, 1, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v6
 ; GFX9-NEXT:    v_add_u32_e32 v6, 1, v5
@@ -1689,32 +1689,32 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX90A-NEXT:    v_subrev_u32_e32 v3, s8, v2
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX90A-NEXT:    s_sub_i32 s2, 0, s9
+; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
 ; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
 ; GFX90A-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s10
 ; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX90A-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s9
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX90A-NEXT:    v_sub_u32_e32 v2, s5, v2
+; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX90A-NEXT:    v_add_u32_e32 v5, 1, v1
 ; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX90A-NEXT:    v_subrev_u32_e32 v5, s9, v2
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX90A-NEXT:    v_mul_f32_e32 v3, s3, v3
 ; GFX90A-NEXT:    v_add_u32_e32 v5, 1, v1
+; GFX90A-NEXT:    v_mul_f32_e32 v3, s3, v3
 ; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s11
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX90A-NEXT:    s_sub_i32 s2, 0, s10
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GFX90A-NEXT:    v_mul_lo_u32 v2, s2, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v2, v3, v2
+; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX90A-NEXT:    v_mul_hi_u32 v2, s6, v2
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v2, s10
@@ -1905,8 +1905,8 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX6-NEXT:    v_mul_f32_e32 v2, s13, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
+; GFX6-NEXT:    v_mul_f32_e32 v2, s13, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
@@ -1921,8 +1921,8 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
 ; GFX6-NEXT:    s_sub_i32 s4, 0, s11
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
@@ -1930,19 +1930,19 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s10
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s11
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s11
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
@@ -2013,9 +2013,9 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s11
-; GFX9-NEXT:    v_sub_u32_e32 v2, s6, v2
 ; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
+; GFX9-NEXT:    v_sub_u32_e32 v2, s6, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v5, s10, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
@@ -2328,8 +2328,8 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX6-NEXT:    s_ashr_i32 s0, s5, 31
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX6-NEXT:    s_add_i32 s1, s5, s0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    s_ashr_i32 s3, s10, 31
@@ -2363,17 +2363,17 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    s_add_i32 s5, s11, s2
 ; GFX6-NEXT:    s_add_i32 s1, s6, s0
 ; GFX6-NEXT:    s_xor_b32 s5, s5, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s5
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s5
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s1, v2
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GFX6-NEXT:    s_xor_b32 s3, s0, s3
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s4
-; GFX6-NEXT:    v_mul_f32_e32 v4, s16, v4
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
+; GFX6-NEXT:    v_mul_f32_e32 v4, s16, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s4, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v3
@@ -2476,8 +2476,8 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    s_add_i32 s9, s11, s8
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
 ; GFX9-NEXT:    s_xor_b32 s9, s9, s8
-; GFX9-NEXT:    v_mul_hi_u32 v2, v3, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v2, v3, v2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s9
 ; GFX9-NEXT:    s_ashr_i32 s5, s6, 31
 ; GFX9-NEXT:    s_add_i32 s6, s6, s5
@@ -2488,9 +2488,9 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v3, s15, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_mul_lo_u32 v5, v2, s4
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
 ; GFX9-NEXT:    s_xor_b32 s2, s13, s12
+; GFX9-NEXT:    v_mul_lo_u32 v5, v2, s4
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, s2, v1
 ; GFX9-NEXT:    s_xor_b32 s2, s5, s3
@@ -2852,9 +2852,9 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s13
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
 ; GFX6-NEXT:    s_ashr_i32 s5, s10, 31
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
 ; GFX6-NEXT:    s_add_i32 s8, s10, s5
@@ -2866,7 +2866,6 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
-; GFX6-NEXT:    s_ashr_i32 s8, s11, 31
 ; GFX6-NEXT:    v_mul_f32_e32 v2, s14, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
@@ -2879,6 +2878,7 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v4
+; GFX6-NEXT:    s_ashr_i32 s8, s11, 31
 ; GFX6-NEXT:    s_add_i32 s9, s11, s8
 ; GFX6-NEXT:    s_ashr_i32 s4, s6, 31
 ; GFX6-NEXT:    s_xor_b32 s8, s9, s8
@@ -3072,11 +3072,11 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX90A-NEXT:    s_sub_i32 s4, 0, s8
 ; GFX90A-NEXT:    v_xor_b32_e32 v0, s3, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s4, v1
 ; GFX90A-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v2, s4, v1
 ; GFX90A-NEXT:    v_subrev_u32_e32 v0, s3, v0
 ; GFX90A-NEXT:    s_add_i32 s3, s5, s2
+; GFX90A-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX90A-NEXT:    s_xor_b32 s3, s3, s2
 ; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX90A-NEXT:    v_mul_hi_u32 v1, s3, v1
@@ -3084,9 +3084,9 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX90A-NEXT:    v_sub_u32_e32 v1, s3, v1
 ; GFX90A-NEXT:    s_ashr_i32 s3, s10, 31
 ; GFX90A-NEXT:    s_add_i32 s4, s10, s3
-; GFX90A-NEXT:    s_xor_b32 s3, s4, s3
 ; GFX90A-NEXT:    v_subrev_u32_e32 v2, s8, v1
 ; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
+; GFX90A-NEXT:    s_xor_b32 s3, s4, s3
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s3
 ; GFX90A-NEXT:    v_subrev_u32_e32 v3, s8, v1
@@ -3109,9 +3109,9 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX90A-NEXT:    v_sub_u32_e32 v2, s4, v2
 ; GFX90A-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX90A-NEXT:    s_add_i32 s5, s11, s4
-; GFX90A-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX90A-NEXT:    v_subrev_u32_e32 v3, s3, v2
 ; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
+; GFX90A-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s4
 ; GFX90A-NEXT:    v_subrev_u32_e32 v5, s3, v2
@@ -3245,13 +3245,13 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    s_lshr_b32 s9, s0, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, s8
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s2
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s2
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s9
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
 ; GFX6-NEXT:    s_and_b32 s2, s3, s8
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
@@ -3262,22 +3262,21 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_mad_f32 v2, -v1, v3, v4
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s2
 ; GFX6-NEXT:    s_lshr_b32 s0, s1, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, s8
 ; GFX6-NEXT:    s_lshr_b32 s10, s3, 16
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
+; GFX6-NEXT:    s_and_b32 s1, s1, s8
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mad_f32 v5, -v1, v4, v5
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v6, v7
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v4
@@ -3285,9 +3284,10 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_mad_f32 v4, -v4, v3, v6
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_and_b32_e32 v1, s8, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -3321,22 +3321,22 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_mad_f32 v3, -v1, v4, v5
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
-; GFX9-NEXT:    s_and_b32 s0, s5, s8
 ; GFX9-NEXT:    s_lshr_b32 s6, s7, 16
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
+; GFX9-NEXT:    s_and_b32 s0, s5, s8
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s6
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s6
 ; GFX9-NEXT:    s_lshr_b32 s1, s5, 16
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v6, v7
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mad_f32 v6, -v1, v5, v6
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v7, v8
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v5
@@ -3344,9 +3344,9 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    v_mad_f32 v5, -v5, v4, v7
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v0, v5, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v1, v5, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, v5, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
@@ -3380,22 +3380,22 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
 ; GFX90A-NEXT:    v_mad_f32 v3, -v1, v4, v5
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s0
-; GFX90A-NEXT:    s_and_b32 s0, s5, s8
 ; GFX90A-NEXT:    s_lshr_b32 s6, s7, 16
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
+; GFX90A-NEXT:    s_and_b32 s0, s5, s8
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s6
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s0
 ; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v7, v5
+; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s6
 ; GFX90A-NEXT:    s_lshr_b32 s1, s5, 16
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v8, v4
 ; GFX90A-NEXT:    v_mul_f32_e32 v1, v6, v7
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v7, s1
+; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v8, v4
 ; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_mad_f32 v6, -v1, v5, v6
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
 ; GFX90A-NEXT:    v_mul_f32_e32 v5, v7, v8
 ; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v6, v5
@@ -3403,9 +3403,9 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX90A-NEXT:    v_mad_f32 v5, -v5, v4, v7
 ; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, 0xffff
-; GFX90A-NEXT:    v_and_b32_e32 v0, v5, v0
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
 ; GFX90A-NEXT:    v_and_b32_e32 v1, v5, v1
+; GFX90A-NEXT:    v_and_b32_e32 v0, v5, v0
 ; GFX90A-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
 ; GFX90A-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
 ; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
@@ -3536,8 +3536,8 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v1
 ; GFX6-NEXT:    v_mad_f32 v1, -v1, v3, v4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v3
 ; GFX6-NEXT:    s_and_b32 s2, s3, s8
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s2
@@ -3547,16 +3547,16 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX6-NEXT:    s_lshr_b32 s12, s3, 16
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s9, v1
-; GFX6-NEXT:    s_lshr_b32 s10, s1, 16
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s12
+; GFX6-NEXT:    s_lshr_b32 s10, s1, 16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s10
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v3
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v6, v7
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
@@ -3608,20 +3608,20 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    v_mad_f32 v3, -v1, v4, v5
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s6
 ; GFX9-NEXT:    s_and_b32 s6, s5, s8
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s10
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s6
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s10
 ; GFX9-NEXT:    s_lshr_b32 s1, s5, 16
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v4
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v6, v7
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_mad_f32 v6, -v3, v5, v6
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v7, v8
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v5
@@ -3677,21 +3677,21 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s4
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX90A-NEXT:    s_and_b32 s4, s5, s8
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s10
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s4
 ; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v7, v5
+; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s10
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX90A-NEXT:    s_lshr_b32 s1, s5, 16
+; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX90A-NEXT:    v_sub_u32_e32 v3, s0, v1
 ; GFX90A-NEXT:    v_mul_f32_e32 v1, v6, v7
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v7, s1
 ; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v8, v4
 ; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_mad_f32 v6, -v1, v5, v6
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
 ; GFX90A-NEXT:    v_mul_f32_e32 v5, v7, v8
 ; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v6, v5
@@ -3700,12 +3700,12 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s7
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
+; GFX90A-NEXT:    v_sub_u32_e32 v1, s5, v1
 ; GFX90A-NEXT:    v_mul_lo_u32 v4, v4, s10
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, 0xffff
-; GFX90A-NEXT:    v_sub_u32_e32 v1, s5, v1
-; GFX90A-NEXT:    v_and_b32_e32 v0, v5, v0
 ; GFX90A-NEXT:    v_sub_u32_e32 v4, s1, v4
 ; GFX90A-NEXT:    v_and_b32_e32 v1, v5, v1
+; GFX90A-NEXT:    v_and_b32_e32 v0, v5, v0
 ; GFX90A-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
 ; GFX90A-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
 ; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
@@ -3835,8 +3835,8 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
@@ -3852,8 +3852,8 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX6-NEXT:    s_sext_i32_i16 s0, s3
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
 ; GFX6-NEXT:    s_sext_i32_i16 s2, s1
@@ -3868,8 +3868,8 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_mad_f32 v1, -v4, v2, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX6-NEXT:    s_ashr_i32 s0, s3, 16
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
 ; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
@@ -3931,9 +3931,9 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    v_mad_f32 v1, -v4, v0, v1
 ; GFX9-NEXT:    s_or_b32 s4, s0, 1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_sext_i32_i16 s1, s7
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
 ; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
 ; GFX9-NEXT:    v_add_u32_e32 v4, s0, v4
@@ -3948,8 +3948,8 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    v_mad_f32 v1, -v5, v0, v1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
 ; GFX9-NEXT:    s_ashr_i32 s1, s7, 16
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
 ; GFX9-NEXT:    v_add_u32_e32 v1, s0, v5
@@ -4009,9 +4009,9 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX90A-NEXT:    v_mad_f32 v1, -v4, v0, v1
 ; GFX90A-NEXT:    s_or_b32 s4, s0, 1
 ; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
+; GFX90A-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX90A-NEXT:    s_sext_i32_i16 s1, s7
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s1
 ; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
 ; GFX90A-NEXT:    v_add_u32_e32 v4, s0, v4
@@ -4026,8 +4026,8 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX90A-NEXT:    v_mad_f32 v1, -v5, v0, v1
 ; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
 ; GFX90A-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
 ; GFX90A-NEXT:    s_ashr_i32 s1, s7, 16
 ; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s1
 ; GFX90A-NEXT:    v_add_u32_e32 v1, s0, v5
@@ -4222,8 +4222,8 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_mad_f32 v1, -v4, v2, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX6-NEXT:    s_ashr_i32 s0, s3, 16
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
 ; GFX6-NEXT:    s_ashr_i32 s2, s1, 16
@@ -4243,11 +4243,11 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s0
-; GFX6-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX6-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
@@ -4307,8 +4307,8 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cselect_b32 s0, s6, 0
 ; GFX9-NEXT:    s_ashr_i32 s6, s7, 16
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s6
@@ -4358,8 +4358,8 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX90A-NEXT:    v_mul_f32_e32 v3, v1, v3
 ; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX90A-NEXT:    v_mad_f32 v1, -v3, v0, v1
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
 ; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX90A-NEXT:    s_cselect_b32 s0, s8, 0
 ; GFX90A-NEXT:    s_ashr_i32 s8, s6, 16
@@ -4394,8 +4394,8 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX90A-NEXT:    v_mul_f32_e32 v5, v1, v5
 ; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX90A-NEXT:    v_mad_f32 v1, -v5, v3, v1
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v3|
 ; GFX90A-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v3|
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
 ; GFX90A-NEXT:    s_ashr_i32 s4, s7, 16
@@ -4419,9 +4419,9 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX90A-NEXT:    v_add_u32_e32 v3, s0, v6
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, s4
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, 0xffff
-; GFX90A-NEXT:    v_and_b32_e32 v0, v5, v0
 ; GFX90A-NEXT:    v_sub_u32_e32 v3, s5, v3
 ; GFX90A-NEXT:    v_and_b32_e32 v1, v5, v1
+; GFX90A-NEXT:    v_and_b32_e32 v0, v5, v0
 ; GFX90A-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
 ; GFX90A-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
 ; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
@@ -4573,8 +4573,6 @@ define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX9-LABEL: urem_i3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bfe_u32 s3, s2, 0x30008
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
@@ -4587,11 +4585,13 @@ define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v1
 ; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v2
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -4793,8 +4793,6 @@ define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX9-LABEL: srem_i3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bfe_i32 s2, s4, 0x30008
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
@@ -4814,9 +4812,11 @@ define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
 ; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -4930,13 +4930,13 @@ define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX6-NEXT:    s_and_b32 s6, s2, s8
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s0
 ; GFX6-NEXT:    s_lshr_b32 s0, s2, 16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
@@ -4960,8 +4960,8 @@ define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v5
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -5156,21 +5156,21 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_alignbit_b32 v1, s3, v1, 16
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_mad_f32 v2, -v3, v0, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
+; GFX6-NEXT:    v_mad_f32 v2, -v3, v0, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, v5
-; GFX6-NEXT:    v_and_b32_e32 v3, s8, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_and_b32_e32 v3, s8, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GFX6-NEXT:    s_and_b32 s0, s1, s8
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
+; GFX6-NEXT:    s_and_b32 s0, s1, s8
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s0
 ; GFX6-NEXT:    s_and_b32 s0, s3, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s0
 ; GFX6-NEXT:    v_mul_f32_e32 v5, v3, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
 ; GFX6-NEXT:    v_mad_f32 v3, -v5, v2, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
@@ -5188,8 +5188,8 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s1
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
+; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -5282,8 +5282,8 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX90A-NEXT:    s_and_b32 s1, s5, s8
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s1
 ; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
 ; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX90A-NEXT:    v_mul_f32_e32 v3, v6, v7
 ; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
@@ -5403,8 +5403,8 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
@@ -5419,8 +5419,8 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX6-NEXT:    s_sext_i32_i16 s0, s1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
 ; GFX6-NEXT:    s_sext_i32_i16 s1, s3
@@ -5480,12 +5480,11 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_sext_i32_i16 s1, s7
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
 ; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    v_add_u32_e32 v3, s0, v4
 ; GFX9-NEXT:    s_sext_i32_i16 s0, s5
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s0
@@ -5501,6 +5500,7 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
 ; GFX9-NEXT:    v_add_u32_e32 v0, s0, v5
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX9-NEXT:    global_store_short v1, v0, s[2:3] offset:4
 ; GFX9-NEXT:    global_store_dword v1, v2, s[2:3]
@@ -5541,12 +5541,11 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX90A-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX90A-NEXT:    v_mad_f32 v3, -v4, v0, v3
 ; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
+; GFX90A-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX90A-NEXT:    s_sext_i32_i16 s1, s7
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s1
 ; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
-; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX90A-NEXT:    v_add_u32_e32 v3, s0, v4
 ; GFX90A-NEXT:    s_sext_i32_i16 s0, s5
 ; GFX90A-NEXT:    v_cvt_f32_i32_e32 v4, s0
@@ -5562,6 +5561,7 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
 ; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v5
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX90A-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX90A-NEXT:    global_store_short v1, v0, s[2:3] offset:4
 ; GFX90A-NEXT:    global_store_dword v1, v2, s[2:3]
@@ -5719,8 +5719,8 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s1
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -5786,8 +5786,8 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s6, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v2, s2, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX9-NEXT:    global_store_short v3, v2, s[4:5] offset:4
 ; GFX9-NEXT:    global_store_dword v3, v0, s[4:5]
@@ -5853,8 +5853,8 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX90A-NEXT:    v_add_u32_e32 v3, s0, v5
 ; GFX90A-NEXT:    v_sub_u32_e32 v0, s9, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, s6
-; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX90A-NEXT:    v_sub_u32_e32 v3, s4, v3
+; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX90A-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
 ; GFX90A-NEXT:    global_store_short v1, v3, s[2:3] offset:4
 ; GFX90A-NEXT:    global_store_dword v1, v0, s[2:3]
@@ -5942,16 +5942,16 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
 ; GFX6-NEXT:    s_and_b32 s9, s0, s3
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    s_and_b32 s8, s2, s3
+; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0xf000f
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s0
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0xf000f
 ; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s2
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s2
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
 ; GFX6-NEXT:    v_and_b32_e32 v2, s3, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
@@ -5977,8 +5977,8 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_and_b32_e32 v2, s3, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v3, s3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -6002,8 +6002,8 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    s_bfe_u32 s0, s6, 0xf000f
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX9-NEXT:    s_bfe_u32 s1, s4, 0xf000f
+; GFX9-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX9-NEXT:    v_alignbit_b32 v3, s7, v3, 30
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
@@ -6014,8 +6014,8 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
 ; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 30
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v7, v8
 ; GFX9-NEXT:    v_and_b32_e32 v0, s8, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
@@ -6034,8 +6034,8 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    v_and_b32_e32 v3, s8, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v4, s8, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
@@ -6058,8 +6058,8 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX90A-NEXT:    s_bfe_u32 s0, s6, 0xf000f
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s0
 ; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX90A-NEXT:    s_bfe_u32 s1, s4, 0xf000f
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX90A-NEXT:    v_alignbit_b32 v3, s7, v3, 30
 ; GFX90A-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v7, s1
@@ -6070,8 +6070,8 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
 ; GFX90A-NEXT:    v_alignbit_b32 v0, s5, v0, 30
+; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
 ; GFX90A-NEXT:    v_mul_f32_e32 v1, v7, v8
 ; GFX90A-NEXT:    v_and_b32_e32 v0, s8, v0
 ; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
@@ -6213,8 +6213,8 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mad_f32 v3, -v1, v5, v3
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 15
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
@@ -6222,8 +6222,8 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_mad_f32 v3, -v3, v4, v7
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
 ; GFX6-NEXT:    s_lshr_b32 s8, s2, 15
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v1
@@ -6263,37 +6263,37 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
 ; GFX9-NEXT:    s_bfe_u32 s1, s4, 0xf000f
 ; GFX9-NEXT:    v_and_b32_e32 v3, s8, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GFX9-NEXT:    v_and_b32_e32 v0, s8, v0
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v5
-; GFX9-NEXT:    s_lshr_b32 s0, s6, 15
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v7, v8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v5
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mad_f32 v7, -v4, v6, v7
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, v6
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, v6
 ; GFX9-NEXT:    v_mul_f32_e32 v6, v8, v9
 ; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_mad_f32 v6, -v6, v5, v8
+; GFX9-NEXT:    s_lshr_b32 s0, s6, 15
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v5, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
+; GFX9-NEXT:    v_mul_lo_u32 v3, v5, v3
 ; GFX9-NEXT:    s_lshr_b32 s0, s4, 15
 ; GFX9-NEXT:    v_sub_u32_e32 v4, s0, v4
-; GFX9-NEXT:    v_and_b32_e32 v4, s8, v4
 ; GFX9-NEXT:    v_sub_u32_e32 v5, s4, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
+; GFX9-NEXT:    v_and_b32_e32 v4, s8, v4
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
 ; GFX9-NEXT:    v_and_b32_e32 v3, s8, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
@@ -6472,8 +6472,8 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s0
 ; GFX6-NEXT:    s_or_b32 s1, s1, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s1
@@ -6488,8 +6488,8 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX6-NEXT:    v_mad_f32 v4, -v5, v3, v4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v1
 ; GFX6-NEXT:    s_or_b32 s0, s0, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v6, s0
@@ -6543,26 +6543,26 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    s_cselect_b32 s0, s5, 0
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX9-NEXT:    s_cselect_b32 s0, s5, 0
 ; GFX9-NEXT:    s_bfe_i32 s1, s6, 0xf000f
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    v_add_u32_e32 v4, s0, v5
 ; GFX9-NEXT:    s_bfe_i32 s0, s4, 0xf000f
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    v_alignbit_b32 v1, s7, v1, 30
 ; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
 ; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
 ; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
 ; GFX9-NEXT:    v_mad_f32 v5, -v6, v3, v5
 ; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 15
 ; GFX9-NEXT:    s_or_b32 s4, s0, 1
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v3|
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v1
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 15
@@ -6582,8 +6582,8 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    v_add_u32_e32 v0, v7, v0
 ; GFX9-NEXT:    v_and_b32_e32 v3, s0, v4
 ; GFX9-NEXT:    v_and_b32_e32 v4, s0, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
@@ -6613,26 +6613,26 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX90A-NEXT:    v_mad_f32 v4, -v5, v3, v4
 ; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX90A-NEXT:    s_cselect_b32 s0, s5, 0
 ; GFX90A-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX90A-NEXT:    s_cselect_b32 s0, s5, 0
 ; GFX90A-NEXT:    s_bfe_i32 s1, s6, 0xf000f
 ; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s1
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NEXT:    v_add_u32_e32 v4, s0, v5
 ; GFX90A-NEXT:    s_bfe_i32 s0, s4, 0xf000f
 ; GFX90A-NEXT:    v_cvt_f32_i32_e32 v5, s0
 ; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v3
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NEXT:    v_alignbit_b32 v1, s7, v1, 30
 ; GFX90A-NEXT:    s_xor_b32 s0, s0, s1
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
 ; GFX90A-NEXT:    v_mul_f32_e32 v6, v5, v6
 ; GFX90A-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
 ; GFX90A-NEXT:    v_mad_f32 v5, -v6, v3, v5
 ; GFX90A-NEXT:    v_bfe_i32 v1, v1, 0, 15
 ; GFX90A-NEXT:    s_or_b32 s4, s0, 1
+; GFX90A-NEXT:    v_cvt_i32_f32_e32 v6, v6
 ; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v3|
 ; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, v1
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v6, v6
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
 ; GFX90A-NEXT:    v_bfe_i32 v0, v0, 0, 15
@@ -6777,16 +6777,16 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    s_bfe_u32 s12, s0, 0xf000f
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_alignbit_b32 v1, s1, v1, 30
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s0
 ; GFX6-NEXT:    s_lshr_b32 s1, s0, 15
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s0
 ; GFX6-NEXT:    s_bfe_i32 s0, s12, 0xf0000
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s0
 ; GFX6-NEXT:    s_bfe_u32 s10, s2, 0xf000f
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
 ; GFX6-NEXT:    s_lshr_b32 s8, s2, 15
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
 ; GFX6-NEXT:    s_bfe_i32 s2, s10, 0xf0000
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s2
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
@@ -6798,8 +6798,8 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_mad_f32 v4, -v5, v3, v4
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
 ; GFX6-NEXT:    v_and_b32_e32 v1, s3, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
 ; GFX6-NEXT:    v_mov_b32_e32 v6, s0
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
 ; GFX6-NEXT:    v_bfe_i32 v4, v1, 0, 15
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
@@ -6820,12 +6820,12 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s1
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v4, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
 ; GFX6-NEXT:    v_and_b32_e32 v3, s3, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
+; GFX6-NEXT:    v_and_b32_e32 v2, s3, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -6856,13 +6856,13 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 30
-; GFX9-NEXT:    v_alignbit_b32 v1, s7, v1, 30
-; GFX9-NEXT:    s_or_b32 s11, s0, 1
 ; GFX9-NEXT:    s_lshr_b32 s9, s4, 15
+; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 30
 ; GFX9-NEXT:    s_bfe_u32 s5, s4, 0xf000f
+; GFX9-NEXT:    v_alignbit_b32 v1, s7, v1, 30
 ; GFX9-NEXT:    s_lshr_b32 s7, s6, 15
 ; GFX9-NEXT:    s_bfe_u32 s10, s6, 0xf000f
+; GFX9-NEXT:    s_or_b32 s11, s0, 1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cselect_b32 s0, s11, 0
@@ -6942,12 +6942,12 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX90A-NEXT:    v_mad_f32 v4, -v5, v3, v4
 ; GFX90A-NEXT:    v_cvt_i32_f32_e32 v5, v5
 ; GFX90A-NEXT:    v_alignbit_b32 v0, s5, v0, 30
-; GFX90A-NEXT:    v_alignbit_b32 v1, s7, v1, 30
-; GFX90A-NEXT:    s_or_b32 s11, s0, 1
 ; GFX90A-NEXT:    s_lshr_b32 s5, s4, 15
 ; GFX90A-NEXT:    s_bfe_u32 s9, s4, 0xf000f
+; GFX90A-NEXT:    v_alignbit_b32 v1, s7, v1, 30
 ; GFX90A-NEXT:    s_lshr_b32 s7, s6, 15
 ; GFX90A-NEXT:    s_bfe_u32 s10, s6, 0xf000f
+; GFX90A-NEXT:    s_or_b32 s11, s0, 1
 ; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX90A-NEXT:    s_cselect_b32 s0, s11, 0
@@ -6967,8 +6967,8 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX90A-NEXT:    v_cvt_i32_f32_e32 v6, v6
 ; GFX90A-NEXT:    s_or_b32 s4, s0, 1
 ; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
-; GFX90A-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX90A-NEXT:    v_and_b32_e32 v1, s8, v1
+; GFX90A-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
 ; GFX90A-NEXT:    v_bfe_i32 v5, v1, 0, 15
 ; GFX90A-NEXT:    v_add_u32_e32 v4, s0, v6
@@ -6989,8 +6989,8 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
 ; GFX90A-NEXT:    v_sub_u32_e32 v4, s5, v4
 ; GFX90A-NEXT:    v_add_u32_e32 v5, v9, v5
-; GFX90A-NEXT:    v_and_b32_e32 v4, s8, v4
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, v5, v1
+; GFX90A-NEXT:    v_and_b32_e32 v4, s8, v4
 ; GFX90A-NEXT:    v_sub_u32_e32 v0, v0, v1
 ; GFX90A-NEXT:    v_and_b32_e32 v3, s8, v3
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
@@ -7441,8 +7441,8 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    v_subrev_u32_e32 v5, s4, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v4, s3, v4
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v4
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v4
@@ -7880,14 +7880,14 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s3, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -8202,8 +8202,8 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_ashr_i32 s2, s0, 31
-; GFX6-NEXT:    s_lshr_b32 s2, s2, 20
 ; GFX6-NEXT:    s_ashr_i32 s3, s1, 31
+; GFX6-NEXT:    s_lshr_b32 s2, s2, 20
 ; GFX6-NEXT:    s_add_i32 s0, s0, s2
 ; GFX6-NEXT:    s_lshr_b32 s2, s3, 20
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
@@ -8280,9 +8280,9 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)*
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 20
 ; GFX6-NEXT:    s_add_i32 s0, s0, s2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 12
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 12
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -8477,11 +8477,11 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s10
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s10, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
@@ -8541,8 +8541,8 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v5, s2, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s0
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
@@ -8801,8 +8801,6 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX9-LABEL: srem_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
@@ -8810,6 +8808,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX9-NEXT:    s_xor_b32 s3, s3, s4
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -8831,6 +8830,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -9054,8 +9054,8 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    s_ashr_i32 s6, s3, 31
 ; GFX6-NEXT:    s_add_i32 s3, s3, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_xor_b32 s3, s3, s6
 ; GFX6-NEXT:    s_sub_i32 s9, 0, s2
+; GFX6-NEXT:    s_xor_b32 s3, s3, s6
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s3
 ; GFX6-NEXT:    v_mul_f32_e32 v0, s10, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -9078,8 +9078,8 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT:    s_ashr_i32 s0, s1, 31
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
+; GFX6-NEXT:    s_ashr_i32 s0, s1, 31
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
 ; GFX6-NEXT:    s_add_i32 s1, s1, s0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
@@ -9088,8 +9088,8 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
@@ -9270,16 +9270,14 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    s_mov_b32 s4, 0x976a7376
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
@@ -9290,21 +9288,20 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s3
 ; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v2, s3
-; GFX6-NEXT:    s_movk_i32 s2, 0x11f
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v4
-; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v5
+; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v11, v2, v4
-; GFX6-NEXT:    s_mov_b32 s3, 0x976a7377
-; GFX6-NEXT:    s_mov_b32 s9, s5
+; GFX6-NEXT:    s_movk_i32 s2, 0x11f
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v10, v2, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v2, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
+; GFX6-NEXT:    s_mov_b32 s3, 0x976a7377
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
@@ -9323,6 +9320,8 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
+; GFX6-NEXT:    s_mov_b32 s4, 0x976a7376
+; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
@@ -9332,6 +9331,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s2
+; GFX6-NEXT:    s_mov_b32 s9, s5
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -9403,8 +9403,8 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v6, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s5
@@ -9479,8 +9479,8 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
 ; GFX9-NEXT:    v_add_co_u32_e64 v4, s[0:1], v0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
@@ -9511,10 +9511,10 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s3
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
@@ -9522,8 +9522,8 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s2
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, s3
@@ -9533,10 +9533,10 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mul_lo_u32 v9, v0, s3
 ; GFX90A-NEXT:    v_mul_lo_u32 v7, v0, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v10, v0, v9
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v10, v7
 ; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v3, v9
+; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v10, v7
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v6, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v3, v9
 ; GFX90A-NEXT:    v_mul_lo_u32 v9, v3, v9
 ; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
 ; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v5
@@ -9552,10 +9552,10 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mul_lo_u32 v4, s6, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX90A-NEXT:    v_mul_hi_u32 v3, s6, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s7, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v6, s7, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
@@ -9563,8 +9563,8 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v2, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    s_mov_b32 s3, 0x976a7377
 ; GFX90A-NEXT:    s_movk_i32 s2, 0x11f
+; GFX90A-NEXT:    s_mov_b32 s3, 0x976a7377
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v3, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s2
 ; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s3
@@ -9572,9 +9572,9 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mul_lo_u32 v4, v1, s3
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, s3
-; GFX90A-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
 ; GFX90A-NEXT:    v_sub_u32_e32 v4, s7, v3
 ; GFX90A-NEXT:    v_mov_b32_e32 v6, s2
+; GFX90A-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
 ; GFX90A-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc
 ; GFX90A-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s3, v5
 ; GFX90A-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
@@ -9597,8 +9597,8 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v3
 ; GFX90A-NEXT:    v_add_co_u32_e64 v4, s[0:1], v0, v4
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1]
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -9814,15 +9814,15 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_mul_hi_u32 v4, v0, s2
+; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
 ; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
+; GFX6-NEXT:    v_mul_hi_u32 v4, v0, s2
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v2, s2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v0, s2
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_lshr_b64 s[2:3], s[8:9], 12
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v6
@@ -9923,8 +9923,8 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v0, s4
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v2, s4
@@ -9968,12 +9968,12 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
-; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s6, v4
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s6, v4
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s8, v4
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
@@ -10020,10 +10020,10 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s8
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
@@ -10031,8 +10031,8 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v2
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v2
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1]
 ; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, s8
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v2, s8
@@ -10041,10 +10041,10 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_mul_lo_u32 v9, v0, s8
 ; GFX90A-NEXT:    v_mul_lo_u32 v7, v0, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v10, v0, v9
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v10, v7
 ; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v2, v9
+; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v10, v7
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v6, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v2, v9
 ; GFX90A-NEXT:    v_mul_lo_u32 v9, v2, v9
 ; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
 ; GFX90A-NEXT:    v_mul_hi_u32 v10, v2, v5
@@ -10060,10 +10060,10 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, s6, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v2, s6, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s7, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v8, v2, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v6, s7, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
@@ -10071,8 +10071,8 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v4, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v2, vcc
+; GFX90A-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s0
 ; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s0
 ; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
@@ -10095,8 +10095,8 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v3, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX90A-NEXT:    s_lshr_b64 s[4:5], s[4:5], 12
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v0, v5, vcc
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v1, v6, vcc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
@@ -10215,16 +10215,14 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    s_movk_i32 s4, 0x11f
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GFX6-NEXT:    s_mov_b32 s9, s5
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
@@ -10235,21 +10233,20 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s3
 ; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v2, s3
-; GFX6-NEXT:    s_movk_i32 s5, 0x11e
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v4
-; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v5
+; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v11, v2, v4
-; GFX6-NEXT:    s_mov_b32 s11, 0xf000
-; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    s_movk_i32 s4, 0x11f
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v10, v2, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v2, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
+; GFX6-NEXT:    s_mov_b32 s9, s5
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
@@ -10268,6 +10265,8 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
+; GFX6-NEXT:    s_movk_i32 s5, 0x11e
+; GFX6-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
@@ -10277,22 +10276,23 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s12
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s12
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s12
+; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s4
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
 ; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
 ; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
 ; GFX6-NEXT:    v_cmp_lt_u32_e64 s[2:3], s5, v5
 ; GFX6-NEXT:    s_mov_b32 s6, 0x9761f7c8
-; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
 ; GFX6-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v4
-; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
+; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, v5
+; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
@@ -10350,8 +10350,8 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v6, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s5
@@ -10399,20 +10399,20 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v2, s7, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
 ; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc
 ; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s9, v0
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1]
 ; GFX9-NEXT:    s_movk_i32 s6, 0x11e
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v6
-; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s10, v4
-; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s9, v4
+; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s8, v6
+; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s9, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
@@ -10457,10 +10457,10 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s3
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
@@ -10468,8 +10468,8 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s2
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, s3
@@ -10479,10 +10479,10 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mul_lo_u32 v9, v0, s3
 ; GFX90A-NEXT:    v_mul_lo_u32 v7, v0, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v10, v0, v9
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v10, v7
 ; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v3, v9
+; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v10, v7
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v6, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v3, v9
 ; GFX90A-NEXT:    v_mul_lo_u32 v9, v3, v9
 ; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
 ; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v5
@@ -10498,10 +10498,10 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mul_lo_u32 v4, s6, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX90A-NEXT:    v_mul_hi_u32 v3, s6, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s7, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v6, s7, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
@@ -10509,8 +10509,8 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v2, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    s_mov_b32 s9, 0x9761f7c9
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x11f
+; GFX90A-NEXT:    s_mov_b32 s9, 0x9761f7c9
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v3, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s8
 ; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s9
@@ -10518,21 +10518,21 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX90A-NEXT:    v_add_u32_e32 v1, v3, v1
 ; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s9
-; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
 ; GFX90A-NEXT:    v_sub_u32_e32 v3, s7, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s8
+; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
 ; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc
 ; GFX90A-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s9, v0
 ; GFX90A-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1]
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x11e
 ; GFX90A-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v6
 ; GFX90A-NEXT:    s_mov_b32 s10, 0x9761f7c8
-; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
 ; GFX90A-NEXT:    v_cmp_lt_u32_e64 s[2:3], s10, v5
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s9, v5
+; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
 ; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[2:3], s8, v6
+; GFX90A-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s9, v5
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
 ; GFX90A-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
@@ -10831,12 +10831,12 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s2
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s2
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10852,22 +10852,22 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
-; GFX6-NEXT:    s_mov_b32 s5, s9
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
 ; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v2, s2
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s2
+; GFX6-NEXT:    s_mov_b32 s5, s9
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s2
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v4
-; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v5
+; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v4
@@ -10886,8 +10886,8 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    s_mov_b32 s3, s2
 ; GFX6-NEXT:    s_addc_u32 s1, s11, s2
-; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v4, s0, v1
@@ -10911,8 +10911,8 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s0, v8
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s1
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s0, v8
 ; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
 ; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s3, v8
 ; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
@@ -10928,8 +10928,8 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
@@ -10951,11 +10951,11 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s0, s7, 31
@@ -10975,8 +10975,8 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v2, s8
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s8
@@ -10989,8 +10989,8 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v8
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v2, v8
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v2, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
 ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v7, v12, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
@@ -11000,10 +11000,10 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
 ; GFX9-NEXT:    s_add_u32 s2, s6, s0
-; GFX9-NEXT:    s_addc_u32 s3, s7, s0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    s_addc_u32 s3, s7, s0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v1
@@ -11019,12 +11019,12 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s1
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s1
-; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s1
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s2, v4
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s1, v4
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
@@ -11064,20 +11064,20 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s2
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s2
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s2
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX90A-NEXT:    v_sub_u32_e32 v3, v3, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s2
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
@@ -11085,8 +11085,8 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, s2
 ; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, s2
@@ -11115,14 +11115,14 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX90A-NEXT:    s_mov_b32 s1, s0
 ; GFX90A-NEXT:    s_addc_u32 s3, s7, s0
-; GFX90A-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
 ; GFX90A-NEXT:    v_mul_lo_u32 v4, s2, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s2, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX90A-NEXT:    v_mul_hi_u32 v3, s2, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s3, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v6, s3, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v0, s3, v0
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s3, v1
@@ -11130,8 +11130,8 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v2, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s3, v1
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    s_mov_b32 s1, 0x12d8fb
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v3, vcc
+; GFX90A-NEXT:    s_mov_b32 s1, 0x12d8fb
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s1
 ; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s1
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
@@ -11259,10 +11259,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
@@ -11280,10 +11280,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v6, 0
-; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
+; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
 ; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, s4, v0
@@ -11293,8 +11293,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_mul_lo_u32 v7, s4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
+; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
 ; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
 ; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
@@ -11309,10 +11309,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
 ; GFX6-NEXT:    s_add_u32 s0, s10, s14
-; GFX6-NEXT:    s_addc_u32 s1, s11, s14
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
+; GFX6-NEXT:    s_addc_u32 s1, s11, s14
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v5, s10, v1
@@ -11395,10 +11395,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s4, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s12, v0
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
@@ -11408,17 +11408,17 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v7, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v2, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s12, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v7, s12, v0
@@ -11432,8 +11432,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v3, v9
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v3, v9
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
 ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v5
 ; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
@@ -11449,8 +11449,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    s_mov_b32 s3, s2
 ; GFX9-NEXT:    s_addc_u32 s1, s7, s2
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[2:3]
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s6, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
@@ -11496,8 +11496,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[8:9]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[8:9]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
@@ -11530,30 +11530,30 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s10, v0
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, s10, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s10, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v4, s11, v0
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v5, v3
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, s10, v0
+; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v8, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, s10, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, s10, v0
@@ -11574,23 +11574,23 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v2, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
 ; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v4
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
 ; GFX90A-NEXT:    s_add_u32 s0, s6, s10
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX90A-NEXT:    s_mov_b32 s11, s10
 ; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
-; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
 ; GFX90A-NEXT:    v_mul_lo_u32 v4, s6, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX90A-NEXT:    v_mul_hi_u32 v3, s6, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s7, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s7, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
@@ -11630,8 +11630,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX90A-NEXT:    s_xor_b64 s[0:1], s[10:11], s[8:9]
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX90A-NEXT:    s_xor_b64 s[0:1], s[10:11], s[8:9]
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX90A-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX90A-NEXT:    v_xor_b32_e32 v1, s1, v1
@@ -11765,29 +11765,29 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s6
 ; GFX6-NEXT:    s_add_u32 s2, s8, s0
 ; GFX6-NEXT:    s_addc_u32 s3, s9, 0
-; GFX6-NEXT:    s_ashr_i32 s8, s11, 31
+; GFX6-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX6-NEXT:    s_ashr_i32 s8, s11, 31
 ; GFX6-NEXT:    s_mov_b32 s9, s8
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v6, 0
-; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
+; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
 ; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v2, s6
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v0, s6
@@ -11795,8 +11795,8 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_mul_lo_u32 v7, v0, s6
 ; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, v0, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
+; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
 ; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
 ; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
@@ -11811,10 +11811,10 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
 ; GFX6-NEXT:    s_add_u32 s0, s10, s8
-; GFX6-NEXT:    s_addc_u32 s1, s11, s8
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[8:9]
+; GFX6-NEXT:    s_addc_u32 s1, s11, s8
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[8:9]
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v5, s0, v1
@@ -11839,8 +11839,8 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s0, v8
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s1
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s0, v8
 ; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
 ; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v8
 ; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
@@ -11856,13 +11856,13 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s8, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s8
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
@@ -11908,10 +11908,10 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v2, s8
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v0, s8
@@ -11924,8 +11924,8 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v8
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v2, v8
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
 ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v5
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
@@ -11939,8 +11939,8 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s3, s2
 ; GFX9-NEXT:    s_addc_u32 s7, s7, s2
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[2:3]
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
@@ -11956,12 +11956,12 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s3
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s3
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
-; GFX9-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
+; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s3
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s3, v5
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
@@ -11982,9 +11982,9 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s2, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s2, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
@@ -12018,10 +12018,10 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s8
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
@@ -12033,8 +12033,8 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX90A-NEXT:    s_ashr_i64 s[4:5], s[0:1], 12
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v2
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v2
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1]
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v2, s8
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, s8
@@ -12062,14 +12062,14 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX90A-NEXT:    s_mov_b32 s1, s0
 ; GFX90A-NEXT:    s_addc_u32 s7, s7, s0
-; GFX90A-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, s6, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v2, s6, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s7, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s7, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
@@ -12077,8 +12077,8 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v4, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    s_movk_i32 s1, 0xfff
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v2, vcc
+; GFX90A-NEXT:    s_movk_i32 s1, 0xfff
 ; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s1
 ; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s1
 ; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
@@ -12105,9 +12105,9 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX90A-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v0
 ; GFX90A-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s0
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v0
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s5
@@ -12160,10 +12160,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_f32_e32 v1, s20, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
@@ -12181,10 +12181,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v6, 0
-; GFX6-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
+; GFX6-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
 ; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v0
@@ -12194,8 +12194,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_lo_u32 v7, s6, v0
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
+; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
 ; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
 ; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
@@ -12215,8 +12215,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    s_mov_b32 s3, s2
 ; GFX6-NEXT:    s_addc_u32 s1, s9, s2
-; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s8, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s8, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v5, s8, v1
@@ -12276,14 +12276,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_rcp_f32_e32 v3, v10
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX6-NEXT:    s_sub_u32 s14, 0, s12
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v9, v7, s[0:1]
 ; GFX6-NEXT:    v_mul_f32_e32 v3, s19, v3
 ; GFX6-NEXT:    v_mul_f32_e32 v5, s20, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX6-NEXT:    v_mac_f32_e32 v3, s21, v5
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v9, v7, s[0:1]
+; GFX6-NEXT:    s_sub_u32 s14, 0, s12
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s14, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v7, s14, v5
@@ -12294,21 +12294,21 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_lo_u32 v7, s14, v3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; GFX6-NEXT:    v_mul_lo_u32 v8, v3, v2
-; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v3, v7
+; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v11, v5, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v5, v2
-; GFX6-NEXT:    v_xor_b32_e32 v1, s3, v1
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v10, v5, v7
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GFX6-NEXT:    v_xor_b32_e32 v1, s3, v1
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GFX6-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
+; GFX6-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
 ; GFX6-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
 ; GFX6-NEXT:    v_mul_lo_u32 v8, s14, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v9, s14, v2
@@ -12317,8 +12317,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_lo_u32 v9, s14, v2
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; GFX6-NEXT:    v_mul_lo_u32 v12, v2, v8
-; GFX6-NEXT:    v_mul_hi_u32 v14, v2, v8
 ; GFX6-NEXT:    v_mul_hi_u32 v13, v2, v9
+; GFX6-NEXT:    v_mul_hi_u32 v14, v2, v8
 ; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v9
 ; GFX6-NEXT:    v_mul_lo_u32 v9, v3, v9
 ; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v8
@@ -12337,8 +12337,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    s_mov_b32 s15, s14
 ; GFX6-NEXT:    s_addc_u32 s1, s11, s14
-; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s10, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v7, s10, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v9, s10, v3
@@ -12428,10 +12428,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s14, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s14, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s14, v0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
@@ -12448,10 +12448,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s14, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v7, s14, v0
@@ -12465,25 +12465,25 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v9
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v9
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
 ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
 ; GFX9-NEXT:    s_add_u32 s2, s4, s14
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s15, s14
 ; GFX9-NEXT:    s_addc_u32 s3, s5, s14
-; GFX9-NEXT:    s_xor_b64 s[4:5], s[2:3], s[14:15]
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[2:3], s[14:15]
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s4, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s4, v1
@@ -12520,10 +12520,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 1, 2, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v4, s[0:1], v0, v4
+; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s5
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[14:15], s[12:13]
 ; GFX9-NEXT:    s_ashr_i32 s12, s9, 31
-; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s8, s12
 ; GFX9-NEXT:    s_mov_b32 s13, s12
 ; GFX9-NEXT:    s_addc_u32 s1, s9, s12
@@ -12571,14 +12571,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v9, v7, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v7, v4
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v5, v8, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e64 v4, vcc, v3, v7, s[0:1]
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s10, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v9, s10, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v10, s11, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v11, s10, v2
-; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v7
 ; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
 ; GFX9-NEXT:    v_add_u32_e32 v8, v8, v10
 ; GFX9-NEXT:    v_mul_lo_u32 v12, v2, v8
@@ -12586,8 +12586,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_hi_u32 v14, v2, v8
 ; GFX9-NEXT:    v_mul_hi_u32 v10, v4, v11
 ; GFX9-NEXT:    v_mul_lo_u32 v11, v4, v11
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v13, v12
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v4, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v13, v12
 ; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v14, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v8
 ; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v11
@@ -12595,14 +12595,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v10, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v5, v8, vcc
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v7
+; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v3, v8, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s6, s10
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    s_mov_b32 s11, s10
 ; GFX9-NEXT:    s_addc_u32 s1, s7, s10
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s6, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v7, s6, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v9, s6, v3
@@ -12622,9 +12622,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_hi_u32 v5, s8, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v7, s9, v2
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s4, v0
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, v5, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s8, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
 ; GFX9-NEXT:    v_sub_u32_e32 v7, s7, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s9
@@ -12651,8 +12651,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v5, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[0:1], 0, v3, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[12:13]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[12:13]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s0, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s1, v3
@@ -12702,20 +12702,20 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, s14, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v2
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1]
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, s14, v2
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, s14, v0
@@ -12736,23 +12736,23 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v4, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, v5
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s14, s5, 31
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
 ; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_ashr_i32 s14, s5, 31
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
 ; GFX90A-NEXT:    s_add_u32 s0, s4, s14
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX90A-NEXT:    s_mov_b32 s15, s14
 ; GFX90A-NEXT:    s_addc_u32 s1, s5, s14
-; GFX90A-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, s4, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v2, s4, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s5, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s5, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v0, s5, v0
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s5, v1
@@ -12767,9 +12767,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, s13, v0
 ; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, s12, v0
-; GFX90A-NEXT:    v_sub_co_u32_e32 v5, vcc, s4, v5
 ; GFX90A-NEXT:    v_sub_u32_e32 v3, s5, v2
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, s13
+; GFX90A-NEXT:    v_sub_co_u32_e32 v5, vcc, s4, v5
 ; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v7, vcc
 ; GFX90A-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s12, v5
 ; GFX90A-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
@@ -12782,17 +12782,17 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v3, 1, 2, s[0:1]
 ; GFX90A-NEXT:    v_mov_b32_e32 v8, s5
-; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v8, v2, vcc
 ; GFX90A-NEXT:    v_add_co_u32_e64 v3, s[0:1], v0, v3
+; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v8, v2, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
 ; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s12, v5
-; GFX90A-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX90A-NEXT:    s_xor_b64 s[0:1], s[14:15], s[10:11]
-; GFX90A-NEXT:    s_add_u32 s8, s8, s4
+; GFX90A-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
+; GFX90A-NEXT:    s_add_u32 s8, s8, s4
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v8, v5, vcc
 ; GFX90A-NEXT:    s_mov_b32 s5, s4
 ; GFX90A-NEXT:    s_addc_u32 s9, s9, s4
@@ -12825,10 +12825,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_mul_lo_u32 v9, s10, v2
 ; GFX90A-NEXT:    v_mul_lo_u32 v8, v2, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v10, v2, v9
-; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v2, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v3, v9
+; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v3, v9
 ; GFX90A-NEXT:    v_mul_lo_u32 v9, v3, v9
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
 ; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v5
@@ -12836,8 +12836,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v4, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, v5
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_add_co_u32_e64 v2, s[0:1], v2, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v6, v8, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v2, s[0:1], v2, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v5, vcc, v3, v7, s[0:1]
 ; GFX90A-NEXT:    v_mul_lo_u32 v8, s10, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v9, s10, v2
@@ -12858,22 +12858,22 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v4, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v5, v8
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v10, v5
-; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v6, v9, vcc
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v7
+; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v3, v8, s[0:1]
 ; GFX90A-NEXT:    s_add_u32 s0, s6, s10
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX90A-NEXT:    s_mov_b32 s11, s10
 ; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
-; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
 ; GFX90A-NEXT:    v_mul_lo_u32 v7, s6, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v8, s6, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v2
 ; GFX90A-NEXT:    v_mul_lo_u32 v2, s7, v2
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
 ; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v3
@@ -12913,8 +12913,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v8, s[0:1], 0, v3, s[0:1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX90A-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX90A-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GFX90A-NEXT:    v_xor_b32_e32 v2, s0, v2
 ; GFX90A-NEXT:    v_xor_b32_e32 v3, s1, v3
@@ -12947,12 +12947,12 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s2
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s2
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12968,22 +12968,22 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
-; GFX6-NEXT:    s_mov_b32 s5, s9
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
 ; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v2, s2
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s2
+; GFX6-NEXT:    s_mov_b32 s5, s9
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s2
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v4
-; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v5
+; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v4
@@ -13002,8 +13002,8 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    s_mov_b32 s3, s2
 ; GFX6-NEXT:    s_addc_u32 s1, s11, s2
-; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v4, s0, v1
@@ -13019,12 +13019,12 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s3
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s3
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v0
 ; GFX6-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
@@ -13065,11 +13065,11 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s0, s7, 31
@@ -13089,8 +13089,8 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v2, s8
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s8
@@ -13103,8 +13103,8 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v8
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v2, v8
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v2, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
 ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v7, v12, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
@@ -13114,10 +13114,10 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
 ; GFX9-NEXT:    s_add_u32 s2, s6, s0
-; GFX9-NEXT:    s_addc_u32 s3, s7, s0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    s_addc_u32 s3, s7, s0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v1
@@ -13133,8 +13133,8 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v2, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s1
+; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s1
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s1
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s3
@@ -13179,20 +13179,20 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s2
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s2
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s2
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX90A-NEXT:    v_sub_u32_e32 v3, v3, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s2
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
@@ -13200,8 +13200,8 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, s2
 ; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, s2
@@ -13230,14 +13230,14 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX90A-NEXT:    s_mov_b32 s1, s0
 ; GFX90A-NEXT:    s_addc_u32 s3, s7, s0
-; GFX90A-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
 ; GFX90A-NEXT:    v_mul_lo_u32 v4, s2, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s2, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX90A-NEXT:    v_mul_hi_u32 v3, s2, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s3, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v6, s3, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v0, s3, v0
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s3, v1
@@ -13247,8 +13247,8 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v3, vcc
 ; GFX90A-NEXT:    s_mov_b32 s1, 0x12d8fb
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s1
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s1
+; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s1
 ; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s1
 ; GFX90A-NEXT:    v_add_u32_e32 v1, v3, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s3
@@ -13383,10 +13383,10 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s3, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
@@ -13404,10 +13404,10 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v6, 0
-; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
+; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
 ; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, s2, v0
@@ -13416,8 +13416,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v0
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
+; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
 ; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
 ; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
@@ -13432,10 +13432,10 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
 ; GFX6-NEXT:    s_add_u32 s0, s10, s14
-; GFX6-NEXT:    s_addc_u32 s1, s11, s14
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
+; GFX6-NEXT:    s_addc_u32 s1, s11, s14
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v5, s10, v1
@@ -13463,12 +13463,12 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
 ; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
-; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
-; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
+; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
+; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
@@ -13515,10 +13515,10 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s4, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v0
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
@@ -13528,17 +13528,17 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v7, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v2, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v7, s10, v0
@@ -13552,25 +13552,25 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v3, v9
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v3, v9
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
 ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v5
 ; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
-; GFX9-NEXT:    s_add_u32 s0, s6, s10
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3]
+; GFX9-NEXT:    s_add_u32 s0, s6, s10
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    s_mov_b32 s11, s10
 ; GFX9-NEXT:    s_addc_u32 s1, s7, s10
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s6, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
@@ -13598,12 +13598,12 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s8, v0
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v6
-; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v5
-; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v5
+; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v6
+; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
@@ -13653,31 +13653,31 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX90A-NEXT:    s_mov_b32 s11, s10
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s2, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, s2, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s2, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v4, s3, v0
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v5, v3
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, s2, v0
+; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v8, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, s2, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, s2, v0
@@ -13702,16 +13702,16 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v4
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
 ; GFX90A-NEXT:    s_add_u32 s0, s6, s10
-; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
+; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
 ; GFX90A-NEXT:    v_mul_lo_u32 v4, s6, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX90A-NEXT:    v_mul_hi_u32 v3, s6, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s7, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s7, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
@@ -13733,12 +13733,12 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX90A-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s8, v0
 ; GFX90A-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1]
 ; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v6
-; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
 ; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v5
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v5
+; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
 ; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v6
+; GFX90A-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v5
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
 ; GFX90A-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
@@ -13910,13 +13910,13 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_f32_e32 v1, s20, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
 ; GFX6-NEXT:    s_add_u32 s0, s8, s12
-; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v0
 ; GFX6-NEXT:    s_mov_b32 s13, s12
@@ -13937,10 +13937,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v6, 0
-; GFX6-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
+; GFX6-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
 ; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v0
@@ -13950,8 +13950,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_lo_u32 v7, s6, v0
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
+; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
 ; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
 ; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
@@ -13995,15 +13995,15 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s16, v0
 ; GFX6-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1]
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s17, v7
-; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v5
-; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s16, v5
+; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s17, v7
+; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s16, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
-; GFX6-NEXT:    s_ashr_i32 s2, s15, 31
 ; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GFX6-NEXT:    s_ashr_i32 s2, s15, 31
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
 ; GFX6-NEXT:    s_add_u32 s8, s14, s2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[0:1]
@@ -14015,8 +14015,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v9, s9
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v7, v1, vcc
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s17, v1
-; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX6-NEXT:    v_mac_f32_e32 v8, s18, v9
+; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v8, v8
 ; GFX6-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
@@ -14042,33 +14042,33 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; GFX6-NEXT:    v_mul_lo_u32 v8, v3, v2
-; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v3, v7
+; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v11, v5, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v5, v2
-; GFX6-NEXT:    s_mov_b32 s15, s14
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v10, v5, v7
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GFX6-NEXT:    s_mov_b32 s15, s14
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
-; GFX6-NEXT:    v_xor_b32_e32 v1, s12, v1
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GFX6-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
+; GFX6-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
 ; GFX6-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
 ; GFX6-NEXT:    v_mul_lo_u32 v8, s2, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v9, s2, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v10, s3, v2
+; GFX6-NEXT:    v_xor_b32_e32 v1, s12, v1
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GFX6-NEXT:    v_mul_lo_u32 v9, s2, v2
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; GFX6-NEXT:    v_mul_lo_u32 v12, v2, v8
-; GFX6-NEXT:    v_mul_hi_u32 v14, v2, v8
 ; GFX6-NEXT:    v_mul_hi_u32 v13, v2, v9
+; GFX6-NEXT:    v_mul_hi_u32 v14, v2, v8
 ; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v9
 ; GFX6-NEXT:    v_mul_lo_u32 v9, v3, v9
 ; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v8
@@ -14085,8 +14085,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    s_add_u32 s0, s10, s14
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    s_addc_u32 s1, s11, s14
-; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s10, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v7, s10, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v9, s10, v3
@@ -14117,12 +14117,12 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s8, v2
 ; GFX6-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v7
-; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v6
-; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v6
+; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v7
+; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
 ; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
@@ -14174,10 +14174,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s8, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
@@ -14194,10 +14194,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v7, s8, v0
@@ -14211,25 +14211,25 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v9
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v9
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
 ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
 ; GFX9-NEXT:    s_add_u32 s2, s4, s8
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    s_addc_u32 s3, s5, s8
 ; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_xor_b64 s[14:15], s[2:3], s[8:9]
+; GFX9-NEXT:    s_addc_u32 s3, s5, s8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_xor_b64 s[14:15], s[2:3], s[8:9]
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s14, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v1
@@ -14258,12 +14258,12 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s12, v0
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[2:3], 0, v2, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v7
-; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
-; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s12, v4
+; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v7
+; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s12, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
@@ -14273,8 +14273,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_add_u32 s2, s10, s0
 ; GFX9-NEXT:    s_mov_b32 s1, s0
 ; GFX9-NEXT:    s_addc_u32 s3, s11, s0
-; GFX9-NEXT:    s_xor_b64 s[10:11], s[2:3], s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s15
+; GFX9-NEXT:    s_xor_b64 s[10:11], s[2:3], s[0:1]
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s10
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s11
@@ -14318,8 +14318,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v9, v2, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v5, v8, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v4, v7, s[0:1]
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v9, s2, v2
@@ -14333,8 +14333,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_hi_u32 v14, v2, v8
 ; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v11
 ; GFX9-NEXT:    v_mul_lo_u32 v11, v3, v11
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v13, v12
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v13, v12
 ; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v14, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v8
 ; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v11
@@ -14346,8 +14346,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_add_u32 s0, s6, s12
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    s_addc_u32 s1, s7, s12
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s6, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v7, s6, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v9, s6, v3
@@ -14380,12 +14380,12 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s10, v2
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[2:3], 0, v4, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s11, v8
-; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s10, v7
-; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s10, v7
+; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s11, v8
+; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s10, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[2:3]
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
@@ -14453,20 +14453,20 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, s2, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
 ; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v2
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1]
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, s2, v2
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, s2, v0
@@ -14491,16 +14491,16 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
 ; GFX90A-NEXT:    s_add_u32 s0, s4, s14
-; GFX90A-NEXT:    s_addc_u32 s1, s5, s14
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX90A-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
+; GFX90A-NEXT:    s_addc_u32 s1, s5, s14
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, s4, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v2, s4, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s5, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s5, v0
 ; GFX90A-NEXT:    v_mul_lo_u32 v0, s5, v0
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s5, v1
@@ -14522,12 +14522,12 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s12, v0
 ; GFX90A-NEXT:    v_subbrev_co_u32_e64 v7, s[2:3], 0, v2, s[0:1]
 ; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v7
-; GFX90A-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
 ; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v5
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s12, v5
+; GFX90A-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
 ; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v7
+; GFX90A-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s12, v5
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
 ; GFX90A-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
@@ -14536,12 +14536,12 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
 ; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[0:1]
-; GFX90A-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
-; GFX90A-NEXT:    s_add_u32 s2, s10, s0
+; GFX90A-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
+; GFX90A-NEXT:    s_add_u32 s2, s10, s0
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX90A-NEXT:    s_mov_b32 s1, s0
 ; GFX90A-NEXT:    s_addc_u32 s3, s11, s0
@@ -14553,10 +14553,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s5
 ; GFX90A-NEXT:    v_xor_b32_e32 v0, s14, v0
 ; GFX90A-NEXT:    s_sub_u32 s2, 0, s4
-; GFX90A-NEXT:    s_subb_u32 s3, 0, s5
+; GFX90A-NEXT:    v_xor_b32_e32 v1, s14, v1
 ; GFX90A-NEXT:    v_mac_f32_e32 v2, s16, v3
 ; GFX90A-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s14, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, s14
 ; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s14, v0
 ; GFX90A-NEXT:    v_mul_f32_e32 v2, s17, v2
@@ -14565,8 +14564,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_mac_f32_e32 v2, s19, v3
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX90A-NEXT:    s_subb_u32 s3, 0, s5
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, s2, v2
 ; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v3
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, s3, v2
@@ -14575,10 +14574,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_mul_lo_u32 v9, s2, v2
 ; GFX90A-NEXT:    v_mul_lo_u32 v8, v2, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v10, v2, v9
-; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v2, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v3, v9
+; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v3, v9
 ; GFX90A-NEXT:    v_mul_lo_u32 v9, v3, v9
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
 ; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v5
@@ -14586,8 +14585,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v4, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, v5
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_add_co_u32_e64 v2, s[0:1], v2, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v6, v8, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v2, s[0:1], v2, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v5, vcc, v3, v7, s[0:1]
 ; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v5
 ; GFX90A-NEXT:    v_mul_hi_u32 v9, s2, v2
@@ -14610,19 +14609,20 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v10, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v6, v9, vcc
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v7
+; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
 ; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v3, v8, s[0:1]
 ; GFX90A-NEXT:    s_add_u32 s0, s6, s10
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX90A-NEXT:    s_mov_b32 s11, s10
 ; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
-; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
 ; GFX90A-NEXT:    v_mul_lo_u32 v7, s6, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v8, s6, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v2
 ; GFX90A-NEXT:    v_mul_lo_u32 v2, s7, v2
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
 ; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v3
@@ -14644,12 +14644,12 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s4, v2
 ; GFX90A-NEXT:    v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1]
 ; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s5, v8
-; GFX90A-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
 ; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s4, v7
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s4, v7
+; GFX90A-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[2:3]
 ; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[2:3], s5, v8
+; GFX90A-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s4, v7
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[2:3]
 ; GFX90A-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
index d37b4052acda9..e5321c8e53409 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
@@ -5,8 +5,8 @@ define weak_odr amdgpu_kernel void @test_mul24_knownbits_kernel(float addrspace(
 ; GCN-LABEL: test_mul24_knownbits_kernel:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
-; GCN-NEXT:    v_mul_i32_i24_e32 v0, -5, v0
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT:    v_mul_i32_i24_e32 v0, -5, v0
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffffffe0, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]

diff  --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll
index 0d2989da8655d..d744a6cd296d5 100644
--- a/llvm/test/CodeGen/AMDGPU/anyext.ll
+++ b/llvm/test/CodeGen/AMDGPU/anyext.ll
@@ -65,11 +65,11 @@ define amdgpu_kernel void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspac
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
 ; GCN-NEXT:    s_mov_b32 s14, 0
 ; GCN-NEXT:    s_mov_b32 s15, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
 ; GCN-NEXT:    s_mov_b64 s[2:3], s[14:15]
@@ -94,10 +94,10 @@ define amdgpu_kernel void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspac
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s6, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s6, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s8, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
index 5ac3daf657792..3cd866795c6f2 100644
--- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
@@ -38,9 +38,9 @@ define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, i32, <2 x
 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 ; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
-; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], [[RHS]]
 ; CI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, [[LHS]]
+; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], [[RHS]]
 ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; CI: v_ashr_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CI: v_ashr_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 80fca55df019f..0cff33934f95d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -822,9 +822,9 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX8-NEXT:    s_mov_b32 s13, s7
 ; GFX8-NEXT:    s_mul_i32 s7, s1, s6
 ; GFX8-NEXT:    s_mul_i32 s6, s0, s6
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s7, v0
 ; GFX8-NEXT:    s_mov_b32 s15, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s14, -1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s7, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
@@ -877,15 +877,15 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  BB4_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v2
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
@@ -1755,8 +1755,8 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX8-NEXT:  BB9_2:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
@@ -1794,8 +1794,8 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX9-NEXT:  BB9_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
@@ -1841,9 +1841,9 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
 ; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
@@ -1881,9 +1881,9 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
 ; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 entry:
@@ -1959,9 +1959,9 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX8-NEXT:    s_mov_b32 s13, s7
 ; GFX8-NEXT:    s_mul_i32 s7, s1, s6
 ; GFX8-NEXT:    s_mul_i32 s6, s0, s6
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s7, v0
 ; GFX8-NEXT:    s_mov_b32 s15, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s14, -1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s7, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
@@ -2014,15 +2014,15 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  BB10_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v2
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index dfacab48c45d3..bc2a7c884bdfb 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -979,8 +979,8 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX8-NEXT:    v_mul_lo_u32 v1, s2, v2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v1
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v2, vcc
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1013,14 +1013,14 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v2
 ; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -1958,8 +1958,8 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
@@ -1991,8 +1991,8 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
@@ -2030,9 +2030,9 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
 ; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
@@ -2064,9 +2064,9 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
 ; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
@@ -2158,8 +2158,8 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX8-NEXT:    v_mul_lo_u32 v1, s2, v2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v1
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v2, vcc
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -2192,14 +2192,14 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v2
 ; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -4189,8 +4189,8 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
 ; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
@@ -4222,8 +4222,8 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
 ; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
@@ -4594,8 +4594,8 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
 ; GFX8-NEXT:  BB24_2:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
 ; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
@@ -4627,8 +4627,8 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
 ; GFX9-NEXT:  BB24_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 765a681982164..2aae74dc16083 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -311,8 +311,8 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in
 ; GFX1064-NEXT:    v_readlane_b32 s12, v1, 31
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s12, v1, 15
 ; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064-NEXT:    v_readlane_b32 s12, v1, 15
 ; GFX1064-NEXT:    v_readlane_b32 s13, v1, 31
 ; GFX1064-NEXT:    v_writelane_b32 v3, s12, 16
 ; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]

diff  --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
index 99ccc1d453193..b4d7b3184a95d 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -307,8 +307,8 @@ define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v2, s2

diff  --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll
index d40a6de631837..6a9839633e77a 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll
@@ -6,6 +6,7 @@
 
 ; GCN-LABEL: {{^}}long_forward_branch_gfx10only:
 ; GFX9: s_cmp_eq_u32
+; GFX9: s_load_dwordx2
 ; GFX9-NEXT: s_cbranch_scc1
 
 ; GFX10: s_cmp_eq_u32

diff  --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
index b168f5747f16e..880dcbe8296f6 100644
--- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll
+++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
@@ -19,25 +19,25 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, v3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v15, 0
 ; GFX9-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX9-NEXT:    v_mov_b32_e32 v15, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GFX9-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
 ; GFX9-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX9-NEXT:    v_mul_lo_u32 v11, v7, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v8, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v10, v7, v5
-; GFX9-NEXT:    v_mul_lo_u32 v11, v7, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v12, v7, v5
 ; GFX9-NEXT:    v_add3_u32 v9, v10, v11, v9
 ; GFX9-NEXT:    v_mul_lo_u32 v11, v5, v9
@@ -53,12 +53,12 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v10, v12, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v16, v14, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[4:5], v5, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v11, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v5, s[4:5], v5, v9
 ; GFX9-NEXT:    v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5]
 ; GFX9-NEXT:    v_mul_lo_u32 v11, v7, v9
-; GFX9-NEXT:    v_mul_hi_u32 v12, v7, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v8, v5
+; GFX9-NEXT:    v_mul_hi_u32 v12, v7, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v5
 ; GFX9-NEXT:    v_add_u32_e32 v6, v6, v10
 ; GFX9-NEXT:    v_add3_u32 v8, v12, v11, v8
@@ -67,8 +67,8 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mul_hi_u32 v17, v5, v8
 ; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v7
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v9, v7
-; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v16, v13
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v9, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v16, v13
 ; GFX9-NEXT:    v_addc_co_u32_e32 v16, vcc, v15, v17, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v9, v8
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v13, v7
@@ -113,9 +113,9 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v11, v10, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e64 v10, s[4:5], 2, v5
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[4:5], 0, v6, s[4:5]
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GFX9-NEXT:    v_add_co_u32_e64 v12, s[4:5], 1, v5
@@ -156,8 +156,8 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
@@ -188,18 +188,18 @@ define i64 @udiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, 0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v4
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v5
+; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v10, v6, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v11, v6, v4
 ; GFX9-NEXT:    v_add3_u32 v8, v10, v8, v9
@@ -216,12 +216,12 @@ define i64 @udiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v10, v11, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v12, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v13, v10, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v8
 ; GFX9-NEXT:    v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5]
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v6, v8
-; GFX9-NEXT:    v_mul_hi_u32 v11, v6, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v4
+; GFX9-NEXT:    v_mul_hi_u32 v11, v6, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, v4
 ; GFX9-NEXT:    v_add_u32_e32 v5, v5, v9
 ; GFX9-NEXT:    v_add3_u32 v7, v11, v10, v7
@@ -271,9 +271,9 @@ define i64 @udiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v8, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e64 v8, s[4:5], 2, v4
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], 0, v5, s[4:5]
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GFX9-NEXT:    v_add_co_u32_e64 v10, s[4:5], 1, v4
@@ -284,11 +284,11 @@ define i64 @udiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v9, s[4:5]
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v10, v8, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
 ; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT:  BB1_2: ; %Flow
@@ -309,8 +309,8 @@ define i64 @udiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
@@ -346,19 +346,19 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, 0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX9-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX9-NEXT:    v_mul_lo_u32 v10, v6, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v7, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v6, v4
-; GFX9-NEXT:    v_mul_lo_u32 v10, v6, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v11, v6, v4
 ; GFX9-NEXT:    v_add3_u32 v8, v9, v10, v8
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v8
@@ -374,12 +374,12 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v13, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v14, v10, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v8
 ; GFX9-NEXT:    v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5]
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v6, v8
-; GFX9-NEXT:    v_mul_hi_u32 v11, v6, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v4
+; GFX9-NEXT:    v_mul_hi_u32 v11, v6, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, v4
 ; GFX9-NEXT:    v_add_u32_e32 v5, v5, v9
 ; GFX9-NEXT:    v_add3_u32 v7, v11, v10, v7
@@ -388,8 +388,8 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mul_hi_u32 v16, v4, v7
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v8, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v8, v6
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v15, v12
 ; GFX9-NEXT:    v_mul_hi_u32 v10, v8, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v15, v12
 ; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, v14, v16, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v8, v7
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v6
@@ -420,8 +420,8 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v14, v7, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v3, v4
-; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v2, v5
+; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v2, v4
 ; GFX9-NEXT:    v_add3_u32 v5, v8, v5, v7
 ; GFX9-NEXT:    v_sub_u32_e32 v7, v1, v5
@@ -430,13 +430,13 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_sub_co_u32_e64 v7, s[4:5], v0, v2
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[6:7], 0, v4, s[4:5]
 ; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v8, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
 ; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[6:7]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v8, v3
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[4:5], v4, v3, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[6:7]
 ; GFX9-NEXT:    v_sub_co_u32_e64 v10, s[4:5], v7, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
@@ -444,12 +444,12 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, v10, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v6
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v6
@@ -505,18 +505,18 @@ define i64 @urem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, 0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v4
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v5
+; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v10, v6, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v11, v6, v4
 ; GFX9-NEXT:    v_add3_u32 v8, v10, v8, v9
@@ -533,12 +533,12 @@ define i64 @urem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v10, v11, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v12, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v13, v10, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v8
 ; GFX9-NEXT:    v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5]
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v6, v8
-; GFX9-NEXT:    v_mul_hi_u32 v11, v6, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v4
+; GFX9-NEXT:    v_mul_hi_u32 v11, v6, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, v4
 ; GFX9-NEXT:    v_add_u32_e32 v5, v5, v9
 ; GFX9-NEXT:    v_add3_u32 v7, v11, v10, v7
@@ -574,8 +574,8 @@ define i64 @urem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v13, v6, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v3, v4
-; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v2, v5
+; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v2, v4
 ; GFX9-NEXT:    v_add3_u32 v5, v7, v5, v6
 ; GFX9-NEXT:    v_sub_u32_e32 v6, v1, v5
@@ -584,13 +584,13 @@ define i64 @urem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_sub_co_u32_e64 v6, s[4:5], v0, v2
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[6:7], 0, v4, s[4:5]
 ; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
 ; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v7, v3
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[4:5], v4, v3, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[6:7]
 ; GFX9-NEXT:    v_sub_co_u32_e64 v9, s[4:5], v6, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
@@ -598,10 +598,10 @@ define i64 @urem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v8
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[4:5]
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v1, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v6, v9, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
@@ -780,25 +780,25 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, v3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v15, 0
 ; GFX9-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX9-NEXT:    v_mov_b32_e32 v15, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GFX9-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
 ; GFX9-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX9-NEXT:    v_mul_lo_u32 v11, v7, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v8, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v10, v7, v5
-; GFX9-NEXT:    v_mul_lo_u32 v11, v7, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v12, v7, v5
 ; GFX9-NEXT:    v_add3_u32 v9, v10, v11, v9
 ; GFX9-NEXT:    v_mul_lo_u32 v11, v5, v9
@@ -814,12 +814,12 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v10, v12, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v16, v14, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[4:5], v5, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v11, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v5, s[4:5], v5, v9
 ; GFX9-NEXT:    v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5]
 ; GFX9-NEXT:    v_mul_lo_u32 v11, v7, v9
-; GFX9-NEXT:    v_mul_hi_u32 v12, v7, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v8, v5
+; GFX9-NEXT:    v_mul_hi_u32 v12, v7, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v5
 ; GFX9-NEXT:    v_add_u32_e32 v6, v6, v10
 ; GFX9-NEXT:    v_add3_u32 v8, v12, v11, v8
@@ -828,8 +828,8 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mul_hi_u32 v17, v5, v8
 ; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v7
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v9, v7
-; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v16, v13
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v9, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v16, v13
 ; GFX9-NEXT:    v_addc_co_u32_e32 v16, vcc, v15, v17, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v9, v8
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v13, v7
@@ -877,8 +877,8 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[6:7]
 ; GFX9-NEXT:    v_add_co_u32_e64 v13, s[6:7], 2, v5
 ; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[6:7], 0, v6, s[6:7]
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
 ; GFX9-NEXT:    v_add_co_u32_e64 v15, s[6:7], 1, v5
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v16, s[6:7], 0, v6, s[6:7]
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v12
@@ -889,20 +889,20 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v14, vcc
 ; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[4:5], v9, v3, s[4:5]
-; GFX9-NEXT:    v_sub_co_u32_e64 v2, s[4:5], v10, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v15, v13, s[6:7]
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[4:5], 0, v3, s[4:5]
+; GFX9-NEXT:    v_sub_co_u32_e64 v2, s[4:5], v10, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v8, v7, v4
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[4:5], 0, v3, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v12, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v6, v6, v8
 ; GFX9-NEXT:    v_sub_co_u32_e64 v4, s[8:9], v4, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v7
 ; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[8:9], v6, v8, s[8:9]
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v7
@@ -929,13 +929,13 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v4, vcc
 ; GFX9-NEXT:  BB8_4:
@@ -969,18 +969,18 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, 0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v4
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v5
+; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v10, v6, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v11, v6, v4
 ; GFX9-NEXT:    v_add3_u32 v8, v10, v8, v9
@@ -997,12 +997,12 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v10, v11, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v12, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v13, v10, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v8
 ; GFX9-NEXT:    v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5]
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v6, v8
-; GFX9-NEXT:    v_mul_hi_u32 v11, v6, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v4
+; GFX9-NEXT:    v_mul_hi_u32 v11, v6, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, v4
 ; GFX9-NEXT:    v_add_u32_e32 v5, v5, v9
 ; GFX9-NEXT:    v_add3_u32 v7, v11, v10, v7
@@ -1055,8 +1055,8 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[6:7]
 ; GFX9-NEXT:    v_add_co_u32_e64 v11, s[6:7], 2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e64 v12, s[6:7], 0, v5, s[6:7]
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e64 v13, s[6:7], 1, v4
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[6:7], 0, v5, s[6:7]
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v10
@@ -1074,9 +1074,9 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v13, v11, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v8, v2, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
 ; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT:  BB9_2: ; %Flow
@@ -1098,13 +1098,13 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v4, vcc
 ; GFX9-NEXT:  BB9_4:

diff  --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index 3a5ed70f24a69..afc56bf798778 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -9,7 +9,6 @@ declare hidden void @external_void_func_void() #0
 ; GCN: s_getpc_b64 s[34:35]
 ; GCN-NEXT: s_add_u32 s34, s34,
 ; GCN-NEXT: s_addc_u32 s35, s35,
-; GCN-NEXT: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64 s[30:31], s[34:35]
 
 ; GCN-NEXT: #ASMSTART
@@ -96,10 +95,10 @@ define hidden void @void_func_void_clobber_vcc() #2 {
 }
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_vcc:
-; GCN: s_getpc_b64
+; GCN: s_mov_b64 s[34:35], vcc
+; GCN-NEXT: s_getpc_b64
 ; GCN-NEXT: s_add_u32
 ; GCN-NEXT: s_addc_u32
-; GCN: s_mov_b64 s[34:35], vcc
 ; GCN-NEXT: s_swappc_b64
 ; GCN: s_mov_b64 vcc, s[34:35]
 define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(i32 addrspace(1)* %out) #0 {
@@ -113,7 +112,7 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(i32 addrspace(1)
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31:
 ; GCN: s_mov_b32 s33, s31
-; GCN-NEXT: s_swappc_b64
+; GCN: s_swappc_b64
 ; GCN-NEXT: s_mov_b32 s31, s33
 define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)* %out) #0 {
   %s31 = call i32 asm sideeffect "; def $0", "={s31}"()
@@ -124,7 +123,7 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31:
 ; GCN: v_mov_b32_e32 v40, v31
-; GCN-NEXT: s_swappc_b64
+; GCN: s_swappc_b64
 ; GCN-NEXT: v_mov_b32_e32 v31, v40
 define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 {
   %v31 = call i32 asm sideeffect "; def $0", "={v31}"()
@@ -136,18 +135,17 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace
 ; FIXME: What is the expected behavior for reserved registers here?
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
-; MUBUF:        s_getpc_b64 s[4:5]
-; MUBUF-NEXT:   s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; MUBUF-NEXT:   s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; FLATSCR:      s_getpc_b64 s[0:1]
-; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
-; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; GCN: s_mov_b32 s32, 0
 ; GCN: #ASMSTART
 ; GCN-NEXT: ; def s33
 ; GCN-NEXT: #ASMEND
-; MUBUF:   s_swappc_b64 s[30:31], s[4:5]
-; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR:      s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; MUBUF:        s_getpc_b64 s[4:5]
+; MUBUF-NEXT:   s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT:   s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT:   s_swappc_b64 s[30:31], s[4:5]
 ; GCN: ;;#ASMSTART
 ; GCN-NEXT: ; use s33
 ; GCN-NEXT: ;;#ASMEND
@@ -163,18 +161,18 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(
 ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}}
 ; GCN-NOT: s34
 
-; MUBUF:        s_getpc_b64 s[4:5]
-; MUBUF-NEXT:   s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; MUBUF-NEXT:   s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; FLATSCR:      s_getpc_b64 s[0:1]
-; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
-; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
 ; GCN: s_mov_b32 s32, 0
 
 ; GCN-NOT: s34
 ; GCN: ;;#ASMSTART
 ; GCN-NEXT: ; def s34
 ; GCN-NEXT: ;;#ASMEND
+; FLATSCR:      s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; MUBUF:        s_getpc_b64 s[4:5]
+; MUBUF-NEXT:   s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT:   s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
 
 ; GCN-NOT: s34
 ; MUBUF:   s_swappc_b64 s[30:31], s[4:5]
@@ -196,18 +194,18 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
 ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}}
 
 ; GCN-NOT: v32
-; MUBUF: s_getpc_b64 s[4:5]
-; MUBUF-NEXT:   s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; MUBUF-NEXT:   s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; FLATSCR:      s_getpc_b64 s[0:1]
-; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
-; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
 ; GCN: s_mov_b32 s32, 0
 ; GCN-NOT: v40
 
 ; GCN: ;;#ASMSTART
 ; GCN-NEXT: ; def v40
 ; GCN-NEXT: ;;#ASMEND
+; MUBUF: s_getpc_b64 s[4:5]
+; MUBUF-NEXT:   s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT:   s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; FLATSCR:      s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
 
 ; MUBUF:   s_swappc_b64 s[30:31], s[4:5]
 ; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
@@ -250,10 +248,10 @@ define hidden void @void_func_void_clobber_s34() #2 {
 }
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33:
+; GCN: s_mov_b32 s32, 0
 ; GCN: s_getpc_b64
 ; GCN-NEXT: s_add_u32
 ; GCN-NEXT: s_addc_u32
-; GCN-NEXT: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 {
@@ -262,10 +260,10 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 {
 }
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s34:
+; GCN: s_mov_b32 s32, 0
 ; GCN: s_getpc_b64
 ; GCN-NEXT: s_add_u32
 ; GCN-NEXT: s_addc_u32
-; GCN-NEXT: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 {

diff  --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
index 15839879e2b86..d3e2222063b11 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -13,10 +13,10 @@ define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    ds_read_b32 v0, v0
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, func@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, func@rel32@hi+12
-; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    s_endpgm
   %vgpr = load volatile i32, i32 addrspace(3)* %ptr
@@ -37,10 +37,10 @@ define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 {
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    global_store_dword v0, v0, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_getpc_b64 s[6:7]
 ; GCN-NEXT:    s_add_u32 s6, s6, func@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s7, s7, func@rel32@hi+12
-; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT:    s_endpgm
   store i32 0, i32 addrspace(1)* %ptr
@@ -58,10 +58,10 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32)
 ; GCN-NEXT:    s_add_u32 s0, s0, s17
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, func@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, func@rel32@hi+12
-; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    v_mov_b32_e32 v40, 0
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    global_store_dword v40, v40, s[34:35]
@@ -80,10 +80,10 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)*
 ; GCN-NEXT:    s_add_u32 s0, s0, s17
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, func.return@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, func.return@rel32@hi+12
-; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    v_mov_b32_e32 v40, 0
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    global_store_dword v40, v0, s[34:35]

diff  --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index 52aa1544c0098..9c05d5880572e 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -276,9 +276,10 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
 ; GCN-NEXT: v_writelane_b32 v0, s33, 63
 ; GCN-COUNT-60: v_writelane_b32 v0
 ; GCN: s_mov_b32 s33, s32
-; GCN-COUNT-2: v_writelane_b32 v0
+; GCN: v_writelane_b32 v0
 ; MUBUF:   buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GCN: v_writelane_b32 v0
 ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8
 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:8
 ; GCN: ;;#ASMSTART
@@ -318,12 +319,14 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
 ; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:12 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN-COUNT-62: v_writelane_b32 v0,
-; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN: v_writelane_b32 v0,
+; GCN-COUNT-61: v_writelane_b32 v0,
+; FLATSCR: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
+; FLATSCR-NEXT: s_mov_b32 s33, s32
+; MUBUF: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
+; MUBUF-NEXT: s_mov_b32 s33, s32
 ; MUBUF:   buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GCN: v_writelane_b32 v0,
 ; MUBUF:   buffer_store_dword
 ; FLATSCR: scratch_store_dword
 ; GCN: ;;#ASMSTART
@@ -389,8 +392,8 @@ define void @realign_stack_no_fp_elim() #1 {
 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-NEXT: v_writelane_b32 v0, s33, 2
-; GCN-NEXT: v_writelane_b32 v0, s30, 0
 ; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: v_writelane_b32 v0, s30, 0
 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
 ; GCN: v_writelane_b32 v0, s31, 1
 ; MUBUF:   buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4
@@ -435,8 +438,8 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2
-; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0
 ; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0
 
 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
 ; MUBUF-DAG:   buffer_store_dword
@@ -672,8 +675,8 @@ define void @callee_need_to_spill_fp_to_reg() #1 {
 ; GCN-NOT: v_mov_b32_e32 v0, 0x100c
 ; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40300
 ; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
-; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1004
 ; FLATSCR: v_mov_b32_e32 v0, 0
+; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1004
 ; FLATSCR: scratch_store_dword off, v0, [[SOFF]]
 define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval([4096 x i8]) align 4 %arg) #3 {
   %alloca = alloca i32, addrspace(5)

diff  --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 968dbf20eb929..d4052cb8db02c 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -238,8 +238,8 @@ define hidden void @use_every_sgpr_input() #1 {
 }
 
 ; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input:
-; GCN: s_mov_b32 s12, s14
 ; GCN: s_mov_b32 s13, s15
+; GCN: s_mov_b32 s12, s14
 ; GCN: s_mov_b32 s14, s16
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64

diff  --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
index f03f299ca301f..a6ba6a16223fc 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
@@ -196,10 +196,10 @@ define hidden void @use_workgroup_id_yz() #1 {
 
 ; GCN-NOT: s6
 ; GCN: s_mov_b32 s4, s6
-; GCN-NEXT: s_getpc_b64 s[6:7]
+; GCN: s_mov_b32 s32, 0
+; GCN: s_getpc_b64 s[6:7]
 ; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x@rel32@hi+12
-; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 {
@@ -254,8 +254,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 {
 ; GCN: enable_sgpr_workgroup_id_y = 1
 ; GCN: enable_sgpr_workgroup_id_z = 1
 
-; GCN: s_mov_b32 s4, s6
 ; GCN: s_mov_b32 s5, s7
+; GCN: s_mov_b32 s4, s6
 ; GCN: s_mov_b32 s6, s8
 
 ; GCN: s_mov_b32 s32, 0
@@ -285,8 +285,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 {
 ; GCN: enable_sgpr_workgroup_id_y = 1
 ; GCN: enable_sgpr_workgroup_id_z = 1
 
-; GCN: s_mov_b32 s4, s7
 ; GCN: s_mov_b32 s5, s8
+; GCN: s_mov_b32 s4, s7
 
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -449,8 +449,8 @@ define hidden void @use_every_sgpr_input() #1 {
 ; GCN: enable_sgpr_dispatch_id = 1
 ; GCN: enable_sgpr_flat_scratch_init = 1
 
-; GCN: s_mov_b32 s12, s14
 ; GCN: s_mov_b32 s13, s15
+; GCN: s_mov_b32 s12, s14
 ; GCN: s_mov_b32 s14, s16
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -550,18 +550,14 @@ define hidden void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
 ; GCN-DAG: s_mov_b64 s{{\[}}[[LO_X:[0-9]+]]{{\:}}[[HI_X:[0-9]+]]{{\]}}, s[4:5]
 ; GCN-DAG: s_mov_b64 s{{\[}}[[LO_Y:[0-9]+]]{{\:}}[[HI_Y:[0-9]+]]{{\]}}, s[6:7]
 
-
 ; GCN: s_mov_b32 s4, s12
 ; GCN: s_mov_b32 s5, s13
 ; GCN: s_mov_b32 s6, s14
 
-; GCN: s_mov_b64 s{{\[}}[[LO_Z:[0-9]+]]{{\:}}[[HI_Z:[0-9]+]]{{\]}}, s[8:9]
-
-; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s12
-; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-57-9][0-9]*]], s13
 ; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-68-9][0-9]*]], s14
-
-
+; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-57-9][0-9]*]], s13
+; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s12
+; GCN: s_mov_b64 s{{\[}}[[LO_Z:[0-9]+]]{{\:}}[[HI_Z:[0-9]+]]{{\]}}, s[8:9]
 
 ; GCN: s_swappc_b64
 

diff  --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index a373442364055..225099caff519 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -640,8 +640,8 @@ define void @too_many_args_use_workitem_id_x_byval(
 ; FIXEDABI-NOT: v2
 ; FIXEDABI: v_mov_b32_e32 v31, v0
 ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
-; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
 ; FIXEDABI: s_movk_i32 s32, 0x400{{$}}
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
 ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
 
 ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
@@ -787,14 +787,14 @@ define void @too_many_args_use_workitem_id_xyz(
 ; GCN-DAG: s_mov_b32 s32, 0
 
 ; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140
 ; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
 ; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
 ; VARABI-DAG: v_or_b32_e32 [[PACKEDID:v[0-9]+]], [[TMP2]], [[TMP0]]
 ; VARABI: buffer_store_dword [[PACKEDID]], off, s[0:3], s32{{$}}
 
-; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
-; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140
 ; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
+; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
 
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {

diff  --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
index 9aa1af08e0837..2529cebbf1f47 100644
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -14,8 +14,8 @@ entry:
 
 ; GCN-LABEL: {{^}}stored_fi_to_lds:
 ; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
-; GCN: buffer_store_dword v{{[0-9]+}}, off,
 ; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off,
 ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
 ; GCN: ds_write_b32  [[VLDSPTR]], [[ZERO0]]
 define amdgpu_kernel void @stored_fi_to_lds(float addrspace(5)* addrspace(3)* %ptr) #0 {

diff  --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index c6ed23e1ae2a2..50f683c8c4951 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -59,11 +59,11 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
 ; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_add_u32 s0, s0, s15
 ; GFX803-NEXT:    s_addc_u32 s1, s1, 0
+; GFX803-NEXT:    s_mov_b32 s32, 0
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT:    s_getpc_b64 s[4:5]
 ; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
 ; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
-; GFX803-NEXT:    s_mov_b32 s32, 0
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX803-NEXT:    s_endpgm
 ;
@@ -73,10 +73,10 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
 ; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX900-NEXT:    s_add_u32 s0, s0, s15
 ; GFX900-NEXT:    s_addc_u32 s1, s1, 0
+; GFX900-NEXT:    s_mov_b32 s32, 0
 ; GFX900-NEXT:    s_getpc_b64 s[4:5]
 ; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
 ; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
-; GFX900-NEXT:    s_mov_b32 s32, 0
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX900-NEXT:    s_endpgm
 ;
@@ -107,13 +107,13 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX803-NEXT:    s_add_u32 s0, s0, s15
 ; GFX803-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX803-NEXT:    v_mov_b32_e32 v0, 0
-; GFX803-NEXT:    s_getpc_b64 s[4:5]
-; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
-; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
 ; GFX803-NEXT:    s_movk_i32 s32, 0x400
 ; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_getpc_b64 s[4:5]
+; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
 ; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX803-NEXT:    s_endpgm
 ;
@@ -124,12 +124,12 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX900-NEXT:    s_add_u32 s0, s0, s15
 ; GFX900-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX900-NEXT:    v_mov_b32_e32 v0, 0
-; GFX900-NEXT:    s_getpc_b64 s[4:5]
-; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
-; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
 ; GFX900-NEXT:    s_movk_i32 s32, 0x400
 ; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_getpc_b64 s[4:5]
+; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX900-NEXT:    s_endpgm
 ;
@@ -219,12 +219,12 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_add_u32 s0, s0, s15
 ; GFX803-NEXT:    s_addc_u32 s1, s1, 0
-; GFX803-NEXT:    s_getpc_b64 s[4:5]
-; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
-; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
 ; GFX803-NEXT:    s_mov_b32 s32, 0
 ; GFX803-NEXT:    s_mov_b32 s33, 0
 ; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_getpc_b64 s[4:5]
+; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
 ; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX803-NEXT:    s_endpgm
 ;
@@ -234,11 +234,11 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX900-NEXT:    s_add_u32 s0, s0, s15
 ; GFX900-NEXT:    s_addc_u32 s1, s1, 0
+; GFX900-NEXT:    s_mov_b32 s32, 0
+; GFX900-NEXT:    s_mov_b32 s33, 0
 ; GFX900-NEXT:    s_getpc_b64 s[4:5]
 ; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
 ; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
-; GFX900-NEXT:    s_mov_b32 s32, 0
-; GFX900-NEXT:    s_mov_b32 s33, 0
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX900-NEXT:    s_endpgm
 ;
@@ -268,16 +268,16 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX803-NEXT:    s_add_i32 s10, s10, s15
 ; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_add_u32 s0, s0, s15
-; GFX803-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX803-NEXT:    s_mov_b32 s33, 0
+; GFX803-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX803-NEXT:    v_mov_b32_e32 v0, 0
-; GFX803-NEXT:    s_getpc_b64 s[4:5]
-; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
-; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
 ; GFX803-NEXT:    s_movk_i32 s32, 0x400
 ; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_getpc_b64 s[4:5]
+; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
 ; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX803-NEXT:    s_endpgm
 ;
@@ -286,15 +286,15 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX900-NEXT:    s_add_u32 s0, s0, s15
-; GFX900-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX900-NEXT:    s_mov_b32 s33, 0
+; GFX900-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX900-NEXT:    v_mov_b32_e32 v0, 0
-; GFX900-NEXT:    s_getpc_b64 s[4:5]
-; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
-; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
 ; GFX900-NEXT:    s_movk_i32 s32, 0x400
 ; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_getpc_b64 s[4:5]
+; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX900-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index 45b6c44f9ae49..8fab3d392c98f 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -59,18 +59,18 @@ define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noa
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX10-NEXT:    s_add_u32 s0, s2, 24
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    s_addc_u32 s1, s3, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s6
-; GFX10-NEXT:    v_mov_b32_e32 v7, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s7
+; GFX10-NEXT:    v_mov_b32_e32 v7, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s0
-; GFX10-NEXT:    s_add_u32 s0, s4, 8
 ; GFX10-NEXT:    s_clause 0x3
 ; GFX10-NEXT:    flat_load_dword v8, v[0:1]
 ; GFX10-NEXT:    flat_load_dword v9, v[2:3]
 ; GFX10-NEXT:    flat_load_dword v10, v[4:5]
 ; GFX10-NEXT:    flat_load_dword v11, v[6:7]
+; GFX10-NEXT:    s_add_u32 s0, s4, 8
 ; GFX10-NEXT:    s_addc_u32 s1, s5, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
@@ -78,11 +78,11 @@ define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noa
 ; GFX10-NEXT:    s_add_u32 s0, s4, 16
 ; GFX10-NEXT:    s_addc_u32 s1, s5, 0
 ; GFX10-NEXT:    s_add_u32 s2, s4, 24
-; GFX10-NEXT:    s_addc_u32 s3, s5, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    s_addc_u32 s3, s5, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s1
-; GFX10-NEXT:    v_mov_b32_e32 v7, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s0
+; GFX10-NEXT:    v_mov_b32_e32 v7, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v8
@@ -169,26 +169,26 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(i32* noalias %lb, i32
 ; GFX10-NEXT:    s_add_u32 s0, s2, 24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-NEXT:    s_addc_u32 s1, s3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v4, s6
+; GFX10-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX10-NEXT:    flat_load_dword v6, v[2:3]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_add_u32 s0, s4, 8
-; GFX10-NEXT:    s_addc_u32 s1, s5, 0
 ; GFX10-NEXT:    s_clause 0x2
 ; GFX10-NEXT:    flat_load_dword v8, v[0:1]
 ; GFX10-NEXT:    flat_load_dword v9, v[4:5]
 ; GFX10-NEXT:    flat_load_dword v10, v[2:3]
+; GFX10-NEXT:    s_add_u32 s0, s4, 8
+; GFX10-NEXT:    s_addc_u32 s1, s5, 0
 ; GFX10-NEXT:    s_add_u32 s2, s4, 16
-; GFX10-NEXT:    s_addc_u32 s3, s5, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    s_addc_u32 s3, s5, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX10-NEXT:    s_add_u32 s0, s4, 24
-; GFX10-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX10-NEXT:    s_addc_u32 s1, s5, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
@@ -328,14 +328,14 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v8, v0
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v9, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_mov_b32_e32 v10, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX9-NEXT:    v_add_f32_e32 v2, 1.0, v8
 ; GFX9-NEXT:    v_add_f32_e32 v3, 1.0, v9
-; GFX9-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX9-NEXT:    v_add_f32_e32 v8, 2.0, v8
 ; GFX9-NEXT:    v_add_f32_e32 v9, 2.0, v9
+; GFX9-NEXT:    v_mov_b32_e32 v10, 1.0
 ; GFX9-NEXT:    v_mov_b32_e32 v11, v10
 ; GFX9-NEXT:    v_mov_b32_e32 v12, v10
 ; GFX9-NEXT:    v_mov_b32_e32 v13, v10
@@ -351,13 +351,13 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre
 ;
 ; GFX10-LABEL: cluster_image_sample:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v8, v0
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v9, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 1.0
-; GFX10-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX10-NEXT:    v_add_f32_e32 v2, 1.0, v8
 ; GFX10-NEXT:    v_add_f32_e32 v3, 1.0, v9
+; GFX10-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX10-NEXT:    v_add_f32_e32 v8, 2.0, v8

diff  --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index 16831972cdfbf..4b7b604d03afe 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -258,12 +258,12 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0
 ; SI-NEXT:    s_mov_b32 s0, s6
 ; SI-NEXT:    s_mov_b32 s1, s7
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; SI-NEXT:    v_and_b32_e32 v4, s12, v1
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
 ; SI-NEXT:    v_and_b32_e32 v2, s12, v0
+; SI-NEXT:    v_and_b32_e32 v4, s12, v1
 ; SI-NEXT:    v_and_b32_e32 v3, s13, v3
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-NEXT:    v_and_b32_e32 v1, s13, v1
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x900, v2
@@ -303,9 +303,9 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0
 ; VI-NEXT:    v_add_u16_e32 v1, 9, v1
 ; VI-NEXT:    v_add_u16_e32 v3, 9, v0
 ; VI-NEXT:    v_and_b32_e32 v1, s13, v1
-; VI-NEXT:    v_or_b32_e32 v1, v4, v1
 ; VI-NEXT:    v_and_b32_e32 v2, s12, v0
 ; VI-NEXT:    v_and_b32_e32 v3, s13, v3
+; VI-NEXT:    v_or_b32_e32 v1, v4, v1
 ; VI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; VI-NEXT:    v_add_u16_e32 v1, s14, v1
 ; VI-NEXT:    v_add_u16_e32 v2, s14, v2
@@ -347,12 +347,13 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
+; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; SI-NEXT:    v_and_b32_e32 v4, s16, v1
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
 ; SI-NEXT:    v_and_b32_e32 v2, s16, v0
+; SI-NEXT:    v_and_b32_e32 v4, s16, v1
 ; SI-NEXT:    v_and_b32_e32 v3, s17, v3
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-NEXT:    v_and_b32_e32 v1, s17, v1
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x900, v2
@@ -361,7 +362,6 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x9000000, v1
-; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -389,20 +389,20 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; VI-NEXT:    v_and_b32_e32 v4, s16, v1
 ; VI-NEXT:    v_add_u16_e32 v1, 9, v1
 ; VI-NEXT:    v_add_u16_e32 v3, 9, v0
 ; VI-NEXT:    v_and_b32_e32 v1, s17, v1
-; VI-NEXT:    v_or_b32_e32 v1, v4, v1
 ; VI-NEXT:    v_and_b32_e32 v2, s16, v0
 ; VI-NEXT:    v_and_b32_e32 v3, s17, v3
+; VI-NEXT:    v_or_b32_e32 v1, v4, v1
 ; VI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; VI-NEXT:    v_add_u16_e32 v1, s18, v1
 ; VI-NEXT:    v_add_u16_e32 v2, s18, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-NEXT:    buffer_store_dword v1, off, s[12:15], 0
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 4db6ec143e812..5b3b04f2fa9f9 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -31,13 +31,13 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_writelane_b32 v40, s33, 2
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v40, s30, 0
+; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, func_v2f32@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, func_v2f32@rel32@hi+12
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 0
 ; GCN-NEXT:    v_readlane_b32 s5, v40, 1
@@ -65,13 +65,13 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_writelane_b32 v40, s33, 2
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v40, s30, 0
+; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, func_v3f32@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, func_v3f32@rel32@hi+12
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 0
 ; GCN-NEXT:    v_readlane_b32 s5, v40, 1
@@ -99,13 +99,13 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_writelane_b32 v40, s33, 2
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v40, s30, 0
+; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, func_v4f16@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, func_v4f16@rel32@hi+12
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 0
 ; GCN-NEXT:    v_readlane_b32 s5, v40, 1
@@ -133,13 +133,13 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_writelane_b32 v40, s33, 2
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v40, s30, 0
+; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, func_struct@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, func_struct@rel32@hi+12
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index bfa1b9502aaf9..2767f0bdc0172 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -409,14 +409,14 @@ define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v2, s0, v0
-; SI-NEXT:    v_and_b32_e32 v3, s0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_and_b32_e32 v3, s0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v3, v3, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v2, v0
@@ -431,8 +431,8 @@ define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    s_mov_b32 s0, 0xffff
@@ -444,8 +444,8 @@ define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <
 ; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v3
@@ -534,24 +534,24 @@ define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v4, s0, v0
-; SI-NEXT:    v_and_b32_e32 v5, s0, v1
-; SI-NEXT:    v_and_b32_e32 v6, s0, v2
-; SI-NEXT:    v_and_b32_e32 v7, s0, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_and_b32_e32 v5, s0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_and_b32_e32 v6, s0, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_and_b32_e32 v7, s0, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v3, v3, 0
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v7, v7, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v6, v6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v5, v5, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v4, v4, 0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v3, v7, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v6, v2
@@ -568,8 +568,8 @@ define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    s_mov_b32 s0, 0xffff
@@ -587,12 +587,12 @@ define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <
 ; VI-NEXT:    v_bcnt_u32_b32 v6, v6, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v7, v7, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
-; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
-; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
+; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; VI-NEXT:    v_or_b32_e32 v3, v3, v4
 ; VI-NEXT:    v_or_b32_e32 v2, v2, v5
@@ -718,19 +718,19 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
 ; SI-NEXT:    v_and_b32_e32 v12, s0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v8, s0, v4
-; SI-NEXT:    v_and_b32_e32 v9, s0, v5
-; SI-NEXT:    v_and_b32_e32 v10, s0, v6
-; SI-NEXT:    v_and_b32_e32 v11, s0, v7
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_and_b32_e32 v9, s0, v5
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_and_b32_e32 v10, s0, v6
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_and_b32_e32 v11, s0, v7
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_and_b32_e32 v13, s0, v1
-; SI-NEXT:    v_and_b32_e32 v14, s0, v2
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_and_b32_e32 v13, s0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_and_b32_e32 v14, s0, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v7, v7, 0
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v6, v6, 0
@@ -741,20 +741,20 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v11, v11, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v10, v10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v9, v9, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v8, v8, 0
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v15, v15, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v14, v14, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v13, v13, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v12, v12, 0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v7, v11, v7
 ; SI-NEXT:    v_or_b32_e32 v6, v10, v6
@@ -777,8 +777,8 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
@@ -790,15 +790,15 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
 ; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
-; VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
 ; VI-NEXT:    v_and_b32_e32 v3, v8, v3
 ; VI-NEXT:    v_and_b32_e32 v2, v8, v2
 ; VI-NEXT:    v_and_b32_e32 v1, v8, v1
 ; VI-NEXT:    v_and_b32_e32 v0, v8, v0
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v7
+; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
 ; VI-NEXT:    v_bcnt_u32_b32 v9, v9, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v10, v10, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v11, v11, 0
@@ -807,25 +807,25 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
 ; VI-NEXT:    v_and_b32_e32 v6, s0, v6
 ; VI-NEXT:    v_and_b32_e32 v5, s0, v5
 ; VI-NEXT:    v_and_b32_e32 v4, s0, v4
+; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
+; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
+; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
+; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v8, v8, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v13, v13, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v14, v14, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v15, v15, 0
-; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
 ; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
 ; VI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
 ; VI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
 ; VI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
 ; VI-NEXT:    v_bcnt_u32_b32 v7, v7, 0
-; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; VI-NEXT:    v_bcnt_u32_b32 v6, v6, 0
-; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; VI-NEXT:    v_bcnt_u32_b32 v5, v5, 0
-; VI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; VI-NEXT:    v_bcnt_u32_b32 v4, v4, 0
+; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; VI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
 ; VI-NEXT:    v_or_b32_e32 v3, v3, v9
 ; VI-NEXT:    v_or_b32_e32 v2, v2, v10
@@ -1186,8 +1186,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(i16 addrspace(1)* noalias %ou
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ushort v0, v[0:1]
 ; VI-NEXT:    s_movk_i32 s0, 0x3e7

diff  --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 710596f117ba4..6bf77bc93b1c5 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -1063,9 +1063,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(i64 addrspace(1)* n
 ; GFX9-GISEL-NEXT:    v_or_b32_sdwa v4, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v3, v3, 0, 16
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v4, v4, 0, 16
-; GFX9-GISEL-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v2, v2, 0, 16
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v2, v2, 16, v0
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v4, v3
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index af389deee2f6e..5b2b8b0a7bffe 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -87,9 +87,9 @@ define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
 ; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -98,9 +98,9 @@ define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
 ; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -933,8 +933,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, s0, v4
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 9, v5
-; SI-NEXT:    v_or_b32_e32 v0, v7, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
+; SI-NEXT:    v_or_b32_e32 v0, v7, v0
 ; SI-NEXT:    v_and_b32_e32 v2, s0, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x900, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -955,8 +955,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; VI-NEXT:    s_mov_b32 s10, -1
 ; VI-NEXT:    v_mov_b32_e32 v5, 9
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    s_mov_b32 s6, s10
@@ -968,10 +968,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    v_and_b32_e32 v7, 0xffffff00, v4
 ; VI-NEXT:    v_add_u16_e32 v8, 9, v4
 ; VI-NEXT:    v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_nop 0
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v6
 ; VI-NEXT:    v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -1056,10 +1057,10 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
 ; SI-NEXT:    v_cvt_f32_ubyte2_e32 v5, v5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v8
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
 ; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:24
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
 ; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v2
 ; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v2
 ; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
@@ -1104,11 +1105,11 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v11
-; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v2
+; VI-NEXT:    v_or_b32_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v2
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v2
 ; VI-NEXT:    buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
index 8c2a77bb6c98e..47f66c7bb74c9 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@@ -26,13 +26,11 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_buffer_load_dwordx4 s[0:3], s[0:3], 0x40
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_sub_f32_e64 v5, s24, s28
 ; GCN-NEXT:    s_clause 0x1
 ; GCN-NEXT:    s_buffer_load_dwordx4 s[4:7], s[0:3], 0x50
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_buffer_load_dword s0, s[0:3], 0x2c
-; GCN-NEXT:    v_fma_f32 v1, v1, v5, s28
-; GCN-NEXT:    v_add_f32_e64 v5, s29, -1.0
+; GCN-NEXT:    v_sub_f32_e64 v5, s24, s28
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_clause 0x4
 ; GCN-NEXT:    s_buffer_load_dwordx4 s[8:11], s[0:3], 0x60
@@ -40,12 +38,14 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
 ; GCN-NEXT:    s_buffer_load_dwordx4 s[16:19], s[0:3], 0x0
 ; GCN-NEXT:    s_buffer_load_dwordx4 s[20:23], s[0:3], 0x70
 ; GCN-NEXT:    s_buffer_load_dwordx4 s[24:27], s[0:3], 0x10
+; GCN-NEXT:    v_fma_f32 v1, v1, v5, s28
 ; GCN-NEXT:    v_max_f32_e64 v6, s0, s0 clamp
+; GCN-NEXT:    v_add_f32_e64 v5, s29, -1.0
 ; GCN-NEXT:    v_sub_f32_e32 v8, s0, v1
-; GCN-NEXT:    s_mov_b32 s0, 0x3c23d70a
 ; GCN-NEXT:    v_fma_f32 v7, -s2, v6, s6
-; GCN-NEXT:    v_fmac_f32_e32 v1, v6, v8
 ; GCN-NEXT:    v_fma_f32 v5, v6, v5, 1.0
+; GCN-NEXT:    s_mov_b32 s0, 0x3c23d70a
+; GCN-NEXT:    v_fmac_f32_e32 v1, v6, v8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v9, s10, v0
 ; GCN-NEXT:    v_fma_f32 v0, -v0, s10, s14

diff  --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index fb81d48470286..189715bfbb6cf 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -373,9 +373,10 @@ define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> add
 ; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
+; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_sdwa v3, s3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
 ; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v4, v8, v1
-; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_sdwa v3, s3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v1, v9, v1
 ; ALIGNED-GISEL-NEXT:    v_and_or_b32 v3, v6, s2, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
index 0d186f0cc03b3..1205b52fff1c3 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -48,11 +48,11 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; CI-NEXT:    s_mov_b64 vcc, 0
-; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s0
 ; CI-NEXT:    s_mov_b32 s0, 0
 ; CI-NEXT:    v_div_fmas_f32 v1, v1, v1, v1
+; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
@@ -83,8 +83,8 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
+; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -295,11 +295,11 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fb, v0
 ; CI-NEXT:    s_mov_b64 vcc, 0
-; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s0
 ; CI-NEXT:    s_mov_b32 s0, 0
 ; CI-NEXT:    v_div_fmas_f32 v1, v1, v1, v1
+; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
@@ -332,9 +332,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0x3fb, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 6e9236946820c..c8310032c012c 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -548,10 +548,10 @@ define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float a
 ; CI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; CI-NEXT:    s_waitcnt lgkmcnt(2)
 ; CI-NEXT:    v_or_b32_e32 v6, v6, v7
-; CI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; CI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; CI-NEXT:    s_waitcnt lgkmcnt(1)
 ; CI-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; CI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; CI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_or_b32_e32 v1, v8, v1
 ; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
@@ -642,10 +642,10 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(float addrspace(1)* %out,
 ; CI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; CI-NEXT:    s_waitcnt lgkmcnt(2)
 ; CI-NEXT:    v_or_b32_e32 v6, v6, v7
-; CI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; CI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; CI-NEXT:    s_waitcnt lgkmcnt(1)
 ; CI-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; CI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; CI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_or_b32_e32 v1, v8, v1
 ; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
@@ -1126,8 +1126,8 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v5
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v6
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v7

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 7fceb602a0ba9..2f7108d121acd 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -183,9 +183,9 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspa
 ; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; CI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; CI-NEXT:    v_mov_b32_e32 v2, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc
@@ -230,9 +230,9 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)*
 ; CI-LABEL: simple_write2_two_val_subreg2_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; CI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; CI-NEXT:    v_mov_b32_e32 v2, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64
@@ -269,9 +269,9 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)*
 ; CI-LABEL: simple_write2_two_val_subreg4_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; CI-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
 ; CI-NEXT:    v_mov_b32_e32 v2, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64
@@ -653,8 +653,8 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double add
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; CI-NEXT:    ds_write_b8 v0, v1 offset:5
+; CI-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; CI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; CI-NEXT:    ds_write_b8 v0, v2 offset:13
@@ -682,9 +682,9 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double add
 ; GFX9-ALIGNED-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-ALIGNED-NEXT:    v_add_u32_e32 v2, s4, v2
 ; GFX9-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-ALIGNED-NEXT:    ds_write_b8_d16_hi v2, v0 offset:7
 ; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:5
+; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
 ; GFX9-ALIGNED-NEXT:    ds_write_b8_d16_hi v2, v1 offset:15
 ; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v1 offset:13
@@ -898,8 +898,8 @@ define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %ld
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 2
 ; GFX9-NEXT:    s_add_i32 s3, s2, 0xc20
-; GFX9-NEXT:    s_addk_i32 s2, 0xc60
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    s_addk_i32 s2, 0xc60
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -955,13 +955,13 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrs
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s0
 ; CI-NEXT:    v_mov_b32_e32 v2, s1
-; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
 ; CI-NEXT:    v_mov_b32_e32 v3, s2
+; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
 ; CI-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index 94b4c75d68339..c0291a1599188 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -14,8 +14,8 @@ define i32 @s_add_co_select_user() {
 ; GFX7-NEXT:    s_or_b32 s4, s4, s5
 ; GFX7-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX7-NEXT:    s_addc_u32 s4, s6, 0
-; GFX7-NEXT:    s_cselect_b64 vcc, 1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_cselect_b64 vcc, 1, 0
 ; GFX7-NEXT:    s_cmp_gt_u32 s6, 31
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX7-NEXT:    s_cselect_b64 vcc, -1, 0
@@ -31,8 +31,8 @@ define i32 @s_add_co_select_user() {
 ; GFX9-NEXT:    v_add_co_u32_e64 v0, s[4:5], s6, s6
 ; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9-NEXT:    s_addc_u32 s4, s6, 0
-; GFX9-NEXT:    s_cselect_b64 vcc, 1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    s_cselect_b64 vcc, 1, 0
 ; GFX9-NEXT:    s_cmp_gt_u32 s6, 31
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index 3ff4a3a147c07..18bf60d2281bc 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -69,8 +69,8 @@ define amdgpu_kernel void @global_store_2xi16_align2(i16 addrspace(1)* %p, i16 a
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-ALIGNED-NEXT:    s_add_u32 s2, s0, 2
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-ALIGNED-NEXT:    flat_store_short v[0:1], v2
 ; GFX7-ALIGNED-NEXT:    s_addc_u32 s3, s1, 0
+; GFX7-ALIGNED-NEXT:    flat_store_short v[0:1], v2
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s3
@@ -85,8 +85,8 @@ define amdgpu_kernel void @global_store_2xi16_align2(i16 addrspace(1)* %p, i16 a
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-UNALIGNED-NEXT:    s_add_u32 s2, s0, 2
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-UNALIGNED-NEXT:    flat_store_short v[0:1], v2
 ; GFX7-UNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
+; GFX7-UNALIGNED-NEXT:    flat_store_short v[0:1], v2
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s3
@@ -192,15 +192,15 @@ define amdgpu_kernel void @global_store_2xi16_align1(i16 addrspace(1)* %p, i16 a
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_add_u32 s2, s0, 2
-; GFX7-ALIGNED-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-ALIGNED-NEXT:    s_add_u32 s4, s0, 1
+; GFX7-ALIGNED-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-ALIGNED-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-ALIGNED-NEXT:    s_add_u32 s4, s0, 1
 ; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v2
+; GFX7-ALIGNED-NEXT:    s_addc_u32 s5, s1, 0
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-ALIGNED-NEXT:    s_add_u32 s0, s0, 3
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-ALIGNED-NEXT:    s_add_u32 s0, s0, 3
 ; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v3
 ; GFX7-ALIGNED-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0

diff  --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
index ff7eb92fca067..3a90bff5b29a1 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -237,8 +237,8 @@ define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)*
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 1
-; GFX7-ALIGNED-NEXT:    buffer_store_byte v3, v1, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v1
+; GFX7-ALIGNED-NEXT:    buffer_store_byte v3, v1, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 3, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index a62bbac8ff870..ca8cf1c5ae12a 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -723,9 +723,9 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
 ; GFX9-NEXT: s_setpc_b64
 
 ; VI: s_waitcnt
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
 ; VI-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
 ; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
 ; VI-NEXT: v_or_b32_e32 v1, v1, v2
 ; VI-NEXT: s_setpc_b64

diff  --git a/llvm/test/CodeGen/AMDGPU/fexp.ll b/llvm/test/CodeGen/AMDGPU/fexp.ll
index ba595f65bc572..7eaac9421f566 100644
--- a/llvm/test/CodeGen/AMDGPU/fexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fexp.ll
@@ -163,15 +163,15 @@ define <4 x half> @v_exp_v4f16(<4 x half> %arg0) {
 ; VI-NEXT:    s_movk_i32 s4, 0x3dc5
 ; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:    v_mul_f16_e32 v2, s4, v1
-; VI-NEXT:    v_mul_f16_e32 v4, s4, v0
 ; VI-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_mul_f16_e32 v4, s4, v0
 ; VI-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_exp_f16_e32 v2, v2
-; VI-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_exp_f16_e32 v4, v4
 ; VI-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v4, v0
+; VI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_exp_v4f16:
@@ -179,15 +179,15 @@ define <4 x half> @v_exp_v4f16(<4 x half> %arg0) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_movk_i32 s4, 0x3dc5
 ; GFX9-NEXT:    v_mul_f16_e32 v2, s4, v1
-; GFX9-NEXT:    v_mul_f16_e32 v3, s4, v0
 ; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mul_f16_e32 v3, s4, v0
 ; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_exp_f16_e32 v2, v2
-; GFX9-NEXT:    v_exp_f16_e32 v1, v1
 ; GFX9-NEXT:    v_exp_f16_e32 v3, v3
 ; GFX9-NEXT:    v_exp_f16_e32 v0, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT:    v_exp_f16_e32 v1, v1
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v3, v0
+; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x half> @llvm.exp.v4f16(<4 x half> %arg0)
   ret <4 x half> %result

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
index 4ea187fea27e4..72ab22592fb05 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
@@ -171,14 +171,14 @@ define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) {
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, vcc_lo
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, vcc_lo
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    s_mov_b32 s2, exec_lo

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 7864ebec2e99a..5b607f976f538 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -227,8 +227,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
 ; GFX9-NEXT:    s_and_b32 s0, s0, 15
-; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-NEXT:    s_add_i32 s1, s1, 4
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-NEXT:    scratch_store_dword off, v0, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_add_i32 s0, s0, 4
@@ -269,8 +269,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
 ; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
 ; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
-; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
+; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
@@ -338,8 +338,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    s_and_b32 s0, s2, 15
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 15
+; GFX10-NEXT:    s_and_b32 s0, s2, 15
 ; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX10-NEXT:    s_add_i32 s1, s1, 4
@@ -362,8 +362,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
 ; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
-; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
+; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
@@ -382,8 +382,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
 ; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
@@ -412,8 +412,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 4
-; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-NEXT:    scratch_store_dword v2, v3, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -855,9 +855,9 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
 ; GFX9-NEXT:    s_and_b32 s0, s0, 15
-; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-NEXT:    s_addk_i32 s1, 0x104
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-NEXT:    scratch_store_dword off, v0, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_addk_i32 s0, 0x104
@@ -898,13 +898,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
-; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
+; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x104
+; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
@@ -988,9 +988,9 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
 ; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
-; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
 ; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
 ; GFX9-NEXT:    s_addk_i32 s0, 0x104
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-NEXT:    scratch_store_dword off, v0, s0
@@ -1010,8 +1010,8 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_and_b32 s0, s2, 15
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 15
+; GFX10-NEXT:    s_and_b32 s0, s2, 15
 ; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX10-NEXT:    s_addk_i32 s1, 0x104
@@ -1032,13 +1032,13 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
-; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
+; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x104
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
@@ -1085,8 +1085,8 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x104
@@ -1627,9 +1627,9 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
 ; GFX9-NEXT:    s_and_b32 s0, s0, 15
-; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-NEXT:    s_addk_i32 s1, 0x4004
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-NEXT:    scratch_store_dword off, v0, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_addk_i32 s0, 0x4004
@@ -1670,13 +1670,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
-; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
+; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x4004
+; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
@@ -1760,9 +1760,9 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
 ; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
-; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
 ; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
 ; GFX9-NEXT:    s_addk_i32 s0, 0x4004
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-NEXT:    scratch_store_dword off, v0, s0
@@ -1782,8 +1782,8 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_and_b32 s0, s2, 15
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 15
+; GFX10-NEXT:    s_and_b32 s0, s2, 15
 ; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX10-NEXT:    s_addk_i32 s1, 0x4004
@@ -1804,13 +1804,13 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
-; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
+; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x4004
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
@@ -1857,8 +1857,8 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x4004
@@ -2102,9 +2102,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-NEXT:    s_movk_i32 s0, 0x3000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
+; GFX9-NEXT:    s_movk_i32 s0, 0x3000
 ; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_add_i32 s0, s0, 4
@@ -2216,8 +2216,8 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX9-LABEL: store_load_large_imm_offset_foo:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s0, 0x3000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 13
+; GFX9-NEXT:    s_movk_i32 s0, 0x3000
 ; GFX9-NEXT:    scratch_store_dword off, v0, s32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_add_i32 s0, s0, s32
@@ -2247,8 +2247,8 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
+; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
 ; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s32
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_add_i32 s0, s0, s32
@@ -2330,8 +2330,8 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
 ; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
-; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
+; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
@@ -2722,8 +2722,8 @@ bb:
 define amdgpu_ps void @large_offset() {
 ; GFX9-LABEL: large_offset:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
index 23d0971e2be7d..49667f7008db8 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
@@ -231,9 +231,9 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX9-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v7, v6
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v7, v6
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
 ; GFX9-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v5, v4
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
@@ -260,9 +260,9 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v7, v6
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v7, v6
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
 ; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v5, v4
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
@@ -280,8 +280,8 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; VI-NNAN:       ; %bb.0:
 ; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NNAN-NEXT:    v_max_f16_e32 v1, v1, v3
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NNAN-NEXT:    v_max_f16_e32 v1, v1, v3
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v2
 ; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v5
 ; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v4
@@ -347,17 +347,17 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
-; GFX9-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v15, v14
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
-; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v14, v14, v15, vcc
-; GFX9-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v13, v12
+; GFX9-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v15, v14
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
-; GFX9-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v11, v10
+; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v14, v14, v15, vcc
+; GFX9-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v13, v12
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
+; GFX9-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v11, v10
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
 ; GFX9-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v9, v8
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
@@ -394,17 +394,17 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
-; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v15, v14
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v14, v14, v15, vcc
-; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v13, v12
+; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v15, v14
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
-; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v11, v10
+; VI-SAFE-NEXT:    v_cndmask_b32_e32 v14, v14, v15, vcc
+; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v13, v12
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; VI-SAFE-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
+; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v11, v10
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
 ; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v9, v8
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
@@ -430,12 +430,12 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; VI-NNAN:       ; %bb.0:
 ; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NNAN-NEXT:    v_max_f16_e32 v3, v3, v7
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NNAN-NEXT:    v_max_f16_e32 v2, v2, v6
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NNAN-NEXT:    v_max_f16_e32 v1, v1, v5
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NNAN-NEXT:    v_max_f16_e32 v3, v3, v7
+; VI-NNAN-NEXT:    v_max_f16_e32 v2, v2, v6
+; VI-NNAN-NEXT:    v_max_f16_e32 v1, v1, v5
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v4
 ; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v11
 ; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v10

diff  --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
index 22773ac06c122..a0096b01c659d 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
@@ -232,9 +232,9 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX9-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v7, v6
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v7, v6
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
 ; GFX9-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v5, v4
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
@@ -261,9 +261,9 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v7, v6
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v7, v6
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
 ; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v5, v4
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
@@ -281,8 +281,8 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; VI-NNAN:       ; %bb.0:
 ; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NNAN-NEXT:    v_min_f16_e32 v1, v1, v3
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NNAN-NEXT:    v_min_f16_e32 v1, v1, v3
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v2
 ; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v5
 ; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v4
@@ -348,17 +348,17 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
-; GFX9-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v15, v14
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
-; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v14, v14, v15, vcc
-; GFX9-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v13, v12
+; GFX9-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v15, v14
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
-; GFX9-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v11, v10
+; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v14, v14, v15, vcc
+; GFX9-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v13, v12
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
+; GFX9-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v11, v10
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
 ; GFX9-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v9, v8
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
@@ -395,17 +395,17 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
-; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v15, v14
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v14, v14, v15, vcc
-; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v13, v12
+; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v15, v14
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
-; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v11, v10
+; VI-SAFE-NEXT:    v_cndmask_b32_e32 v14, v14, v15, vcc
+; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v13, v12
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; VI-SAFE-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
+; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v11, v10
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
 ; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v9, v8
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
@@ -431,12 +431,12 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; VI-NNAN:       ; %bb.0:
 ; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NNAN-NEXT:    v_min_f16_e32 v3, v3, v7
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NNAN-NEXT:    v_min_f16_e32 v2, v2, v6
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NNAN-NEXT:    v_min_f16_e32 v1, v1, v5
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NNAN-NEXT:    v_min_f16_e32 v3, v3, v7
+; VI-NNAN-NEXT:    v_min_f16_e32 v2, v2, v6
+; VI-NNAN-NEXT:    v_min_f16_e32 v1, v1, v5
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v4
 ; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v11
 ; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v10

diff  --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 90593dc965164..76099a7d2c4d4 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -131,8 +131,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(<2 x half> addrspace(1)* %o
 
 ; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_v2f16:
 ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
-; GFX9: v_mov_b32_e32 [[V_ABS:v[0-9]+]], [[ABS]]
 ; GFX9: s_xor_b32 [[NEG:s[0-9]+]], [[ABS]], 0x80008000
+; GFX9: v_mov_b32_e32 [[V_ABS:v[0-9]+]], [[ABS]]
 ; GFX9-DAG: v_mov_b32_e32 [[V_NEG:v[0-9]+]], [[NEG]]
 ; GFX9-DAG: global_store_dword v{{[0-9]+}}, [[V_ABS]], s{{\[[0-9]+:[0-9]+\]}}
 ; GFX9: global_store_dword v{{[0-9]+}}, [[V_NEG]], s{{\[[0-9]+:[0-9]+\]}}

diff  --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll
index e1aa9b0097483..669a44c830d16 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll
@@ -382,15 +382,15 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre
 ;
 ; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    s_nop 0
-; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x3c
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x3c
 ; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ds_write_b32 v1, v0
@@ -906,17 +906,16 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
 ;
 ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    s_nop 0
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
 ; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -967,17 +966,16 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
 ;
 ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_clause 0x2
+; G_GFX10-NEXT:    s_clause 0x1
 ; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; G_GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; G_GFX10-NEXT:    s_nop 0
-; G_GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
 ; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
 ; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; G_GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; G_GFX10-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index a7eaaf83f23a3..1b028c48e8043 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -54,8 +54,8 @@ define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %r
 ; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s10
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -104,8 +104,8 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inre
 ; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s10
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v3, s[4:7], 4 offen glc slc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -154,8 +154,8 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> i
 ; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s10
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -204,8 +204,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inre
 ; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s10
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v3, s[4:7], 4 offen glc slc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -254,8 +254,8 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> i
 ; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s10
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -304,8 +304,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
 ; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s10
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v3, s[4:7], 4 offen glc slc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -354,8 +354,8 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> i
 ; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s10
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]

diff  --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index e39653e6a357b..dad35934c962b 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -338,10 +338,10 @@ define amdgpu_kernel void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x f
 ; VI-NEXT:    v_floor_f32_e32 v1, v1
 ; VI-NEXT:    v_fma_f32 v2, v1, s3, |v0|
 ; VI-NEXT:    v_trunc_f32_e32 v4, s0
-; VI-NEXT:    v_mul_f32_e64 v3, |v4|, s2
 ; VI-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; VI-NEXT:    v_floor_f32_e32 v3, v3
+; VI-NEXT:    v_mul_f32_e64 v3, |v4|, s2
 ; VI-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; VI-NEXT:    v_floor_f32_e32 v3, v3
 ; VI-NEXT:    v_cvt_u32_f32_e32 v5, v3
 ; VI-NEXT:    v_fma_f32 v3, v3, s3, |v4|
 ; VI-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
@@ -517,10 +517,10 @@ define amdgpu_kernel void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x f
 ; VI-NEXT:    v_mul_f32_e64 v1, |v0|, s8
 ; VI-NEXT:    v_floor_f32_e32 v1, v1
 ; VI-NEXT:    v_fma_f32 v2, v1, s9, |v0|
-; VI-NEXT:    v_trunc_f32_e32 v4, s0
 ; VI-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; VI-NEXT:    v_mul_f32_e64 v3, |v4|, s8
+; VI-NEXT:    v_trunc_f32_e32 v4, s0
 ; VI-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; VI-NEXT:    v_mul_f32_e64 v3, |v4|, s8
 ; VI-NEXT:    v_floor_f32_e32 v3, v3
 ; VI-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
 ; VI-NEXT:    v_cvt_u32_f32_e32 v5, v3
@@ -549,8 +549,8 @@ define amdgpu_kernel void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x f
 ; VI-NEXT:    v_cvt_u32_f32_e32 v9, v6
 ; VI-NEXT:    v_fma_f32 v6, v6, s9, |v8|
 ; VI-NEXT:    v_cvt_u32_f32_e32 v10, v6
-; VI-NEXT:    v_sub_u32_e32 v6, vcc, v5, v4
 ; VI-NEXT:    v_xor_b32_e32 v7, v7, v4
+; VI-NEXT:    v_sub_u32_e32 v6, vcc, v5, v4
 ; VI-NEXT:    v_ashrrev_i32_e32 v5, 31, v8
 ; VI-NEXT:    v_subb_u32_e32 v7, vcc, v7, v4, vcc
 ; VI-NEXT:    v_xor_b32_e32 v4, v10, v5

diff  --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index 56aa155648088..693c456f37a26 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -56,8 +56,8 @@ entry:
 ; GCN: buffer_load_dword
 ; SI-DAG: v_lshrrev_b32_e32
 ; SI-DAG: v_cvt_f32_f16_e32
-; GFX89: v_cvt_f32_f16_sdwa
 ; GCN: v_cvt_f32_f16_e32
+; GFX89: v_cvt_f32_f16_sdwa
 
 ; GCN: v_cvt_f64_f32_e32
 ; GCN: v_cvt_f64_f32_e32

diff  --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll
index 8c9d2c09bf582..7568ddda1a59b 100644
--- a/llvm/test/CodeGen/AMDGPU/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpow.ll
@@ -171,10 +171,10 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX8-NEXT:    v_exp_f32_e32 v2, v2
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -189,10 +189,10 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_exp_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -253,10 +253,10 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX8-NEXT:    v_exp_f32_e32 v2, v2
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -271,10 +271,10 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_exp_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -282,19 +282,19 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-;GFX10-NEXT:     v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-;GFX10-NEXT:     v_cvt_f32_f16_e64 v0, -v0
-;GFX10-NEXT:     v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-;GFX10-NEXT:     v_cvt_f32_f16_e32 v1, v1
-;GFX10-NEXT:     v_log_f32_e32 v2, v2
-;GFX10-NEXT:     v_log_f32_e32 v0, v0
-;GFX10-NEXT:     v_mul_legacy_f32_e32 v2, v3, v2
-;GFX10-NEXT:     v_mul_legacy_f32_e32 v0, v1, v0
-;GFX10-NEXT:     v_exp_f32_e32 v1, v2
-;GFX10-NEXT:     v_exp_f32_e32 v0, v0
-;GFX10-NEXT:     v_cvt_f16_f32_e32 v1, v1
-;GFX10-NEXT:     v_cvt_f16_f32_e32 v0, v0
-;GFX10-NEXT:     v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX10-NEXT:    v_log_f32_e32 v2, v2
+; GFX10-NEXT:    v_log_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
+; GFX10-NEXT:    v_exp_f32_e32 v1, v2
+; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %x.fneg = fneg <2 x half> %x
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y)
@@ -336,10 +336,10 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX8-NEXT:    v_exp_f32_e32 v2, v2
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -354,10 +354,10 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_exp_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -365,19 +365,19 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-;GFX10-NEXT:     v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-;GFX10-NEXT:     v_cvt_f32_f16_e32 v0, v0
-;GFX10-NEXT:     v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-;GFX10-NEXT:     v_cvt_f32_f16_e64 v1, -v1
-;GFX10-NEXT:     v_log_f32_e32 v2, v2
-;GFX10-NEXT:     v_log_f32_e32 v0, v0
-;GFX10-NEXT:     v_mul_legacy_f32_e32 v2, v3, v2
-;GFX10-NEXT:     v_mul_legacy_f32_e32 v0, v1, v0
-;GFX10-NEXT:     v_exp_f32_e32 v1, v2
-;GFX10-NEXT:     v_exp_f32_e32 v0, v0
-;GFX10-NEXT:     v_cvt_f16_f32_e32 v1, v1
-;GFX10-NEXT:     v_cvt_f16_f32_e32 v0, v0
-;GFX10-NEXT:     v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v1, -v1
+; GFX10-NEXT:    v_log_f32_e32 v2, v2
+; GFX10-NEXT:    v_log_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
+; GFX10-NEXT:    v_exp_f32_e32 v1, v2
+; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %y.fneg = fneg <2 x half> %y
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg)
@@ -397,9 +397,9 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    s_mov_b32 s4, 0x80008000
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_xor_b32_e32 v2, s4, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
@@ -424,10 +424,10 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX8-NEXT:    v_exp_f32_e32 v2, v2
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -442,10 +442,10 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_exp_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index 570f97f354989..cbf42e9fb80cc 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -114,12 +114,13 @@ entry:
 ; SI: v_ashrrev_i32_e32 v[[R_I64_0_High:[0-9]+]], 31, v[[R_I64_0_Low]]
 ; SI: v_cvt_i32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]]
 ; SI: v_ashrrev_i32_e32 v[[R_I64_1_High:[0-9]+]], 31, v[[R_I64_1_Low]]
-; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
-; VI: v_cvt_i32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]]
-; VI: v_cvt_i32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]]
-; VI: v_ashrrev_i32_e32 v[[R_I64_1_High:[0-9]+]], 31, v[[R_I64_1_Low]]
-; VI: v_ashrrev_i32_e32 v[[R_I64_0_High:[0-9]+]], 31, v[[R_I64_0_Low]]
+; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
+; VI-DAG: v_cvt_i32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]]
+; VI-DAG: v_cvt_i32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]]
+; VI-NOT: DEADBEEF
+; VI-DAG: v_ashrrev_i32_e32 v[[R_I64_1_High:[0-9]+]], 31, v[[R_I64_1_Low]]
+; VI-DAG: v_ashrrev_i32_e32 v[[R_I64_0_High:[0-9]+]], 31, v[[R_I64_0_Low]]
 ; GCN: buffer_store_dwordx4 v{{\[}}[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]{{\]}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @fptosi_v2f16_to_v2i64(

diff  --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index 0792de1406b59..34c426402b980 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -112,10 +112,10 @@ entry:
 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI: v_cvt_u32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]]
 ; SI: v_cvt_u32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]]
-; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
-; VI: v_cvt_u32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]]
+; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI: v_cvt_u32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]]
+; VI: v_cvt_u32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]]
 ; GCN: v_mov_b32_e32 v[[R_I64_0_High:[0-9]+]], 0
 ; GCN: buffer_store_dwordx4 v{{\[}}[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]{{\]}}
 ; GCN: s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 469cfe96fb6d2..e17ac33a8bb7c 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -63,13 +63,12 @@ entry:
 ; GCN: buffer_load_dwordx4 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_3:[0-9]+]]{{\]}}
 ; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}}
 ; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}}
+; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
 ;
 ; SI-DAG: v_cvt_f16_f32_e32 v[[CVTHI:[0-9]+]], v[[A_F32_1]]
 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[CVTHI]]
 
-; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-
 ; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 
 ; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]

diff  --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index 0302ffd81600c..092e25237dff7 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -14,16 +14,16 @@ define void @callee_with_stack_and_call() #0 {
 ; SPILL-TO-VGPR-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
 ; SPILL-TO-VGPR-NEXT:    v_writelane_b32 v40, s33, 2
-; SPILL-TO-VGPR-NEXT:    v_writelane_b32 v40, s30, 0
 ; SPILL-TO-VGPR-NEXT:    s_mov_b32 s33, s32
 ; SPILL-TO-VGPR-NEXT:    s_addk_i32 s32, 0x400
+; SPILL-TO-VGPR-NEXT:    v_writelane_b32 v40, s30, 0
 ; SPILL-TO-VGPR-NEXT:    v_mov_b32_e32 v0, 0
-; SPILL-TO-VGPR-NEXT:    s_getpc_b64 s[4:5]
-; SPILL-TO-VGPR-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; SPILL-TO-VGPR-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
 ; SPILL-TO-VGPR-NEXT:    v_writelane_b32 v40, s31, 1
 ; SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
+; SPILL-TO-VGPR-NEXT:    s_getpc_b64 s[4:5]
+; SPILL-TO-VGPR-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; SPILL-TO-VGPR-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
 ; SPILL-TO-VGPR-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s4, v40, 0
 ; SPILL-TO-VGPR-NEXT:    v_readlane_b32 s5, v40, 1
@@ -52,11 +52,11 @@ define void @callee_with_stack_and_call() #0 {
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[6:7]
 ; NO-SPILL-TO-VGPR-NEXT:    v_mov_b32_e32 v0, 0
+; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_getpc_b64 s[4:5]
 ; NO-SPILL-TO-VGPR-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; NO-SPILL-TO-VGPR-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33
-; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[6:7], exec
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 3

diff  --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 1c460656073ba..d16758d780914 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -59,9 +59,9 @@ define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)*
 ; CI-NEXT:    s_mov_b32 s9, s5
 ; CI-NEXT:    s_mov_b32 s4, s6
 ; CI-NEXT:    s_mov_b32 s5, s7
-; CI-NEXT:    s_mov_b32 s3, s11
 ; CI-NEXT:    s_mov_b32 s6, s10
 ; CI-NEXT:    s_mov_b32 s7, s11
+; CI-NEXT:    s_mov_b32 s3, s11
 ; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
 ; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
 ; CI-NEXT:    s_waitcnt vmcnt(1)
@@ -1393,9 +1393,9 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
 ; CI-NEXT:    s_mov_b32 s1, s5
 ; CI-NEXT:    s_mov_b32 s4, s6
 ; CI-NEXT:    s_mov_b32 s5, s7
-; CI-NEXT:    s_mov_b32 s11, s3
 ; CI-NEXT:    s_mov_b32 s6, s2
 ; CI-NEXT:    s_mov_b32 s7, s3
+; CI-NEXT:    s_mov_b32 s11, s3
 ; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; CI-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:16
 ; CI-NEXT:    s_mov_b32 s6, 3

diff  --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 19d259144daf2..c8029d9735ac1 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -171,8 +171,8 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
 ; SI-NEXT:    s_not_b32 s1, s1
 ; SI-NEXT:    v_alignbit_b32 v0, s3, v0, 1
-; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    s_lshr_b32 s3, s3, 1
+; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    s_not_b32 s0, s0
@@ -192,9 +192,9 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s7
 ; VI-NEXT:    s_not_b32 s1, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_lshr_b32 s7, s5, 1
 ; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_alignbit_b32 v1, s7, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    s_not_b32 s0, s0
@@ -218,8 +218,8 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
 ; GFX9-NEXT:    s_lshr_b32 s0, s5, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX9-NEXT:    s_not_b32 s1, s9
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_alignbit_b32 v1, s0, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    s_not_b32 s1, s8
@@ -363,20 +363,20 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
 ; SI-NEXT:    v_mov_b32_e32 v0, s15
 ; SI-NEXT:    s_not_b32 s3, s3
 ; SI-NEXT:    v_alignbit_b32 v0, s11, v0, 1
-; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    s_lshr_b32 s11, s11, 1
+; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    v_alignbit_b32 v3, s11, v0, v1
 ; SI-NEXT:    v_mov_b32_e32 v0, s14
 ; SI-NEXT:    s_not_b32 s2, s2
-; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_alignbit_b32 v0, s10, v0, 1
 ; SI-NEXT:    s_lshr_b32 s3, s10, 1
+; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_alignbit_b32 v2, s3, v0, v1
 ; SI-NEXT:    v_mov_b32_e32 v0, s13
 ; SI-NEXT:    s_not_b32 s1, s1
-; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    v_alignbit_b32 v0, s9, v0, 1
 ; SI-NEXT:    s_lshr_b32 s2, s9, 1
+; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
 ; SI-NEXT:    v_mov_b32_e32 v0, s12
 ; SI-NEXT:    s_not_b32 s0, s0
@@ -396,21 +396,21 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s11
 ; VI-NEXT:    s_not_b32 s3, s3
-; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    s_lshr_b32 s11, s7, 1
 ; VI-NEXT:    v_alignbit_b32 v0, s7, v0, 1
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_alignbit_b32 v3, s11, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, s10
 ; VI-NEXT:    s_not_b32 s2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_alignbit_b32 v0, s6, v0, 1
 ; VI-NEXT:    s_lshr_b32 s3, s6, 1
+; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_alignbit_b32 v2, s3, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, s9
 ; VI-NEXT:    s_not_b32 s1, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
 ; VI-NEXT:    s_lshr_b32 s2, s5, 1
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, s8
 ; VI-NEXT:    s_not_b32 s0, s0
@@ -434,20 +434,20 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
 ; GFX9-NEXT:    s_lshr_b32 s0, s7, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s11
 ; GFX9-NEXT:    s_not_b32 s1, s15
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_alignbit_b32 v0, s7, v0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_alignbit_b32 v3, s0, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX9-NEXT:    s_not_b32 s1, s14
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_alignbit_b32 v0, s6, v0, 1
 ; GFX9-NEXT:    s_lshr_b32 s0, s6, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s9
 ; GFX9-NEXT:    s_not_b32 s1, s13
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
 ; GFX9-NEXT:    s_lshr_b32 s0, s5, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_alignbit_b32 v1, s0, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    s_not_b32 s1, s12
@@ -494,10 +494,10 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_lshr_b32 s0, s7, 1
-; GFX10-NEXT:    v_alignbit_b32 v5, s5, s9, 1
-; GFX10-NEXT:    v_alignbit_b32 v6, s4, s8, 1
 ; GFX10-NEXT:    v_alignbit_b32 v0, s7, s11, 1
 ; GFX10-NEXT:    v_alignbit_b32 v1, s6, s10, 1
+; GFX10-NEXT:    v_alignbit_b32 v5, s5, s9, 1
+; GFX10-NEXT:    v_alignbit_b32 v6, s4, s8, 1
 ; GFX10-NEXT:    s_not_b32 s1, s15
 ; GFX10-NEXT:    s_lshr_b32 s6, s6, 1
 ; GFX10-NEXT:    s_not_b32 s7, s14

diff  --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 2648fde7b6c96..ec6b03f29f57d 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -640,9 +640,9 @@ define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_xor_b32_e32 v3, -1, v2
-; VI-NEXT:    v_and_b32_e32 v2, 15, v2
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
 ; VI-NEXT:    v_and_b32_e32 v3, 15, v3
+; VI-NEXT:    v_and_b32_e32 v2, 15, v2
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
 ; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -652,9 +652,9 @@ define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -670,8 +670,8 @@ define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-NEXT:    v_and_b32_e32 v3, 15, v3
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v3, v0
@@ -710,9 +710,9 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
 ; VI-NEXT:    v_lshlrev_b16_e32 v3, v3, v5
 ; VI-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_xor_b32_e32 v4, -1, v2
-; VI-NEXT:    v_and_b32_e32 v2, 15, v2
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
 ; VI-NEXT:    v_and_b32_e32 v4, 15, v4
+; VI-NEXT:    v_and_b32_e32 v2, 15, v2
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
 ; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -724,9 +724,9 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0xf000f
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -767,9 +767,9 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
 ; SI-NEXT:    s_mov_b32 s4, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v3, 16, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; SI-NEXT:    v_alignbit_b32 v3, v2, v4, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, s4, v0
+; SI-NEXT:    v_alignbit_b32 v3, v2, v4, v3
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_and_b32_e32 v2, s4, v3
 ; SI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
@@ -788,9 +788,9 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
 ; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
 ; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_xor_b32_e32 v7, -1, v5
-; VI-NEXT:    v_and_b32_e32 v5, 15, v5
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
 ; VI-NEXT:    v_and_b32_e32 v7, 15, v7
+; VI-NEXT:    v_and_b32_e32 v5, 15, v5
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
 ; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -817,9 +817,9 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
 ; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
 ; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v5
-; GFX9-NEXT:    v_and_b32_e32 v5, 15, v5
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
 ; GFX9-NEXT:    v_and_b32_e32 v7, 15, v7
+; GFX9-NEXT:    v_and_b32_e32 v5, 15, v5
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -843,30 +843,30 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX10-NEXT:    v_and_b32_e32 v8, 15, v8
 ; GFX10-NEXT:    v_and_b32_e32 v9, 15, v6
 ; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v6
+; GFX10-NEXT:    v_and_b32_e32 v8, 15, v8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
-; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
 ; GFX10-NEXT:    v_lshlrev_b16 v10, 1, v10
-; GFX10-NEXT:    v_and_b32_e32 v6, 15, v6
 ; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v5
+; GFX10-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
+; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
 ; GFX10-NEXT:    v_lshrrev_b16 v4, v9, v7
-; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 15, v5
-; GFX10-NEXT:    v_lshlrev_b16 v6, v6, v10
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
+; GFX10-NEXT:    v_lshlrev_b16 v6, v6, v10
 ; GFX10-NEXT:    v_and_b32_e32 v7, 15, v11
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_lshrrev_b16 v2, v2, v3
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 15, v5
 ; GFX10-NEXT:    v_or_b32_e32 v4, v6, v4
 ; GFX10-NEXT:    v_lshlrev_b16 v1, v7, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_lshrrev_b16 v2, v2, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -893,9 +893,9 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; SI-NEXT:    v_alignbit_b32 v2, v2, v5, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_and_b32_e32 v2, s4, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, s4, v0
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
@@ -906,8 +906,8 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; VI-NEXT:    v_and_b32_e32 v7, 15, v6
-; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
 ; VI-NEXT:    v_mov_b32_e32 v8, 1
+; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
 ; VI-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_and_b32_e32 v6, 15, v6
 ; VI-NEXT:    v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -920,9 +920,9 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; VI-NEXT:    v_and_b32_e32 v7, 15, v7
 ; VI-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
 ; VI-NEXT:    v_xor_b32_e32 v8, -1, v5
-; VI-NEXT:    v_and_b32_e32 v5, 15, v5
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
 ; VI-NEXT:    v_and_b32_e32 v8, 15, v8
+; VI-NEXT:    v_and_b32_e32 v5, 15, v5
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
 ; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -944,8 +944,8 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; GFX9-NEXT:    v_and_b32_e32 v7, 15, v6
-; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 1
+; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_and_b32_e32 v6, 15, v6
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -958,9 +958,9 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; GFX9-NEXT:    v_and_b32_e32 v7, 15, v7
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
 ; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v5
-; GFX9-NEXT:    v_and_b32_e32 v5, 15, v5
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
 ; GFX9-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX9-NEXT:    v_and_b32_e32 v5, 15, v5
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -973,9 +973,9 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v3, v2
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GFX9-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v7, 16, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -991,8 +991,8 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v6
 ; GFX10-NEXT:    v_and_b32_e32 v6, 15, v6
@@ -1038,8 +1038,8 @@ define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v5, 63, v4
-; SI-NEXT:    v_not_b32_e32 v4, v4
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; SI-NEXT:    v_not_b32_e32 v4, v4
 ; SI-NEXT:    v_and_b32_e32 v4, 63, v4
 ; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], v5
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
@@ -1051,8 +1051,8 @@ define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v5, 63, v4
-; VI-NEXT:    v_not_b32_e32 v4, v4
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; VI-NEXT:    v_not_b32_e32 v4, v4
 ; VI-NEXT:    v_and_b32_e32 v4, 63, v4
 ; VI-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
@@ -1064,8 +1064,8 @@ define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v5, 63, v4
-; GFX9-NEXT:    v_not_b32_e32 v4, v4
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX9-NEXT:    v_not_b32_e32 v4, v4
 ; GFX9-NEXT:    v_and_b32_e32 v4, 63, v4
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
@@ -1121,8 +1121,8 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v9, 63, v8
-; VI-NEXT:    v_not_b32_e32 v8, v8
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; VI-NEXT:    v_not_b32_e32 v8, v8
 ; VI-NEXT:    v_and_b32_e32 v8, 63, v8
 ; VI-NEXT:    v_lshrrev_b64 v[4:5], v9, v[4:5]
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
@@ -1142,8 +1142,8 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v9, 63, v8
-; GFX9-NEXT:    v_not_b32_e32 v8, v8
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX9-NEXT:    v_not_b32_e32 v8, v8
 ; GFX9-NEXT:    v_and_b32_e32 v8, 63, v8
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v9, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 84866764e6508..6e89bfe3ae02e 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -100,14 +100,14 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
@@ -193,13 +193,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -291,13 +291,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -387,14 +387,14 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -477,13 +477,13 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX9-NEXT:    global_load_sbyte v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -569,13 +569,13 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -659,14 +659,14 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -749,13 +749,13 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -841,13 +841,13 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i16_zeroext at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_zeroext at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -931,14 +931,14 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -1019,15 +1019,15 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i64 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i64 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -1113,13 +1113,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -1205,17 +1205,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -1305,15 +1305,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i64 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i64 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -1408,17 +1408,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i64 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i64 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -1513,14 +1513,14 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_f16 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_f16 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -1601,14 +1601,14 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 4.0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_f32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_f32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -1689,15 +1689,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2f32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -1780,16 +1780,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 4.0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -1874,18 +1874,18 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 4.0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, -1.0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0.5
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v5f32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v5f32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -1974,15 +1974,15 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_f64 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_f64 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -2065,17 +2065,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2f64 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f64 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -2162,19 +2162,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x40100000
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x40200000
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f64 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f64 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -2204,10 +2204,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0x40200000
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3f64 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f64 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -2238,10 +2238,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 0x40200000
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3f64 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f64 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
@@ -2266,13 +2266,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i16 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i16 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -2355,13 +2355,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -2444,13 +2444,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -2532,15 +2532,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -2623,15 +2623,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x40003c00
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -2715,13 +2715,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -2803,15 +2803,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40003
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -2895,13 +2895,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2f16 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f16 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -2984,13 +2984,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -3072,15 +3072,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -3163,16 +3163,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -3257,17 +3257,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 5
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32_i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32_i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -3355,13 +3355,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -3443,17 +3443,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -3540,18 +3540,18 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v5i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v5i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -3642,16 +3642,16 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[4:5]
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[4:5] offset:16
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -3676,16 +3676,15 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[4:5]
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[4:5] offset:16
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -3711,16 +3710,15 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
-; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v8i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v8i32 at rel32@hi+12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
@@ -3746,9 +3744,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 3
@@ -3757,10 +3755,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 6
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 7
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -3792,10 +3790,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v7, 8
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -3828,10 +3826,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 8
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v8i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v8i32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
@@ -3857,18 +3855,18 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:16
 ; GFX9-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:32
 ; GFX9-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:48
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -3893,18 +3891,17 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:16
 ; GFX10-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:32
 ; GFX10-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:48
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -3930,18 +3927,17 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_clause 0x3
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v16i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32 at rel32@hi+12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
@@ -3969,8 +3965,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v28, s[4:5]
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v28, s[4:5] offset:16
@@ -3981,11 +3977,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX9-NEXT:    global_load_dwordx4 v[24:27], v28, s[4:5] offset:96
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    global_load_dwordx4 v[28:31], v28, s[4:5] offset:112
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -4010,8 +4006,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
@@ -4022,10 +4016,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:80
 ; GFX10-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:96
 ; GFX10-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:112
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -4051,8 +4046,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_clause 0x7
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
@@ -4063,10 +4056,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v32i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v32i32 at rel32@hi+12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
@@ -4094,8 +4088,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v28, s[4:5]
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v28, s[4:5] offset:16
@@ -4106,11 +4100,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX9-NEXT:    global_load_dwordx4 v[24:27], v28, s[4:5] offset:96
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    global_load_dwordx4 v[28:31], v28, s[4:5] offset:112
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    global_load_dword v32, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -4139,8 +4133,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    global_load_dword v33, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
@@ -4152,10 +4144,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:80
 ; GFX10-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:96
 ; GFX10-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:112
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -4183,8 +4176,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    global_load_dword v33, v[0:1], off
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_clause 0x7
@@ -4196,10 +4187,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v32i32_i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v32i32_i32 at rel32@hi+12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v33, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -4229,17 +4221,17 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    v_mov_b32_e32 v42, v1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_i32_func_i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_i32_func_i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    v_mov_b32_e32 v42, v1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    global_store_dword v[41:42], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -4265,16 +4257,16 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_mov_b32_e32 v42, v1
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_i32_func_i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_i32_func_i32 at rel32@hi+12
-; GFX10-NEXT:    v_mov_b32_e32 v42, v1
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    global_store_dword v[41:42], v0, off
@@ -4303,16 +4295,16 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, v1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_i32_func_i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_i32_func_i32 at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, v1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    global_store_dword v[41:42], v0, off
@@ -4345,16 +4337,16 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v2, s[4:5] offset:4
 ; GFX9-NEXT:    global_load_ubyte v0, v2, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_struct_i8_i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_struct_i8_i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -4379,16 +4371,15 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_ubyte v0, v2, s[4:5]
 ; GFX10-NEXT:    global_load_dword v1, v2, s[4:5] offset:4
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_struct_i8_i32 at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_struct_i8_i32 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -4414,16 +4405,15 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    global_load_ubyte v0, v2, s[0:1]
 ; GFX10-SCRATCH-NEXT:    global_load_dword v1, v2, s[0:1] offset:4
-; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_struct_i8_i32 at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_struct_i8_i32 at rel32@hi+12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
@@ -4453,14 +4443,14 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s33
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -4560,14 +4550,14 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_add_u32_e32 v0, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:8
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
@@ -4698,11 +4688,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v0, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8 at rel32@lo+4
@@ -4714,14 +4704,14 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
-; GFX9-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v12, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v16
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v17
@@ -4765,14 +4755,14 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX10-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v12, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v16
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v17
@@ -4817,14 +4807,14 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v12, v3
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, v16
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, v17
@@ -4903,14 +4893,14 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
@@ -4994,14 +4984,14 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7b
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i8_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i8_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -5082,14 +5072,14 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7b
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i16_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i16_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -5170,14 +5160,14 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 42
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i32_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -5258,15 +5248,15 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7b
 ; GFX9-NEXT:    s_mov_b32 s5, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i64_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i64_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -5351,13 +5341,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[8:9]
 ; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v2i64_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i64_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -5441,17 +5431,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 1
 ; GFX9-NEXT:    s_mov_b32 s5, 2
 ; GFX9-NEXT:    s_mov_b32 s6, 3
 ; GFX9-NEXT:    s_mov_b32 s7, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[8:9]
 ; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v2i64_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i64_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -5540,15 +5530,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s8, 1
 ; GFX9-NEXT:    s_mov_b32 s9, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[10:11]
 ; GFX9-NEXT:    s_add_u32 s10, s10, external_void_func_v3i64_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s11, s11, external_void_func_v3i64_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[10:11]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -5640,17 +5630,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s8, 1
 ; GFX9-NEXT:    s_mov_b32 s9, 2
 ; GFX9-NEXT:    s_mov_b32 s10, 3
 ; GFX9-NEXT:    s_mov_b32 s11, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[12:13]
 ; GFX9-NEXT:    s_add_u32 s12, s12, external_void_func_v4i64_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s13, s13, external_void_func_v4i64_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[12:13]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -5743,14 +5733,14 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_movk_i32 s4, 0x4400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_f16_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_f16_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -5831,14 +5821,14 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 4.0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_f32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_f32_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -5919,15 +5909,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 1.0
 ; GFX9-NEXT:    s_mov_b32 s5, 2.0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v2f32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v2f32_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -6010,16 +6000,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 1.0
 ; GFX9-NEXT:    s_mov_b32 s5, 2.0
 ; GFX9-NEXT:    s_mov_b32 s6, 4.0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[8:9]
 ; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v3f32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f32_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -6104,18 +6094,18 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 1.0
 ; GFX9-NEXT:    s_mov_b32 s5, 2.0
 ; GFX9-NEXT:    s_mov_b32 s6, 4.0
 ; GFX9-NEXT:    s_mov_b32 s7, -1.0
 ; GFX9-NEXT:    s_mov_b32 s8, 0.5
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[10:11]
 ; GFX9-NEXT:    s_add_u32 s10, s10, external_void_func_v5f32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s11, s11, external_void_func_v5f32_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[10:11]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -6204,15 +6194,15 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 0
 ; GFX9-NEXT:    s_mov_b32 s5, 0x40100000
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_f64_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_f64_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -6295,17 +6285,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 0
 ; GFX9-NEXT:    s_mov_b32 s5, 2.0
 ; GFX9-NEXT:    s_mov_b32 s6, 0
 ; GFX9-NEXT:    s_mov_b32 s7, 0x40100000
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[8:9]
 ; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v2f64_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v2f64_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -6392,19 +6382,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 0
 ; GFX9-NEXT:    s_mov_b32 s5, 2.0
 ; GFX9-NEXT:    s_mov_b32 s6, 0
 ; GFX9-NEXT:    s_mov_b32 s7, 0x40100000
 ; GFX9-NEXT:    s_mov_b32 s8, 0
 ; GFX9-NEXT:    s_mov_b32 s9, 0x40200000
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[10:11]
 ; GFX9-NEXT:    s_add_u32 s10, s10, external_void_func_v3f64_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s11, s11, external_void_func_v3f64_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[10:11]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -6434,10 +6424,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s9, 0x40200000
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[10:11]
 ; GFX10-NEXT:    s_add_u32 s10, s10, external_void_func_v3f64_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s11, s11, external_void_func_v3f64_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[10:11]
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -6468,10 +6458,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s9, 0x40200000
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v3f64_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f64_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
@@ -6496,13 +6486,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v2i16_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v2i16_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -6585,13 +6575,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v3i16_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v3i16_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -6674,13 +6664,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v3f16_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v3f16_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -6762,15 +6752,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX9-NEXT:    s_mov_b32 s5, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v3i16_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v3i16_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -6853,15 +6843,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 0x40003c00
 ; GFX9-NEXT:    s_movk_i32 s5, 0x4400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v3f16_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v3f16_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -6945,13 +6935,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v4i16_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v4i16_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -7033,15 +7023,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 0x20001
 ; GFX9-NEXT:    s_mov_b32 s5, 0x40003
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v4i16_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v4i16_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -7125,13 +7115,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v2f16_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v2f16_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -7214,13 +7204,13 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v2i32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v2i32_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -7302,15 +7292,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 1
 ; GFX9-NEXT:    s_mov_b32 s5, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v2i32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v2i32_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -7393,16 +7383,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 3
 ; GFX9-NEXT:    s_mov_b32 s5, 4
 ; GFX9-NEXT:    s_mov_b32 s6, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[8:9]
 ; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v3i32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i32_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -7487,17 +7477,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 3
 ; GFX9-NEXT:    s_mov_b32 s5, 4
 ; GFX9-NEXT:    s_mov_b32 s6, 5
 ; GFX9-NEXT:    s_mov_b32 s7, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[8:9]
 ; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v3i32_i32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i32_i32_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -7585,13 +7575,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[8:9]
 ; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v4i32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i32_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -7673,17 +7663,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 1
 ; GFX9-NEXT:    s_mov_b32 s5, 2
 ; GFX9-NEXT:    s_mov_b32 s6, 3
 ; GFX9-NEXT:    s_mov_b32 s7, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[8:9]
 ; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v4i32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i32_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -7770,18 +7760,18 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 1
 ; GFX9-NEXT:    s_mov_b32 s5, 2
 ; GFX9-NEXT:    s_mov_b32 s6, 3
 ; GFX9-NEXT:    s_mov_b32 s7, 4
 ; GFX9-NEXT:    s_mov_b32 s8, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[10:11]
 ; GFX9-NEXT:    s_add_u32 s10, s10, external_void_func_v5i32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s11, s11, external_void_func_v5i32_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[10:11]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -7871,15 +7861,15 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[12:13]
 ; GFX9-NEXT:    s_add_u32 s12, s12, external_void_func_v8i32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s13, s13, external_void_func_v8i32_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[12:13]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -7966,9 +7956,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 1
 ; GFX9-NEXT:    s_mov_b32 s5, 2
 ; GFX9-NEXT:    s_mov_b32 s6, 3
@@ -7977,10 +7967,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s9, 6
 ; GFX9-NEXT:    s_mov_b32 s10, 7
 ; GFX9-NEXT:    s_mov_b32 s11, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[12:13]
 ; GFX9-NEXT:    s_add_u32 s12, s12, external_void_func_v8i32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s13, s13, external_void_func_v8i32_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[12:13]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -8012,10 +8002,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s11, 8
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[12:13]
 ; GFX10-NEXT:    s_add_u32 s12, s12, external_void_func_v8i32_inreg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s13, s13, external_void_func_v8i32_inreg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[12:13]
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -8048,10 +8038,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s11, 8
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v8i32_inreg at rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v8i32_inreg at rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
@@ -8076,15 +8066,15 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[20:21]
 ; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v16i32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v16i32_inreg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -8217,10 +8207,10 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX9-NEXT:    s_mov_b32 s27, s43
 ; GFX9-NEXT:    s_mov_b32 s28, s44
 ; GFX9-NEXT:    s_mov_b32 s29, s45
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    s_getpc_b64 s[30:31]
 ; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_inreg at rel32@hi+12
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 16
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 17
@@ -8455,8 +8445,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[20:21], 0x0
 ; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[20:21], 0x40
 ; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    v_mov_b32_e32 v0, s22
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, s22
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s46
@@ -8482,10 +8472,10 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s28, s44
 ; GFX9-NEXT:    s_mov_b32 s29, s45
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 17
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    s_getpc_b64 s[30:31]
 ; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg at rel32@hi+12
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 16
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 17
@@ -8711,12 +8701,12 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s33
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:4
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
@@ -8742,19 +8732,19 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg at rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg at rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -8853,10 +8843,10 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v29, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 10
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 11
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3i32 at rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -8920,10 +8910,10 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v29, 9
 ; GFX10-NEXT:    v_mov_b32_e32 v30, 10
 ; GFX10-NEXT:    v_mov_b32_e32 v31, 11
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -8985,10 +8975,10 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v29, 9
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v30, 10
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v31, 11
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_12xv3i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
@@ -9076,10 +9066,10 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v29, 5
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 6
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -9105,13 +9095,13 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_mov_b32_e32 v3, 14
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 11
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 12
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 13
+; GFX10-NEXT:    v_mov_b32_e32 v3, 14
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
@@ -9151,10 +9141,10 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v29, 5
 ; GFX10-NEXT:    v_mov_b32_e32 v30, 6
 ; GFX10-NEXT:    v_mov_b32_e32 v31, 7
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -9221,10 +9211,10 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v29, 5
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v30, 6
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v31, 7
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_8xv5i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
@@ -9308,10 +9298,10 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v29, 0x40a00000
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 0x40e00000
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -9337,13 +9327,13 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0x41600000
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x41300000
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41400000
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0x41500000
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x41600000
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0x41700000
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
@@ -9383,10 +9373,10 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v29, 0x40a00000
 ; GFX10-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; GFX10-NEXT:    v_mov_b32_e32 v31, 0x40e00000
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -9453,10 +9443,10 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v29, 0x40a00000
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v31, 0x40e00000
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_8xv5f32@rel32@hi+12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index 4218e321b3b84..2f5f1485c98c2 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -14,13 +14,13 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s35, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ;;#ASMEND
@@ -109,17 +109,17 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def s31
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_mov_b32 s34, s31
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s34, s31
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 1
 ; GFX9-NEXT:    s_mov_b32 s31, s34
@@ -187,18 +187,18 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def v31
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    v_mov_b32_e32 v41, v31
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    v_mov_b32_e32 v41, v31
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v41
 ; GFX9-NEXT:    ;;#ASMSTART
@@ -231,10 +231,10 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
 ; GFX10-NEXT:    ; def v31
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_mov_b32_e32 v41, v31
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; GFX10-NEXT:    v_mov_b32_e32 v41, v31
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_mov_b32_e32 v31, v41
@@ -267,16 +267,16 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)*
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def s33
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    ;;#ASMSTART
@@ -339,16 +339,16 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)*
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def s34
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 1
 ; GFX9-NEXT:    ;;#ASMSTART
@@ -413,17 +413,17 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)*
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v41, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v41, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v41, s30, 0
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
 ; GFX9-NEXT:    v_writelane_b32 v41, s31, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def v40
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v40
@@ -451,13 +451,13 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)*
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v41, s30, 0
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def v40
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_writelane_b32 v41, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v41, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    ;;#ASMSTART
@@ -572,13 +572,13 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, void_func_void_clobber_s33@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, void_func_void_clobber_s33@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -629,13 +629,13 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, void_func_void_clobber_s34@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, void_func_void_clobber_s34@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
@@ -687,16 +687,16 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s40, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def s40
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 1
 ; GFX9-NEXT:    ;;#ASMSTART
@@ -762,21 +762,21 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s40, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def s40
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def v32
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    v_mov_b32_e32 v41, v32
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX9-NEXT:    v_mov_b32_e32 v41, v32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s40
@@ -807,10 +807,10 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    v_writelane_b32 v40, s40, 0
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def s40

diff  --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index ef07ac81373e7..3e0fae8b29764 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -65,8 +65,8 @@ define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x ha
 ; SI-NEXT:    s_add_u32 s4, s0, 4
 ; SI-NEXT:    s_addc_u32 s5, s1, 0
 ; SI-NEXT:    v_mov_b32_e32 v2, s4
-; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    v_mov_b32_e32 v4, s3
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    v_mov_b32_e32 v3, s5
 ; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    v_mov_b32_e32 v5, s2
@@ -82,8 +82,8 @@ define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x ha
 ; VI-NEXT:    s_add_u32 s4, s0, 4
 ; VI-NEXT:    s_addc_u32 s5, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v4, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v5, s2
@@ -336,16 +336,16 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)*
 ; SI-NEXT:    s_lshr_b32 s8, s3, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
 ; SI-NEXT:    s_lshr_b32 s4, s2, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, s8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, s3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, s2
 ; SI-NEXT:    s_add_u32 s0, s6, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s1
 ; SI-NEXT:    s_addc_u32 s1, s7, 0
-; SI-NEXT:    v_mov_b32_e32 v9, s1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT:    v_mov_b32_e32 v9, s1
 ; SI-NEXT:    v_mov_b32_e32 v8, s0
 ; SI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; SI-NEXT:    s_nop 0
@@ -364,16 +364,16 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)*
 ; VI-NEXT:    s_lshr_b32 s8, s3, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v3, s4
 ; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v7, s8
 ; VI-NEXT:    v_cvt_f32_f16_e32 v5, s4
+; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v6, s3
 ; VI-NEXT:    v_cvt_f32_f16_e32 v4, s2
 ; VI-NEXT:    s_add_u32 s0, s6, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, s1
 ; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v9, s1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v9, s1
 ; VI-NEXT:    v_mov_b32_e32 v8, s0
 ; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; VI-NEXT:    s_nop 0
@@ -462,14 +462,14 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)*
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
 ; SI-NEXT:    s_lshr_b32 s4, s2, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
 ; SI-NEXT:    s_add_u32 s2, s0, 16
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v0
 ; SI-NEXT:    s_addc_u32 s3, s1, 0
-; SI-NEXT:    v_mov_b32_e32 v7, s3
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; SI-NEXT:    v_mov_b32_e32 v7, s3
 ; SI-NEXT:    v_mov_b32_e32 v6, s2
 ; SI-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v5, s1
@@ -484,14 +484,14 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)*
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
 ; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    v_cvt_f32_f16_e32 v2, s4
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; VI-NEXT:    v_cvt_f32_f16_e32 v2, s4
 ; VI-NEXT:    s_add_u32 s2, s0, 16
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v7, s3
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; VI-NEXT:    v_mov_b32_e32 v7, s3
 ; VI-NEXT:    v_mov_b32_e32 v6, s2
 ; VI-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
@@ -515,13 +515,13 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)*
 ; SI-NEXT:    s_lshr_b32 s5, s2, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
-; SI-NEXT:    s_add_u32 s2, s0, 16
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
+; SI-NEXT:    s_add_u32 s2, s0, 16
 ; SI-NEXT:    s_addc_u32 s3, s1, 0
-; SI-NEXT:    v_mov_b32_e32 v9, s3
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; SI-NEXT:    v_mov_b32_e32 v9, s3
 ; SI-NEXT:    v_mov_b32_e32 v8, s2
 ; SI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; SI-NEXT:    s_nop 0
@@ -541,13 +541,13 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)*
 ; VI-NEXT:    s_lshr_b32 s4, s2, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, s4
-; VI-NEXT:    s_add_u32 s2, s0, 16
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
+; VI-NEXT:    s_add_u32 s2, s0, 16
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v9, s3
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; VI-NEXT:    v_mov_b32_e32 v9, s3
 ; VI-NEXT:    v_mov_b32_e32 v8, s2
 ; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; VI-NEXT:    s_nop 0
@@ -573,13 +573,13 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)*
 ; SI-NEXT:    s_lshr_b32 s8, s1, 16
 ; SI-NEXT:    s_lshr_b32 s4, s0, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, s0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, s2
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, s0
 ; SI-NEXT:    s_add_u32 s0, s6, 48
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, s1
-; SI-NEXT:    s_addc_u32 s1, s7, 0
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[14:15], v0
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
+; SI-NEXT:    s_addc_u32 s1, s7, 0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, s8
 ; SI-NEXT:    v_mov_b32_e32 v17, s1
 ; SI-NEXT:    v_mov_b32_e32 v16, s0
@@ -592,12 +592,13 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)*
 ; SI-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
 ; SI-NEXT:    v_mov_b32_e32 v13, s1
+; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
 ; SI-NEXT:    v_mov_b32_e32 v12, s0
 ; SI-NEXT:    s_add_u32 s0, s6, 16
-; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
 ; SI-NEXT:    s_addc_u32 s1, s7, 0
-; SI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; SI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
+; SI-NEXT:    s_nop 0
 ; SI-NEXT:    v_mov_b32_e32 v9, s1
 ; SI-NEXT:    v_mov_b32_e32 v8, s0
 ; SI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
@@ -616,19 +617,19 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)*
 ; VI-NEXT:    s_lshr_b32 s8, s2, 16
 ; VI-NEXT:    s_lshr_b32 s9, s3, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s4
-; VI-NEXT:    v_cvt_f32_f16_e32 v12, s3
 ; VI-NEXT:    v_cvt_f32_f16_e32 v4, s8
 ; VI-NEXT:    v_cvt_f32_f16_e32 v5, s9
+; VI-NEXT:    v_cvt_f32_f16_e32 v12, s3
 ; VI-NEXT:    s_lshr_b32 s5, s1, 16
-; VI-NEXT:    v_cvt_f32_f16_e32 v8, s2
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
+; VI-NEXT:    v_cvt_f32_f16_e32 v8, s2
 ; VI-NEXT:    s_add_u32 s0, s6, 48
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v4
-; VI-NEXT:    v_cvt_f32_f16_e32 v4, s1
-; VI-NEXT:    s_addc_u32 s1, s7, 0
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[14:15], v5
+; VI-NEXT:    v_cvt_f32_f16_e32 v4, s1
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
+; VI-NEXT:    s_addc_u32 s1, s7, 0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v17, s1
 ; VI-NEXT:    v_mov_b32_e32 v16, s0
@@ -638,12 +639,13 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)*
 ; VI-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v1
 ; VI-NEXT:    v_mov_b32_e32 v13, s1
+; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
 ; VI-NEXT:    v_mov_b32_e32 v12, s0
 ; VI-NEXT:    s_add_u32 s0, s6, 16
-; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
 ; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; VI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
+; VI-NEXT:    s_nop 0
 ; VI-NEXT:    v_mov_b32_e32 v9, s1
 ; VI-NEXT:    v_mov_b32_e32 v8, s0
 ; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
@@ -854,9 +856,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1
 ; VI-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v4
-; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -958,20 +960,21 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspac
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v6
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
 ; SI-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; SI-NEXT:    s_nop 0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; SI-NEXT:    v_mov_b32_e32 v5, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v17
 ; SI-NEXT:    v_mov_b32_e32 v4, s0
@@ -1005,8 +1008,8 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspac
 ; VI-NEXT:    v_mov_b32_e32 v19, s3
 ; VI-NEXT:    v_mov_b32_e32 v18, s2
 ; VI-NEXT:    s_add_u32 s2, s0, 48
-; VI-NEXT:    s_addc_u32 s3, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v17, s1
+; VI-NEXT:    s_addc_u32 s3, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v16, s0
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
@@ -1014,12 +1017,12 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspac
 ; VI-NEXT:    v_mov_b32_e32 v20, s2
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v14, v3
-; VI-NEXT:    v_cvt_f32_f16_sdwa v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v12, v2
+; VI-NEXT:    v_cvt_f32_f16_sdwa v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v10, v1
-; VI-NEXT:    v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v8, v0
+; VI-NEXT:    v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -1076,10 +1079,10 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(
 ; SI-NEXT:    v_mov_b32_e32 v5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; SI-NEXT:    s_endpgm
 ;
@@ -1093,10 +1096,10 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
+; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
   %val = load <2 x half>, <2 x half> addrspace(1)* %in
@@ -1120,12 +1123,12 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
-; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; SI-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
+; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; SI-NEXT:    v_mov_b32_e32 v5, s1
 ; SI-NEXT:    v_mov_b32_e32 v4, s0
 ; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -1201,12 +1204,12 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(
 ; VI-NEXT:    v_mov_b32_e32 v9, s1
 ; VI-NEXT:    v_mov_b32_e32 v8, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; VI-NEXT:    v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v3
+; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v10
 ; VI-NEXT:    v_mov_b32_e32 v11, s3
@@ -1233,8 +1236,8 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(
 ; SI-NEXT:    v_mov_b32_e32 v7, s3
 ; SI-NEXT:    v_mov_b32_e32 v6, s2
 ; SI-NEXT:    s_add_u32 s2, s0, 32
-; SI-NEXT:    s_addc_u32 s3, s1, 0
 ; SI-NEXT:    v_mov_b32_e32 v13, s1
+; SI-NEXT:    s_addc_u32 s3, s1, 0
 ; SI-NEXT:    v_mov_b32_e32 v12, s0
 ; SI-NEXT:    s_add_u32 s0, s0, 16
 ; SI-NEXT:    v_mov_b32_e32 v15, s3
@@ -1242,25 +1245,26 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(
 ; SI-NEXT:    v_mov_b32_e32 v14, s2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v5
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, v9
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v11
-; SI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
 ; SI-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
-; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; SI-NEXT:    s_nop 0
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v10
+; SI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v16
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v17
+; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v18
 ; SI-NEXT:    v_mov_b32_e32 v17, s1
 ; SI-NEXT:    v_mov_b32_e32 v16, s0
@@ -1281,22 +1285,22 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(
 ; VI-NEXT:    v_mov_b32_e32 v8, s3
 ; VI-NEXT:    v_mov_b32_e32 v7, s2
 ; VI-NEXT:    s_add_u32 s2, s0, 32
-; VI-NEXT:    s_addc_u32 s3, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v13, s1
+; VI-NEXT:    s_addc_u32 s3, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v12, s0
 ; VI-NEXT:    s_add_u32 s0, s0, 16
 ; VI-NEXT:    v_mov_b32_e32 v15, s3
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v14, s2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v9, v0
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v3
-; VI-NEXT:    v_cvt_f32_f16_e32 v11, v2
+; VI-NEXT:    v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v10, v1
-; VI-NEXT:    v_cvt_f64_f32_e32 v[5:6], v5
+; VI-NEXT:    v_cvt_f32_f16_e32 v11, v2
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[3:4], v0
+; VI-NEXT:    v_cvt_f64_f32_e32 v[5:6], v5
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
@@ -1359,37 +1363,38 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspa
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; SI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
-; SI-NEXT:    v_mov_b32_e32 v15, s3
+; SI-NEXT:    s_nop 0
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_mov_b32_e32 v14, s2
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; SI-NEXT:    v_mov_b32_e32 v15, s3
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v0
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
+; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; SI-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
 ; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
-; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
+; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v11
+; SI-NEXT:    v_mov_b32_e32 v14, s2
 ; SI-NEXT:    s_add_u32 s2, s0, 0x60
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v11
 ; SI-NEXT:    s_addc_u32 s3, s1, 0
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; SI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; SI-NEXT:    v_mov_b32_e32 v17, s3
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v7
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v20
-; SI-NEXT:    v_mov_b32_e32 v17, s3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, v5
 ; SI-NEXT:    v_mov_b32_e32 v16, s2
@@ -1400,10 +1405,10 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspa
 ; SI-NEXT:    s_add_u32 s0, s0, 64
 ; SI-NEXT:    flat_store_dwordx4 v[14:15], v[0:3]
 ; SI-NEXT:    s_addc_u32 s1, s1, 0
-; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v7
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v21
-; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v12
+; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v7
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; SI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v12
 ; SI-NEXT:    v_mov_b32_e32 v19, s3
 ; SI-NEXT:    v_mov_b32_e32 v13, s1
 ; SI-NEXT:    v_mov_b32_e32 v18, s2
@@ -1449,43 +1454,45 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspa
 ; VI-NEXT:    v_cvt_f32_f16_e32 v3, v2
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
-; VI-NEXT:    v_mov_b32_e32 v15, s3
+; VI-NEXT:    s_nop 0
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v3
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_mov_b32_e32 v14, s2
 ; VI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
-; VI-NEXT:    s_add_u32 s2, s0, 0x60
+; VI-NEXT:    s_nop 0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v8, v0
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_cvt_f32_f16_e32 v10, v4
-; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v17, s3
 ; VI-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
-; VI-NEXT:    v_cvt_f32_f16_sdwa v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    v_cvt_f32_f16_e32 v10, v4
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v8
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v9
+; VI-NEXT:    v_cvt_f32_f16_sdwa v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v4, v7
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f32_f16_e32 v8, v5
+; VI-NEXT:    v_mov_b32_e32 v15, s3
+; VI-NEXT:    v_mov_b32_e32 v14, s2
+; VI-NEXT:    s_add_u32 s2, s0, 0x60
 ; VI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; VI-NEXT:    v_cvt_f32_f16_sdwa v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_cvt_f32_f16_e32 v8, v5
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v7
 ; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    s_addc_u32 s3, s1, 0
+; VI-NEXT:    v_cvt_f32_f16_sdwa v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_mov_b32_e32 v17, s3
 ; VI-NEXT:    v_mov_b32_e32 v16, s2
 ; VI-NEXT:    s_add_u32 s2, s0, 0x50
 ; VI-NEXT:    s_addc_u32 s3, s1, 0
-; VI-NEXT:    flat_store_dwordx4 v[14:15], v[0:3]
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
+; VI-NEXT:    flat_store_dwordx4 v[14:15], v[0:3]
+; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v6
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v8
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v7
-; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v6
 ; VI-NEXT:    s_add_u32 s0, s0, 64
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v12
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
@@ -1579,8 +1586,8 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    flat_store_short v[0:1], v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    flat_store_short v[0:1], v2
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    v_or_b32_e32 v2, v4, v3
 ; SI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1644,10 +1651,10 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f16_f32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-NEXT:    v_cvt_f16_f32_e32 v5, v0
 ; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; VI-NEXT:    v_cvt_f16_f32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT:    v_cvt_f16_f32_e32 v5, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_or_b32_e32 v3, v2, v3
@@ -1678,20 +1685,20 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v6, v0
-; SI-NEXT:    v_or_b32_e32 v3, v2, v3
 ; SI-NEXT:    v_or_b32_e32 v0, v4, v5
+; SI-NEXT:    v_or_b32_e32 v3, v2, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v10, v7
 ; SI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; SI-NEXT:    s_endpgm
@@ -1712,17 +1719,17 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace
 ; VI-NEXT:    v_mov_b32_e32 v9, s1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; VI-NEXT:    v_cvt_f16_f32_sdwa v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; VI-NEXT:    v_cvt_f16_f32_sdwa v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_cvt_f16_f32_e32 v11, v0
-; VI-NEXT:    v_or_b32_e32 v3, v2, v3
 ; VI-NEXT:    v_or_b32_e32 v1, v6, v7
 ; VI-NEXT:    v_or_b32_e32 v0, v4, v5
+; VI-NEXT:    v_or_b32_e32 v3, v2, v3
 ; VI-NEXT:    v_or_b32_e32 v2, v11, v10
 ; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; VI-NEXT:    s_endpgm
@@ -1745,8 +1752,8 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrsp
 ; SI-NEXT:    v_mov_b32_e32 v13, s3
 ; SI-NEXT:    s_addc_u32 s5, s3, 0
 ; SI-NEXT:    v_mov_b32_e32 v12, s2
-; SI-NEXT:    s_add_u32 s2, s2, 16
 ; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    s_add_u32 s2, s2, 16
 ; SI-NEXT:    v_mov_b32_e32 v5, s5
 ; SI-NEXT:    s_addc_u32 s3, s3, 0
 ; SI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
@@ -1759,39 +1766,38 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrsp
 ; SI-NEXT:    s_addc_u32 s3, s1, 0
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v17, v4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; SI-NEXT:    v_mov_b32_e32 v5, s3
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v16
-; SI-NEXT:    v_or_b32_e32 v3, v6, v2
-; SI-NEXT:    v_or_b32_e32 v2, v17, v7
 ; SI-NEXT:    v_mov_b32_e32 v4, s2
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v18
-; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; SI-NEXT:    v_or_b32_e32 v3, v6, v2
+; SI-NEXT:    v_or_b32_e32 v2, v17, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v15
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v13
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; SI-NEXT:    v_mov_b32_e32 v5, s1
 ; SI-NEXT:    v_or_b32_e32 v1, v14, v6
 ; SI-NEXT:    v_or_b32_e32 v0, v12, v7
@@ -1813,8 +1819,8 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrsp
 ; VI-NEXT:    v_mov_b32_e32 v13, s3
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v12, s2
-; VI-NEXT:    s_add_u32 s2, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    s_add_u32 s2, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
@@ -1845,8 +1851,8 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrsp
 ; VI-NEXT:    v_cvt_f16_f32_sdwa v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; VI-NEXT:    v_mov_b32_e32 v5, s3
-; VI-NEXT:    v_or_b32_e32 v1, v2, v3
 ; VI-NEXT:    v_mov_b32_e32 v4, s2
+; VI-NEXT:    v_or_b32_e32 v1, v2, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v16
 ; VI-NEXT:    v_or_b32_e32 v3, v6, v7
 ; VI-NEXT:    v_or_b32_e32 v2, v18, v17
@@ -1910,8 +1916,8 @@ define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; SI-NEXT:    s_lshr_b32 s0, s1, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s1
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, s0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s2
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s0
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; SI-NEXT:    v_add_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -1966,23 +1972,23 @@ define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_add_f32_e32 v7, v7, v9
-; SI-NEXT:    v_add_f32_e32 v0, v0, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_add_f32_e32 v6, v6, v8
 ; SI-NEXT:    v_add_f32_e32 v1, v1, v3
+; SI-NEXT:    v_add_f32_e32 v0, v0, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_add_f32_e32 v6, v6, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -2035,25 +2041,25 @@ define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half>
 ; SI-NEXT:    s_lshr_b32 s0, s4, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, s0
 ; SI-NEXT:    s_lshr_b32 s0, s5, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s10
 ; SI-NEXT:    s_lshr_b32 s11, s1, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s10
 ; SI-NEXT:    s_lshr_b32 s10, s2, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, s0
 ; SI-NEXT:    s_lshr_b32 s0, s6, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s10
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, s0
 ; SI-NEXT:    s_lshr_b32 s10, s3, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, s0
 ; SI-NEXT:    s_lshr_b32 s0, s7, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, s4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, s10
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, s0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, s0
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, s4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, s2
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, s6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, s3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, s6
 ; SI-NEXT:    v_add_f32_e32 v1, v1, v9
 ; SI-NEXT:    v_add_f32_e32 v0, v0, v8
 ; SI-NEXT:    v_add_f32_e32 v3, v3, v11
@@ -2112,8 +2118,8 @@ define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half>
 ; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_f16_e32 v1, s1, v1
-; VI-NEXT:    s_lshr_b32 s2, s0, 16
 ; VI-NEXT:    s_lshr_b32 s1, s4, 16
+; VI-NEXT:    s_lshr_b32 s2, s0, 16
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-NEXT:    v_mov_b32_e32 v4, s2

diff  --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index d950e69adf795..fb9348bae000c 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -35,8 +35,8 @@ define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    s_add_u32 s2, s2, 1
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    s_addc_u32 s3, s3, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
@@ -135,8 +135,8 @@ define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    s_add_u32 s2, s2, 1
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    s_addc_u32 s3, s3, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
@@ -206,14 +206,13 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-LABEL: sdiv32_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x2c
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s2, s3, 31
 ; GFX9-NEXT:    s_add_i32 s3, s3, s2
 ; GFX9-NEXT:    s_xor_b32 s3, s3, s2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -236,8 +235,9 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s2, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v2
 ; GFX9-NEXT:    s_add_i32 s4, s4, 1
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
@@ -248,12 +248,10 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ;
 ; GFX10-LABEL: sdiv32_invariant_denom:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x2c
-; GFX10-NEXT:    s_nop 0
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_ashr_i32 s2, s3, 31
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    s_add_i32 s3, s3, s2
 ; GFX10-NEXT:    s_xor_b32 s3, s3, s2
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s3
@@ -273,8 +271,8 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s4, v3
 ; GFX10-NEXT:    s_add_i32 s4, s4, 1
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s3, v3
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v2
@@ -282,6 +280,7 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_xor_b32_e32 v2, s2, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s2, v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_add_u32 s0, s0, 4
@@ -311,14 +310,13 @@ define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-LABEL: srem32_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s3, s2, 31
 ; GFX9-NEXT:    s_add_i32 s2, s2, s3
 ; GFX9-NEXT:    s_xor_b32 s2, s2, s3
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX9-NEXT:    s_sub_i32 s3, 0, s2
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -337,8 +335,9 @@ define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    s_add_i32 s3, s3, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
@@ -349,12 +348,10 @@ define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %a
 ;
 ; GFX10-LABEL: srem32_invariant_denom:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX10-NEXT:    s_nop 0
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    s_add_i32 s2, s2, s3
 ; GFX10-NEXT:    s_xor_b32 s2, s2, s3
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
@@ -379,6 +376,7 @@ define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s2, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_add_u32 s0, s0, 4
@@ -431,10 +429,10 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v0
 ; GFX9-NEXT:    v_add_u16_e32 v4, 1, v4
 ; GFX9-NEXT:    v_mad_f32 v0, -v0, v2, v8
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v0|, v2
 ; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s5, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[0:1], 0, v7, s[0:1]
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v0|, v2
 ; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
+; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[0:1], 0, v7, s[0:1]
 ; GFX9-NEXT:    global_store_short v[5:6], v0, off
 ; GFX9-NEXT:    s_cbranch_vccz BB4_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
@@ -461,9 +459,9 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v7, v3
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, s2, v5
-; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
 ; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
 ; GFX10-NEXT:    v_mad_f32 v7, -v0, v2, v7
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cmp_ge_f32_e64 s0, |v7|, v2
@@ -508,13 +506,13 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
 ; GFX9-NEXT:    v_add_u16_e32 v4, 1, v4
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s8, v4
+; GFX9-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX9-NEXT:    v_mul_f32_e32 v9, v8, v3
 ; GFX9-NEXT:    v_trunc_f32_e32 v9, v9
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v10, v9
 ; GFX9-NEXT:    v_mad_f32 v8, -v9, v2, v8
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v8|, v2
-; GFX9-NEXT:    v_mov_b32_e32 v7, s5
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s8, v4
 ; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v8, s7
 ; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s4, v5
@@ -610,8 +608,8 @@ define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v7|, |v2|
 ; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s5, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[0:1]
-; GFX9-NEXT:    v_add_u32_e32 v0, v8, v0
 ; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
+; GFX9-NEXT:    v_add_u32_e32 v0, v8, v0
 ; GFX9-NEXT:    global_store_short v[5:6], v0, off
 ; GFX9-NEXT:    s_cbranch_vccz BB6_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
@@ -640,10 +638,10 @@ define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v7, v3
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 30, v8
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, s2, v5
-; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
 ; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
 ; GFX10-NEXT:    v_or_b32_e32 v8, 1, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
 ; GFX10-NEXT:    v_mad_f32 v7, -v0, v2, v7
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cmp_ge_f32_e64 s1, |v7|, |v2|
@@ -700,12 +698,12 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    v_add_u32_e32 v0, v11, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s6
 ; GFX9-NEXT:    v_add_u16_e32 v4, 1, v4
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s7, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s5
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s7, v4
 ; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s4, v5
 ; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v0, v7, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v0, v7, v0
 ; GFX9-NEXT:    global_store_short v[5:6], v0, off
 ; GFX9-NEXT:    s_cbranch_vccz BB7_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
@@ -741,9 +739,9 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v8, v9
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, s2, v5
-; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, s1
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v7, v0
 ; GFX10-NEXT:    global_store_short v[5:6], v0, off
 ; GFX10-NEXT:    s_cbranch_vccz BB7_1

diff  --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index 7a3fea9b85d95..9601a29c2500e 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -46,6 +46,7 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -54,7 +55,6 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -185,15 +185,15 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -323,6 +323,7 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -331,7 +332,6 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
@@ -457,6 +457,7 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -465,7 +466,6 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -599,6 +599,7 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -607,7 +608,6 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -738,6 +738,7 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -746,7 +747,6 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
@@ -1127,10 +1127,10 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    s_mov_b32 s10, 0
 ; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
@@ -1156,6 +1156,7 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -1168,7 +1169,6 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_and_b32_e32 v1, s2, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
@@ -1582,6 +1582,7 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -1590,7 +1591,6 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -1725,6 +1725,7 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -1733,7 +1734,6 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -1876,6 +1876,7 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -1884,7 +1885,6 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
@@ -2029,6 +2029,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -2037,7 +2038,6 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -2189,6 +2189,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -2197,7 +2198,6 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
@@ -2348,6 +2348,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -2356,7 +2357,6 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -2503,6 +2503,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -2511,7 +2512,6 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
@@ -2636,8 +2636,8 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    buffer_load_ushort v4, off, s[0:3], 0
 ; GFX7-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
@@ -2662,12 +2662,12 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v4, v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    flat_load_ushort v6, v[2:3]
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
@@ -2799,6 +2799,7 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -2807,7 +2808,6 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 8, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 6f44f2aa70804..ee240a4c49080 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -47,6 +47,7 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -55,7 +56,6 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
 ; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
@@ -194,18 +194,18 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_bfe_i32 v5, v0, 0, 8
-; GFX7-NEXT:    v_bfe_i32 v6, v0, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_bfe_i32 v6, v0, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
 ; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
-; GFX7-NEXT:    v_bfe_i32 v7, v0, 16, 8
 ; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_bfe_i32 v7, v0, 16, 8
 ; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v8
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
-; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
@@ -225,31 +225,31 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v4, v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    flat_load_ushort v1, v[2:3]
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
 ; GFX8-NEXT:    v_bfe_i32 v7, v4, 0, 8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX8-NEXT:    v_bfe_i32 v9, v9, 0, 8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
+; GFX8-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v4, v4, 0, 8
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v0
 ; GFX8-NEXT:    v_bfe_i32 v8, v0, 0, 8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_bfe_i32 v10, v10, 0, 8
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v1, v7, v8, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
-; GFX8-NEXT:    v_bfe_i32 v5, v5, 0, 8
 ; GFX8-NEXT:    v_bfe_i32 v6, v6, 0, 8
 ; GFX8-NEXT:    v_mad_u16 v1, v9, v10, v1
-; GFX8-NEXT:    v_bfe_i32 v4, v4, 0, 8
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX8-NEXT:    v_mad_u16 v1, v5, v6, v1
 ; GFX8-NEXT:    v_mad_u16 v0, v4, v0, v1
@@ -267,11 +267,11 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-NODL-NEXT:    global_load_ushort v4, v1, s[2:3]
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
 ; GFX9-NODL-NEXT:    v_bfe_i32 v6, v2, 0, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NODL-NEXT:    v_bfe_i32 v7, v3, 0, 8
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX9-NODL-NEXT:    v_bfe_i32 v8, v8, 0, 8
@@ -301,11 +301,11 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
 ; GFX9-DL-NEXT:    v_bfe_i32 v6, v2, 0, 8
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_bfe_i32 v7, v3, 0, 8
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX9-DL-NEXT:    v_bfe_i32 v8, v8, 0, 8
@@ -337,10 +337,10 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-DL-NEXT:    v_bfe_i32 v4, v1, 0, 8
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX10-DL-NEXT:    v_bfe_i32 v4, v1, 0, 8
 ; GFX10-DL-NEXT:    v_bfe_i32 v7, v2, 0, 8
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
@@ -348,10 +348,10 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_bfe_i32 v6, v6, 0, 8
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX10-DL-NEXT:    v_bfe_i32 v4, v8, 0, 8
 ; GFX10-DL-NEXT:    v_bfe_i32 v7, v9, 0, 8
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
 ; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX10-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
@@ -415,19 +415,19 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    buffer_load_ubyte v8, off, s[0:3], 0
 ; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
 ; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_and_b32_e32 v5, s4, v0
 ; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v8
-; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
@@ -482,9 +482,9 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v4
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
@@ -508,9 +508,9 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v4
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
@@ -624,6 +624,7 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -632,7 +633,6 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
 ; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
@@ -815,6 +815,7 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -823,18 +824,17 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 8, v3
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 24, v3
 ; GFX8-NEXT:    v_bfe_i32 v5, v3, 16, 8
 ; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 24, v0
 ; GFX8-NEXT:    v_bfe_i32 v7, v0, 16, 8
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX8-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_i32_i24 v0, v3, v0, s2
@@ -979,15 +979,15 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
 ; GFX7-NEXT:    v_and_b32_e32 v6, s4, v7
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, v8
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
-; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v7, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
@@ -1006,10 +1006,10 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v4, v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    flat_load_ushort v1, v[2:3]
@@ -1049,20 +1049,20 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v7, 8, v1
-; GFX9-NODL-NEXT:    v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v8, 8, v2
 ; GFX9-NODL-NEXT:    v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NODL-NEXT:    v_lshl_or_b32 v2, v8, 16, v2
 ; GFX9-NODL-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
+; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v9, 8, v5
 ; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v10, 8, v6
 ; GFX9-NODL-NEXT:    v_and_b32_sdwa v6, v4, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v9, 8, v5
 ; GFX9-NODL-NEXT:    v_and_b32_sdwa v4, v4, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_add_u16_e32 v3, v1, v3
+; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
 ; GFX9-NODL-NEXT:    v_lshl_or_b32 v5, v10, 16, v6
 ; GFX9-NODL-NEXT:    v_lshl_or_b32 v4, v9, 16, v4
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_add_u16_e32 v3, v1, v3
 ; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v2, v4, v5
 ; GFX9-NODL-NEXT:    v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v2
@@ -1086,20 +1086,20 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 8, v1
-; GFX9-DL-NEXT:    v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 8, v2
 ; GFX9-DL-NEXT:    v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-DL-NEXT:    v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v8, 16, v2
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 8, v5
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 8, v6
 ; GFX9-DL-NEXT:    v_and_b32_sdwa v6, v4, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 8, v5
 ; GFX9-DL-NEXT:    v_and_b32_sdwa v4, v4, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u16_e32 v3, v1, v3
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v5, v10, 16, v6
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v9, 16, v4
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_add_u16_e32 v3, v1, v3
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v4, v5
 ; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v2
@@ -1121,21 +1121,21 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-DL-NEXT:    v_ashrrev_i16 v5, 8, v1
-; GFX10-DL-NEXT:    v_and_b32_sdwa v8, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_ashrrev_i16 v6, 8, v2
 ; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_and_b32_sdwa v8, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v8
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
+; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v8
 ; GFX10-DL-NEXT:    v_ashrrev_i16 v7, 8, v1
-; GFX10-DL-NEXT:    v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    v_ashrrev_i16 v8, 8, v2
 ; GFX10-DL-NEXT:    v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v6
-; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v8, 16, v2
+; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_add_nc_u16 v3, v4, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index ad5a0a5bd65f9..9e4f7517ee8fb 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -49,6 +49,7 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_movk_i32 s2, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -57,7 +58,6 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
 ; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
@@ -186,19 +186,19 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    buffer_load_ushort v8, off, s[0:3], 0
 ; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
 ; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_and_b32_e32 v5, s4, v0
 ; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v8
-; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
@@ -218,24 +218,24 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v4, v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    flat_load_ushort v1, v[2:3]
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
 ; GFX8-NEXT:    v_and_b32_e32 v6, s0, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
 ; GFX8-NEXT:    v_and_b32_e32 v8, s0, v8
 ; GFX8-NEXT:    v_and_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
 ; GFX8-NEXT:    v_and_b32_e32 v7, s0, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
 ; GFX8-NEXT:    v_and_b32_e32 v9, s0, v9
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v1, v6, v7, v1
@@ -259,11 +259,11 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-NODL-NEXT:    global_load_ushort v4, v1, s[2:3]
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
 ; GFX9-NODL-NEXT:    v_and_b32_e32 v0, s0, v2
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NODL-NEXT:    v_and_b32_e32 v5, s0, v3
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
 ; GFX9-NODL-NEXT:    v_and_b32_e32 v6, s0, v6
 ; GFX9-NODL-NEXT:    v_and_b32_e32 v7, s0, v7
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
@@ -290,11 +290,11 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
 ; GFX9-DL-NEXT:    v_and_b32_e32 v0, s0, v2
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_and_b32_e32 v5, s0, v3
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
 ; GFX9-DL-NEXT:    v_and_b32_e32 v6, s0, v6
 ; GFX9-DL-NEXT:    v_and_b32_e32 v7, s0, v7
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -323,10 +323,10 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, s0, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v4, s0, v1
 ; GFX10-DL-NEXT:    v_and_b32_e32 v7, s0, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v5, s0, v5
 ; GFX10-DL-NEXT:    v_and_b32_e32 v6, s0, v6
@@ -398,19 +398,19 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    buffer_load_ubyte v8, off, s[0:3], 0
 ; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
 ; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_and_b32_e32 v5, s4, v0
 ; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v8
-; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
@@ -465,9 +465,9 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v4
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
@@ -491,9 +491,9 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v4
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
@@ -722,19 +722,19 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    buffer_load_ubyte v8, off, s[0:3], 0
 ; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
 ; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_and_b32_e32 v5, s4, v0
 ; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v1, v8
-; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v3, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
@@ -789,9 +789,9 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_legacy_u16 v2, v3, v2, v4
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
@@ -815,9 +815,9 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v3, v2, v4
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
@@ -905,19 +905,19 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    buffer_load_ubyte v8, off, s[0:3], 0
 ; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v5, s4, v0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v3, v6, v3, v8
-; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v1, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
@@ -937,12 +937,12 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v4, v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    flat_load_ubyte v10, v[2:3]
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v4
@@ -950,9 +950,9 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v6, v7, v6, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v4, v6
 ; GFX8-NEXT:    v_mad_u16 v0, v5, v1, v0
@@ -977,8 +977,8 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_legacy_u16 v5, v6, v5, v9
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
 ; GFX9-NODL-NEXT:    v_mad_legacy_u16 v2, v3, v2, v5
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
 ; GFX9-NODL-NEXT:    v_mad_legacy_u16 v0, v4, v0, v2
@@ -1003,8 +1003,8 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v5, v6, v5, v9
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v3, v2, v5
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v4, v0, v2
@@ -1118,6 +1118,7 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_movk_i32 s2, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -1126,7 +1127,6 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
 ; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
@@ -1293,9 +1293,9 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
+; GFX7-NEXT:    v_and_b32_e32 v5, s4, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v3, v3, v6, s5
-; GFX7-NEXT:    v_and_b32_e32 v5, s4, v0
 ; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v3
@@ -1315,6 +1315,7 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_movk_i32 s2, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -1323,7 +1324,6 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
 ; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
@@ -1331,9 +1331,9 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, s3
-; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
 ; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
 ; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
@@ -1515,11 +1515,11 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v4, v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    flat_load_ushort v10, v[2:3]
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
@@ -1527,6 +1527,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_and_b32_e32 v7, s2, v7
 ; GFX8-NEXT:    v_bfe_i32 v1, v4, 0, 8
 ; GFX8-NEXT:    v_and_b32_sdwa v9, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
 ; GFX8-NEXT:    v_and_b32_e32 v8, s2, v8
@@ -1535,7 +1536,6 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_mad_u16 v7, v7, v8, v10
 ; GFX8-NEXT:    v_and_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_mad_u16 v1, v1, v6, v7
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    v_mad_u16 v1, v9, v5, v1
 ; GFX8-NEXT:    v_mad_u16 v0, v4, v0, v1
@@ -1723,6 +1723,7 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_movk_i32 s2, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -1731,7 +1732,6 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
 ; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 8
@@ -1870,16 +1870,16 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
 ; GFX7-NEXT:    v_and_b32_e32 v3, s5, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_and_b32_e32 v5, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v6, s5, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
-; GFX7-NEXT:    v_and_b32_e32 v6, s5, v0
 ; GFX7-NEXT:    v_or_b32_e32 v3, v6, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX7-NEXT:    v_and_b32_e32 v1, s5, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, s5, v3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, v8
@@ -1905,17 +1905,17 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v4, v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    flat_load_ushort v10, v[2:3]
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_and_b32_sdwa v9, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 8, v4
+; GFX8-NEXT:    v_and_b32_sdwa v9, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v4, s2, v4
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
@@ -1944,23 +1944,23 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[2:3]
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v5, 8, v1
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT:    v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
-; GFX9-NODL-NEXT:    v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NODL-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v7, 8, v2
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
+; GFX9-NODL-NEXT:    v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NODL-NEXT:    v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NODL-NEXT:    v_lshl_or_b32 v2, v7, 16, v2
 ; GFX9-NODL-NEXT:    v_lshl_or_b32 v1, v5, 16, v1
 ; GFX9-NODL-NEXT:    v_and_b32_e32 v10, v4, v10
 ; GFX9-NODL-NEXT:    v_and_b32_e32 v4, v4, v9
 ; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_add_u16_e32 v3, v1, v3
 ; GFX9-NODL-NEXT:    v_lshl_or_b32 v5, v8, 16, v10
 ; GFX9-NODL-NEXT:    v_lshl_or_b32 v4, v6, 16, v4
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_add_u16_e32 v3, v1, v3
 ; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v2, v4, v5
 ; GFX9-NODL-NEXT:    v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v2
@@ -1982,23 +1982,23 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v5, 8, v1
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
-; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v7, 8, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
+; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-DL-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v7, 16, v2
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v5, 16, v1
 ; GFX9-DL-NEXT:    v_and_b32_e32 v10, v4, v10
 ; GFX9-DL-NEXT:    v_and_b32_e32 v4, v4, v9
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u16_e32 v3, v1, v3
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v5, v8, 16, v10
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v6, 16, v4
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_add_u16_e32 v3, v1, v3
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v4, v5
 ; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v2
@@ -2021,15 +2021,15 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-DL-NEXT:    v_lshrrev_b16 v5, 8, v1
-; GFX10-DL-NEXT:    v_and_b32_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b16 v6, 8, v2
 ; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_and_b32_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    v_and_b32_sdwa v9, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_and_b32_sdwa v10, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v8
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
+; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v8
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v7, v4, v10
 ; GFX10-DL-NEXT:    v_and_b32_e32 v4, v4, v9
@@ -2089,21 +2089,21 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    buffer_load_ubyte v8, off, s[0:3], 0
 ; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_and_b32_e32 v3, s4, v2
 ; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_and_b32_e32 v6, s4, v0
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v3, v3, v6, v8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_mad_u32_u24 v3, v4, v7, v3
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index d0cde94b098c0..df34354806cc5 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -19,10 +19,10 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GFX7-NEXT:    s_add_u32 s12, s12, s3
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    s_mov_b32 s10, 0
 ; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
@@ -87,6 +87,8 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_bfe_i32 v4, v3, 4, 4
 ; GFX8-NEXT:    v_bfe_i32 v6, v3, 8, 4
 ; GFX8-NEXT:    v_bfe_i32 v8, v3, 12, 4
+; GFX8-NEXT:    v_bfe_i32 v10, v3, 16, 4
+; GFX8-NEXT:    v_bfe_i32 v12, v3, 20, 4
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 4
 ; GFX8-NEXT:    v_bfe_i32 v5, v0, 4, 4
@@ -96,10 +98,8 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
 ; GFX8-NEXT:    v_bfe_i32 v9, v0, 12, 4
 ; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
-; GFX8-NEXT:    v_bfe_i32 v10, v3, 16, 4
 ; GFX8-NEXT:    v_bfe_i32 v11, v0, 16, 4
 ; GFX8-NEXT:    v_mad_i32_i24 v1, v8, v9, v1
-; GFX8-NEXT:    v_bfe_i32 v12, v3, 20, 4
 ; GFX8-NEXT:    v_bfe_i32 v13, v0, 20, 4
 ; GFX8-NEXT:    v_mad_i32_i24 v1, v10, v11, v1
 ; GFX8-NEXT:    v_bfe_i32 v14, v3, 24, 4
@@ -136,20 +136,20 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_bfe_i32 v4, v2, 0, 4
 ; GFX9-NEXT:    v_bfe_i32 v5, v1, 4, 4
 ; GFX9-NEXT:    v_bfe_i32 v6, v2, 4, 4
-; GFX9-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
 ; GFX9-NEXT:    v_bfe_i32 v7, v1, 8, 4
 ; GFX9-NEXT:    v_bfe_i32 v8, v2, 8, 4
-; GFX9-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
 ; GFX9-NEXT:    v_bfe_i32 v9, v1, 12, 4
 ; GFX9-NEXT:    v_bfe_i32 v10, v2, 12, 4
 ; GFX9-NEXT:    v_bfe_i32 v11, v1, 16, 4
 ; GFX9-NEXT:    v_bfe_i32 v12, v2, 16, 4
 ; GFX9-NEXT:    v_bfe_i32 v13, v1, 20, 4
-; GFX9-NEXT:    v_bfe_i32 v15, v1, 24, 4
 ; GFX9-NEXT:    v_bfe_i32 v14, v2, 20, 4
+; GFX9-NEXT:    v_bfe_i32 v15, v1, 24, 4
 ; GFX9-NEXT:    v_bfe_i32 v16, v2, 24, 4
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
+; GFX9-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
+; GFX9-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v7, v8
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v9, v10
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
@@ -175,11 +175,11 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_dot8_i32_i4 v0, v2, v3, s0
 ; GFX9-DL-NEXT:    global_store_dword v1, v0, s[2:3]
@@ -333,10 +333,10 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GFX7-NEXT:    s_add_u32 s12, s12, s3
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    s_mov_b32 s10, 0
 ; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
@@ -422,23 +422,23 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 28, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 20, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 4, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 20, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
-; GFX8-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
-; GFX8-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v16
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
+; GFX8-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
+; GFX8-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
@@ -453,18 +453,18 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
 ; GFX8-NEXT:    v_mad_u16 v4, v9, v14, v4
-; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
-; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v19
+; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
+; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
 ; GFX8-NEXT:    v_mad_u16 v4, v8, v13, v4
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
 ; GFX8-NEXT:    v_mad_u16 v4, v17, v18, v4
-; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
-; GFX8-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
+; GFX8-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX8-NEXT:    v_mad_u16 v4, v7, v12, v4
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
@@ -484,12 +484,12 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 12
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    global_load_ushort v3, v0, s[2:3]
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
@@ -504,14 +504,14 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
-; GFX9-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
-; GFX9-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v15
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
+; GFX9-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
+; GFX9-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
@@ -526,18 +526,18 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
 ; GFX9-NEXT:    v_mad_legacy_u16 v3, v8, v13, v3
-; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
-; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
+; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
+; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX9-NEXT:    v_mad_legacy_u16 v3, v7, v12, v3
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
 ; GFX9-NEXT:    v_mad_legacy_u16 v3, v16, v17, v3
-; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
-; GFX9-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
+; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
+; GFX9-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
 ; GFX9-NEXT:    v_mad_legacy_u16 v3, v6, v11, v3
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
@@ -557,12 +557,12 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 12
-; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
+; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
@@ -577,14 +577,14 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
 ; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v15
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
@@ -599,18 +599,18 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v13, v3
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v12, v3
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v16, v17, v3
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v11, v3
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
@@ -649,45 +649,45 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v3, v2, v1
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v7
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v8, v9, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v5
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v6, v7, v1
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
@@ -727,45 +727,45 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v3, v0, v1
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v7
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v8, v9, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v5
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
@@ -907,10 +907,10 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GFX7-NEXT:    s_add_u32 s12, s12, s3
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    s_mov_b32 s10, 0
 ; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
@@ -996,23 +996,23 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 28, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 20, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 4, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 20, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
-; GFX8-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
-; GFX8-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v16
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
+; GFX8-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
+; GFX8-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
@@ -1027,18 +1027,18 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
 ; GFX8-NEXT:    v_mad_u16 v4, v9, v14, v4
-; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
-; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v19
+; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
+; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
 ; GFX8-NEXT:    v_mad_u16 v4, v8, v13, v4
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
 ; GFX8-NEXT:    v_mad_u16 v4, v17, v18, v4
-; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
-; GFX8-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
+; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
+; GFX8-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX8-NEXT:    v_mad_u16 v4, v7, v12, v4
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
@@ -1058,12 +1058,12 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 12
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    global_load_ubyte v3, v0, s[2:3]
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
@@ -1078,14 +1078,14 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
-; GFX9-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
-; GFX9-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v15
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
+; GFX9-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
+; GFX9-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
@@ -1100,18 +1100,18 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
 ; GFX9-NEXT:    v_mad_legacy_u16 v3, v8, v13, v3
-; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
-; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
+; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
+; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX9-NEXT:    v_mad_legacy_u16 v3, v7, v12, v3
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
 ; GFX9-NEXT:    v_mad_legacy_u16 v3, v16, v17, v3
-; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
-; GFX9-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
+; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
+; GFX9-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
 ; GFX9-NEXT:    v_mad_legacy_u16 v3, v6, v11, v3
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
@@ -1131,12 +1131,12 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 12
-; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
+; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
@@ -1151,14 +1151,14 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
 ; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v15
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
@@ -1173,18 +1173,18 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v13, v3
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v12, v3
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v16, v17, v3
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v11, v3
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
@@ -1223,45 +1223,45 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v3, v2, v1
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v7
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v8, v9, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v5
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
 ; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v6, v7, v1
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
@@ -1301,45 +1301,45 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v3, v0, v1
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v7
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v8, v9, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v5
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
@@ -1482,10 +1482,10 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GFX7-NEXT:    s_add_u32 s12, s12, s3
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    s_mov_b32 s10, 0
 ; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
@@ -1552,6 +1552,8 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_bfe_i32 v4, v3, 4, 4
 ; GFX8-NEXT:    v_bfe_i32 v6, v3, 8, 4
 ; GFX8-NEXT:    v_bfe_i32 v8, v3, 12, 4
+; GFX8-NEXT:    v_bfe_i32 v10, v3, 16, 4
+; GFX8-NEXT:    v_bfe_i32 v12, v3, 20, 4
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1562,10 +1564,8 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
 ; GFX8-NEXT:    v_bfe_i32 v9, v0, 12, 4
 ; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
-; GFX8-NEXT:    v_bfe_i32 v10, v3, 16, 4
 ; GFX8-NEXT:    v_bfe_i32 v11, v0, 16, 4
 ; GFX8-NEXT:    v_mad_i32_i24 v1, v8, v9, v1
-; GFX8-NEXT:    v_bfe_i32 v12, v3, 20, 4
 ; GFX8-NEXT:    v_bfe_i32 v13, v0, 20, 4
 ; GFX8-NEXT:    v_mad_i32_i24 v1, v10, v11, v1
 ; GFX8-NEXT:    v_bfe_i32 v14, v3, 24, 4
@@ -1610,8 +1610,8 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_bfe_i32 v11, v1, 16, 4
 ; GFX9-NEXT:    v_bfe_i32 v12, v2, 16, 4
 ; GFX9-NEXT:    v_bfe_i32 v13, v1, 20, 4
-; GFX9-NEXT:    v_bfe_i32 v15, v1, 24, 4
 ; GFX9-NEXT:    v_bfe_i32 v14, v2, 20, 4
+; GFX9-NEXT:    v_bfe_i32 v15, v1, 24, 4
 ; GFX9-NEXT:    v_bfe_i32 v16, v2, 24, 4
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
@@ -1661,8 +1661,8 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    v_bfe_i32 v11, v1, 16, 4
 ; GFX9-DL-NEXT:    v_bfe_i32 v12, v2, 16, 4
 ; GFX9-DL-NEXT:    v_bfe_i32 v13, v1, 20, 4
-; GFX9-DL-NEXT:    v_bfe_i32 v15, v1, 24, 4
 ; GFX9-DL-NEXT:    v_bfe_i32 v14, v2, 20, 4
+; GFX9-DL-NEXT:    v_bfe_i32 v15, v1, 24, 4
 ; GFX9-DL-NEXT:    v_bfe_i32 v16, v2, 24, 4
 ; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
 ; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
@@ -1917,10 +1917,10 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GFX7-NEXT:    s_add_u32 s12, s12, s3
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    s_mov_b32 s10, 0
 ; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
@@ -2030,21 +2030,21 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 28, v1
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_ashrrev_i32_e32 v10, 28, v2
 ; GFX9-NEXT:    v_bfe_i32 v4, v1, 24, 4
-; GFX9-NEXT:    v_bfe_i32 v11, v2, 24, 4
 ; GFX9-NEXT:    v_bfe_i32 v5, v1, 20, 4
-; GFX9-NEXT:    v_bfe_i32 v12, v2, 20, 4
 ; GFX9-NEXT:    v_bfe_i32 v6, v1, 16, 4
-; GFX9-NEXT:    v_bfe_i32 v13, v2, 16, 4
 ; GFX9-NEXT:    v_bfe_i32 v7, v1, 12, 4
-; GFX9-NEXT:    v_bfe_i32 v14, v2, 12, 4
 ; GFX9-NEXT:    v_bfe_i32 v8, v1, 8, 4
 ; GFX9-NEXT:    v_bfe_i32 v9, v1, 4, 4
+; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_ashrrev_i32_e32 v10, 28, v2
+; GFX9-NEXT:    v_bfe_i32 v11, v2, 24, 4
+; GFX9-NEXT:    v_bfe_i32 v12, v2, 20, 4
+; GFX9-NEXT:    v_bfe_i32 v13, v2, 16, 4
+; GFX9-NEXT:    v_bfe_i32 v14, v2, 12, 4
 ; GFX9-NEXT:    v_bfe_i32 v15, v2, 8, 4
 ; GFX9-NEXT:    v_bfe_i32 v16, v2, 4, 4
-; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 4
 ; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 4
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v2, v9, v16
@@ -2073,11 +2073,11 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_dot8_i32_i4 v0, v2, v3, s0
 ; GFX9-DL-NEXT:    global_store_dword v1, v0, s[2:3]
@@ -2195,10 +2195,10 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GFX7-NEXT:    s_add_u32 s12, s12, s3
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    s_mov_b32 s10, 0
 ; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
@@ -2238,19 +2238,19 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 12, 4
 ; GFX7-NEXT:    v_or_b32_e32 v5, v6, v5
 ; GFX7-NEXT:    v_or_b32_e32 v6, v11, v10
+; GFX7-NEXT:    v_and_b32_e32 v3, v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v9, v2, v9
 ; GFX7-NEXT:    v_and_b32_e32 v12, v2, v14
 ; GFX7-NEXT:    v_and_b32_e32 v13, v2, v15
+; GFX7-NEXT:    v_and_b32_e32 v0, v2, v0
 ; GFX7-NEXT:    v_and_b32_e32 v14, v2, v16
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
-; GFX7-NEXT:    v_and_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_and_b32_e32 v9, v2, v9
-; GFX7-NEXT:    v_and_b32_e32 v0, v2, v0
 ; GFX7-NEXT:    v_and_b32_e32 v4, v2, v4
 ; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
-; GFX7-NEXT:    v_and_b32_e32 v6, v2, v6
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
+; GFX7-NEXT:    v_and_b32_e32 v6, v2, v6
 ; GFX7-NEXT:    v_and_b32_e32 v2, v2, v5
 ; GFX7-NEXT:    buffer_load_ushort v5, off, s[0:3], 0
 ; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
@@ -2297,21 +2297,21 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 20, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 28, v3
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 8, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 20, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 28, v2
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 12, v3
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v18, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 12, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 12, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
-; GFX8-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
+; GFX8-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
@@ -2328,16 +2328,16 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
 ; GFX8-NEXT:    v_mad_u16 v2, v7, v12, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
-; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v17
+; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX8-NEXT:    v_mad_u16 v2, v8, v13, v2
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
 ; GFX8-NEXT:    v_mad_u16 v2, v17, v5, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
-; GFX8-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v16
+; GFX8-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v18
 ; GFX8-NEXT:    v_mad_u16 v2, v9, v14, v2
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
@@ -2358,30 +2358,30 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v4, v0, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_and_b32_e32 v10, 15, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v17, 15, v4
 ; GFX9-NEXT:    v_bfe_u32 v0, v3, 24, 4
-; GFX9-NEXT:    v_bfe_u32 v6, v3, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v8, v3, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v13, v4, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v15, v4, 8, 4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
+; GFX9-NEXT:    v_bfe_u32 v6, v3, 16, 4
 ; GFX9-NEXT:    v_bfe_u32 v7, v3, 20, 4
+; GFX9-NEXT:    v_bfe_u32 v8, v3, 8, 4
 ; GFX9-NEXT:    v_bfe_u32 v9, v3, 12, 4
 ; GFX9-NEXT:    v_bfe_u32 v3, v3, 4, 4
-; GFX9-NEXT:    v_and_b32_e32 v10, v2, v10
 ; GFX9-NEXT:    v_bfe_u32 v11, v4, 24, 4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 28, v4
+; GFX9-NEXT:    v_bfe_u32 v13, v4, 16, 4
 ; GFX9-NEXT:    v_bfe_u32 v14, v4, 20, 4
+; GFX9-NEXT:    v_bfe_u32 v15, v4, 8, 4
 ; GFX9-NEXT:    v_bfe_u32 v16, v4, 12, 4
 ; GFX9-NEXT:    v_bfe_u32 v4, v4, 4, 4
+; GFX9-NEXT:    v_and_b32_e32 v10, v2, v10
 ; GFX9-NEXT:    v_and_b32_e32 v17, v2, v17
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v3, 16, v10
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v4, 16, v17
@@ -2394,14 +2394,14 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_and_b32_e32 v8, v2, v8
 ; GFX9-NEXT:    v_and_b32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_and_b32_e32 v15, v2, v15
-; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v8, v9, 16, v8
+; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v5, v16, 16, v15
 ; GFX9-NEXT:    v_and_b32_e32 v6, v2, v6
-; GFX9-NEXT:    v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_and_b32_e32 v13, v2, v13
 ; GFX9-NEXT:    v_and_b32_e32 v2, v2, v11
+; GFX9-NEXT:    v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
 ; GFX9-NEXT:    v_lshl_or_b32 v7, v14, 16, v13
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v12, 16, v2
@@ -2414,8 +2414,8 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v8, v5
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v6, v7
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -2441,30 +2441,30 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v3, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v4, v0, s[6:7]
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_and_b32_e32 v10, 15, v3
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_and_b32_e32 v17, 15, v4
 ; GFX9-DL-NEXT:    v_bfe_u32 v0, v3, 24, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v6, v3, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v8, v3, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v13, v4, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v15, v4, 8, 4
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
+; GFX9-DL-NEXT:    v_bfe_u32 v6, v3, 16, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v8, v3, 8, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v9, v3, 12, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v3, v3, 4, 4
-; GFX9-DL-NEXT:    v_and_b32_e32 v10, v2, v10
 ; GFX9-DL-NEXT:    v_bfe_u32 v11, v4, 24, 4
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 28, v4
+; GFX9-DL-NEXT:    v_bfe_u32 v13, v4, 16, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v14, v4, 20, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v15, v4, 8, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v16, v4, 12, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v4, v4, 4, 4
+; GFX9-DL-NEXT:    v_and_b32_e32 v10, v2, v10
 ; GFX9-DL-NEXT:    v_and_b32_e32 v17, v2, v17
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v3, v3, 16, v10
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v17
@@ -2477,14 +2477,14 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    v_and_b32_e32 v8, v2, v8
 ; GFX9-DL-NEXT:    v_and_b32_e32 v0, v2, v0
 ; GFX9-DL-NEXT:    v_and_b32_e32 v15, v2, v15
-; GFX9-DL-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v8, v9, 16, v8
+; GFX9-DL-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v5, v16, 16, v15
 ; GFX9-DL-NEXT:    v_and_b32_e32 v6, v2, v6
-; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_and_b32_e32 v13, v2, v13
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, v2, v11
+; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v7, v14, 16, v13
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v12, 16, v2
@@ -2497,8 +2497,8 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v8, v5
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v6, v7
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -2535,43 +2535,43 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v11, 15, v1
 ; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v13, 15, v2
-; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v7, v1, 16, 4
-; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v9, v1, 8, 4
 ; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v5, v1, 24, 4
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
+; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v7, v1, 16, 4
 ; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v8, v1, 20, 4
+; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v9, v1, 8, 4
 ; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v10, v1, 12, 4
 ; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v1, v1, 4, 4
-; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v11, v4, v11
 ; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v16, v2, 4, 4
+; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v11, v4, v11
 ; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v13, v4, v13
 ; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v18, v2, 8, 4
-; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v9, v4, v9
-; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v1, v1, 16, v11
-; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v7, v4, v7
-; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v11, v16, 16, v13
 ; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v12, v2, 24, 4
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 28, v2
-; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v1, v1, 16, v11
+; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v11, v16, 16, v13
 ; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v15, v2, 16, 4
 ; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v17, v2, 20, 4
 ; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v2, v2, 12, 4
+; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v9, v4, v9
+; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v7, v4, v7
 ; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v13, v4, v18
+; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v9, v10, 16, v9
 ; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v7, v8, 16, v7
-; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v2, v2, 16, v13
+; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1]
-; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v10, v4, v15
-; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v1, v1, v8
-; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v8, v17, 16, v10
-; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
 ; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v3
@@ -2623,43 +2623,43 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v11, 15, v1
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v13, 15, v0
-; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v7, v1, 16, 4
-; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v9, v1, 8, 4
 ; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v5, v1, 24, 4
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
+; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v7, v1, 16, 4
 ; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v8, v1, 20, 4
+; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v9, v1, 8, 4
 ; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v10, v1, 12, 4
 ; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v1, v1, 4, 4
-; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v11, v4, v11
 ; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v16, v0, 4, 4
+; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v11, v4, v11
 ; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v13, v4, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v18, v0, 8, 4
-; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v9, v4, v9
-; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v1, v1, 16, v11
-; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v7, v4, v7
-; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v11, v16, 16, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v12, v0, 24, 4
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 28, v0
-; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v1, v1, 16, v11
+; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v11, v16, 16, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v15, v0, 16, 4
 ; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v17, v0, 20, 4
 ; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v0, v0, 12, 4
+; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v9, v4, v9
+; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v7, v4, v7
 ; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v13, v4, v18
+; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v9, v10, 16, v9
 ; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v7, v8, 16, v7
-; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v0, v0, 16, v13
+; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1]
-; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v10, v4, v15
-; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v1, v1, v8
-; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v8, v17, 16, v10
-; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v1, v1, v3
@@ -2807,10 +2807,10 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GFX7-NEXT:    s_add_u32 s12, s12, s3
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    s_mov_b32 s10, 0
 ; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
@@ -2839,15 +2839,15 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_and_b32_e32 v9, v2, v13
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v16
 ; GFX7-NEXT:    buffer_load_ubyte v16, off, s[0:3], 0
+; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 28, v4
 ; GFX7-NEXT:    v_bfe_i32 v5, v4, 24, 4
 ; GFX7-NEXT:    v_bfe_i32 v10, v4, 4, 4
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 28, v4
 ; GFX7-NEXT:    v_bfe_i32 v4, v4, 0, 4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v1
 ; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
-; GFX7-NEXT:    v_ashrrev_i32_e32 v12, 28, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
 ; GFX7-NEXT:    v_and_b32_e32 v4, v2, v4
+; GFX7-NEXT:    v_ashrrev_i32_e32 v12, 28, v0
 ; GFX7-NEXT:    v_bfe_i32 v14, v0, 20, 4
 ; GFX7-NEXT:    v_bfe_i32 v15, v0, 16, 4
 ; GFX7-NEXT:    v_bfe_i32 v17, v0, 8, 4
@@ -2861,29 +2861,29 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_and_b32_e32 v14, v2, v17
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v18
 ; GFX7-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v15
-; GFX7-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX7-NEXT:    v_or_b32_e32 v9, v11, v10
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    v_and_b32_e32 v6, s5, v6
+; GFX7-NEXT:    v_or_b32_e32 v8, v9, v8
+; GFX7-NEXT:    v_or_b32_e32 v9, v11, v10
 ; GFX7-NEXT:    v_or_b32_e32 v10, v14, v13
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v4, s5, v4
 ; GFX7-NEXT:    v_or_b32_e32 v5, v6, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
 ; GFX7-NEXT:    v_and_b32_e32 v0, v3, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v4, s5, v4
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v8
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v7
 ; GFX7-NEXT:    v_and_b32_e32 v7, v3, v9
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v7, v6
 ; GFX7-NEXT:    v_and_b32_e32 v7, v2, v4
 ; GFX7-NEXT:    v_and_b32_e32 v13, v2, v0
 ; GFX7-NEXT:    v_bfe_u32 v8, v4, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX7-NEXT:    v_and_b32_e32 v12, v2, v12
@@ -2930,60 +2930,60 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_add_u32 s8, s8, s3
 ; GFX8-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 20, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 28, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 20, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 4, v3
+; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 4, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 4, v2
-; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v3
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v18, 12, v2
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v5, 12, v10
-; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
-; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v16
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
+; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 12, v6
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v15
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v15, 12, v18
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v19
+; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v19, 12, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 12, v11
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
-; GFX8-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
+; GFX8-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
-; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
-; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v14
-; GFX8-NEXT:    v_mul_lo_u16_e32 v10, v10, v15
-; GFX8-NEXT:    v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
+; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v14
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
-; GFX8-NEXT:    v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_mul_lo_u16_e32 v10, v10, v15
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v15, v16, v18
-; GFX8-NEXT:    v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_mul_lo_u16_e32 v8, v9, v11
-; GFX8-NEXT:    v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v14, v17, v19
+; GFX8-NEXT:    v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_mul_lo_u16_e32 v8, v9, v11
+; GFX8-NEXT:    v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v8, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
@@ -3016,67 +3016,67 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    global_load_ubyte v4, v3, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 12
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 20, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 20, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 20, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 12, v9
-; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
-; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v15
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
+; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v1
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 12, v5
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v14
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v17
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
+; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v18, 12, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 12, v10
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
-; GFX9-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
+; GFX9-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
-; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 12, v0
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v13
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 12, v0
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
+; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v13
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v19, v15, v17
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_mul_lo_u16_e32 v7, v8, v10
-; GFX9-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v13, v16, v18
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_e32 v7, v8, v10
+; GFX9-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v9, v9, v14
-; GFX9-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
@@ -3109,67 +3109,67 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    global_load_ubyte v4, v3, s[2:3]
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 12
+; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 20, v1
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 20, v1
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 20, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
 ; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
+; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v2, 12, v9
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v15
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v1
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v1, 12, v5
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v14
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v17
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v18, 12, v0
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v0, 12, v10
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v0, 12, v0
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v13
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v0, 12, v0
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v13
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v19, v15, v17
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, v8, v10
-; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v13, v16, v18
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, v8, v10
+; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v9, v9, v14
-; GFX9-DL-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX9-DL-NEXT:    v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
@@ -3231,10 +3231,10 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
-; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v8, v8, v15
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
+; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v8, v8, v15
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
@@ -3251,33 +3251,33 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
-; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
-; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v10, v10, v15
-; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
-; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v9, v0, v11
+; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v10, v10, v15
+; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v12, 12, v12
+; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v9, v0, v11
 ; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v11, v7, v14
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 8, v6
-; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v1, v1, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 8, v10
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
+; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v1, v1, v2
 ; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v2, v5, v12
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 8, v9
 ; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v6, v11, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v11, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 8, v11
 ; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v1, v3
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 8, v11
 ; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v9, v3, v10
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
@@ -3318,27 +3318,27 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 28, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 24, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v17, 4, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v18, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v16
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v15
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v3, 20, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
-; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v8, v8, v15
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
+; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v8, v8, v15
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v3, 12, v3
@@ -3347,38 +3347,38 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
-; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v0, v9, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v17
+; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v0, v9, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
-; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v11
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v3
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v3
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v11
+; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v10, v10, v15
 ; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v3, v3, v9
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v11, 12, v12
+; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v3, v3, v9
 ; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v9, v7, v14
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 8, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 8, v10
-; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v1, v1, v18
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
+; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v1, v1, v18
 ; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v12, v5, v11
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v3, 8, v3
 ; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v9, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v2, v1, v2
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
 ; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v9, v2, v9
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index d3bb2a4981de3..ed1914a804296 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -17,10 +17,10 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GFX7-NEXT:    s_add_u32 s12, s12, s3
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    s_mov_b32 s10, 0
 ; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
@@ -130,21 +130,21 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
 ; GFX9-NEXT:    v_bfe_u32 v4, v1, 24, 4
-; GFX9-NEXT:    v_bfe_u32 v11, v2, 24, 4
 ; GFX9-NEXT:    v_bfe_u32 v5, v1, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v12, v2, 20, 4
 ; GFX9-NEXT:    v_bfe_u32 v6, v1, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v13, v2, 16, 4
 ; GFX9-NEXT:    v_bfe_u32 v7, v1, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
 ; GFX9-NEXT:    v_bfe_u32 v8, v1, 8, 4
 ; GFX9-NEXT:    v_bfe_u32 v9, v1, 4, 4
+; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
+; GFX9-NEXT:    v_bfe_u32 v11, v2, 24, 4
+; GFX9-NEXT:    v_bfe_u32 v12, v2, 20, 4
+; GFX9-NEXT:    v_bfe_u32 v13, v2, 16, 4
+; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
 ; GFX9-NEXT:    v_bfe_u32 v15, v2, 8, 4
 ; GFX9-NEXT:    v_bfe_u32 v16, v2, 4, 4
-; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v9, v16
@@ -173,11 +173,11 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_dot8_u32_u4 v0, v2, v3, s0
 ; GFX9-DL-NEXT:    global_store_dword v1, v0, s[2:3]
@@ -296,8 +296,8 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    buffer_load_ushort v16, off, s[0:3], 0
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
@@ -342,12 +342,12 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v4, v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    flat_load_ushort v18, v[2:3]
 ; GFX8-NEXT:    s_mov_b32 s10, -1
 ; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
@@ -394,11 +394,11 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-NEXT:    global_load_ushort v17, v1, s[2:3]
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 28, v2
 ; GFX9-NEXT:    v_bfe_u32 v4, v2, 24, 4
@@ -440,11 +440,11 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-DL-NEXT:    global_load_ushort v17, v1, s[2:3]
+; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 28, v2
 ; GFX9-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
@@ -614,8 +614,8 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    buffer_load_ubyte v16, off, s[0:3], 0
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
@@ -660,12 +660,12 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v4, v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    flat_load_ubyte v18, v[2:3]
 ; GFX8-NEXT:    s_mov_b32 s10, -1
 ; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
@@ -712,11 +712,11 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-NEXT:    global_load_ubyte v17, v1, s[2:3]
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 28, v2
 ; GFX9-NEXT:    v_bfe_u32 v4, v2, 24, 4
@@ -758,11 +758,11 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-DL-NEXT:    global_load_ubyte v17, v1, s[2:3]
+; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 28, v2
 ; GFX9-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
@@ -932,8 +932,8 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    buffer_load_ubyte v16, off, s[0:3], 0
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
@@ -979,12 +979,12 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v4, v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    flat_load_ubyte v18, v[2:3]
 ; GFX8-NEXT:    s_mov_b32 s10, -1
 ; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
@@ -1032,11 +1032,11 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-NEXT:    global_load_ubyte v17, v1, s[2:3]
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 28, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
@@ -1079,11 +1079,11 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-DL-NEXT:    global_load_ubyte v17, v1, s[2:3]
+; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 28, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
@@ -1239,8 +1239,8 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    buffer_load_ubyte v16, off, s[0:3], 0
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
@@ -1286,12 +1286,12 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v4, v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    flat_load_ubyte v18, v[2:3]
 ; GFX8-NEXT:    s_mov_b32 s10, -1
 ; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
@@ -1339,11 +1339,11 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-NEXT:    global_load_ubyte v17, v1, s[2:3]
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 28, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
@@ -1386,11 +1386,11 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-DL-NEXT:    global_load_ubyte v17, v1, s[2:3]
+; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 28, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
@@ -1536,10 +1536,10 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GFX7-NEXT:    s_add_u32 s12, s12, s3
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    s_mov_b32 s10, 0
 ; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
@@ -1654,20 +1654,20 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_bfe_u32 v3, v1, 4, 4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
 ; GFX9-NEXT:    v_bfe_u32 v5, v1, 24, 4
-; GFX9-NEXT:    v_bfe_u32 v12, v2, 24, 4
 ; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
 ; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
 ; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
 ; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
+; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_bfe_u32 v10, v2, 4, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
+; GFX9-NEXT:    v_bfe_u32 v12, v2, 24, 4
+; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
+; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
 ; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
 ; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v10, v2, 4, 4
-; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v17, v1, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1705,20 +1705,20 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_bfe_u32 v3, v1, 4, 4
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
 ; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 24, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 24, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
+; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_bfe_u32 v10, v2, 4, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
+; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 24, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v10, v2, 4, 4
-; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v17, v1, v2
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1763,10 +1763,10 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 20, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 16, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 12, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v12, v2, 12, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 8, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 8, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 4, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 8, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v12, v2, 12, 4
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v13, v8, v9, s2
 ; GFX10-DL-NEXT:    v_bfe_u32 v14, v2, 20, 4
@@ -1872,10 +1872,10 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GFX7-NEXT:    s_add_u32 s12, s12, s3
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    s_mov_b32 s10, 0
 ; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
@@ -1985,21 +1985,21 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
 ; GFX9-NEXT:    v_bfe_u32 v4, v1, 24, 4
-; GFX9-NEXT:    v_bfe_u32 v11, v2, 24, 4
 ; GFX9-NEXT:    v_bfe_u32 v5, v1, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v12, v2, 20, 4
 ; GFX9-NEXT:    v_bfe_u32 v6, v1, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v13, v2, 16, 4
 ; GFX9-NEXT:    v_bfe_u32 v7, v1, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
 ; GFX9-NEXT:    v_bfe_u32 v8, v1, 8, 4
 ; GFX9-NEXT:    v_bfe_u32 v9, v1, 4, 4
+; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
+; GFX9-NEXT:    v_bfe_u32 v11, v2, 24, 4
+; GFX9-NEXT:    v_bfe_u32 v12, v2, 20, 4
+; GFX9-NEXT:    v_bfe_u32 v13, v2, 16, 4
+; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
 ; GFX9-NEXT:    v_bfe_u32 v15, v2, 8, 4
 ; GFX9-NEXT:    v_bfe_u32 v16, v2, 4, 4
-; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v9, v16
@@ -2028,11 +2028,11 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_dot8_u32_u4 v0, v2, v3, s0
 ; GFX9-DL-NEXT:    global_store_dword v1, v0, s[2:3]
@@ -2108,10 +2108,10 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GFX7-NEXT:    s_add_u32 s12, s12, s3
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    s_mov_b32 s10, 0
 ; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
@@ -2133,27 +2133,27 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_and_b32_e32 v7, s4, v8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 12, v0
+; GFX7-NEXT:    v_and_b32_e32 v13, 15, v0
 ; GFX7-NEXT:    v_or_b32_e32 v6, v6, v7
 ; GFX7-NEXT:    v_and_b32_e32 v7, s4, v8
-; GFX7-NEXT:    v_and_b32_e32 v13, 15, v0
 ; GFX7-NEXT:    v_or_b32_e32 v7, v13, v7
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
-; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v7
 ; GFX7-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v7
 ; GFX7-NEXT:    v_and_b32_e32 v7, 15, v7
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v6, v6, v7, v16
 ; GFX7-NEXT:    v_bfe_u32 v12, v0, 8, 4
-; GFX7-NEXT:    v_mad_u32_u24 v6, v8, v13, v6
 ; GFX7-NEXT:    v_bfe_u32 v14, v0, 20, 4
+; GFX7-NEXT:    v_mad_u32_u24 v6, v8, v13, v6
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
 ; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
 ; GFX7-NEXT:    v_bfe_u32 v11, v0, 12, 4
 ; GFX7-NEXT:    v_alignbit_b32 v0, v14, v0, 16
 ; GFX7-NEXT:    v_mad_u32_u24 v5, v5, v12, v6
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v4, v4, v11, v5
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v4
@@ -2175,12 +2175,12 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v4, v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    flat_load_ushort v18, v[2:3]
 ; GFX8-NEXT:    s_mov_b32 s10, -1
 ; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
@@ -2192,6 +2192,9 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_bfe_u32 v6, v4, 8, 4
 ; GFX8-NEXT:    v_bfe_u32 v7, v4, 12, 4
 ; GFX8-NEXT:    v_bfe_u32 v8, v4, 16, 4
+; GFX8-NEXT:    v_bfe_u32 v9, v4, 20, 4
+; GFX8-NEXT:    v_bfe_u32 v10, v4, 24, 4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 28, v4
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_and_b32_e32 v11, 15, v0
 ; GFX8-NEXT:    v_bfe_u32 v12, v0, 4, 4
@@ -2203,13 +2206,10 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_mad_u16 v1, v6, v13, v1
 ; GFX8-NEXT:    v_bfe_u32 v15, v0, 16, 4
 ; GFX8-NEXT:    v_mad_u16 v1, v7, v14, v1
-; GFX8-NEXT:    v_bfe_u32 v9, v4, 20, 4
 ; GFX8-NEXT:    v_bfe_u32 v16, v0, 20, 4
 ; GFX8-NEXT:    v_mad_u16 v1, v8, v15, v1
-; GFX8-NEXT:    v_bfe_u32 v10, v4, 24, 4
 ; GFX8-NEXT:    v_bfe_u32 v17, v0, 24, 4
 ; GFX8-NEXT:    v_mad_u16 v1, v9, v16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 28, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
 ; GFX8-NEXT:    v_mad_u16 v1, v10, v17, v1
 ; GFX8-NEXT:    v_mad_u16 v0, v4, v0, v1
@@ -2227,44 +2227,44 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v4, v0, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_bfe_u32 v0, v3, 24, 4
 ; GFX9-NEXT:    v_bfe_u32 v6, v3, 16, 4
 ; GFX9-NEXT:    v_bfe_u32 v8, v3, 8, 4
+; GFX9-NEXT:    v_and_b32_e32 v10, 15, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_bfe_u32 v11, v4, 24, 4
 ; GFX9-NEXT:    v_bfe_u32 v13, v4, 16, 4
 ; GFX9-NEXT:    v_bfe_u32 v15, v4, 8, 4
 ; GFX9-NEXT:    v_and_b32_e32 v17, 15, v4
-; GFX9-NEXT:    v_and_b32_e32 v10, 15, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
-; GFX9-NEXT:    v_and_b32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_bfe_u32 v7, v3, 20, 4
-; GFX9-NEXT:    v_and_b32_e32 v6, v2, v6
 ; GFX9-NEXT:    v_bfe_u32 v9, v3, 12, 4
-; GFX9-NEXT:    v_and_b32_e32 v8, v2, v8
 ; GFX9-NEXT:    v_bfe_u32 v3, v3, 4, 4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 28, v4
 ; GFX9-NEXT:    v_bfe_u32 v14, v4, 20, 4
 ; GFX9-NEXT:    v_bfe_u32 v16, v4, 12, 4
 ; GFX9-NEXT:    v_bfe_u32 v4, v4, 4, 4
-; GFX9-NEXT:    v_and_b32_e32 v17, v2, v17
 ; GFX9-NEXT:    v_and_b32_e32 v11, v2, v11
+; GFX9-NEXT:    v_and_b32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_and_b32_e32 v13, v2, v13
+; GFX9-NEXT:    v_and_b32_e32 v6, v2, v6
 ; GFX9-NEXT:    v_and_b32_e32 v15, v2, v15
+; GFX9-NEXT:    v_and_b32_e32 v8, v2, v8
+; GFX9-NEXT:    v_and_b32_e32 v17, v2, v17
 ; GFX9-NEXT:    v_and_b32_e32 v2, v2, v10
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v4, 16, v17
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v2, v4
 ; GFX9-NEXT:    global_load_ushort v4, v1, s[2:3]
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
-; GFX9-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
 ; GFX9-NEXT:    v_lshl_or_b32 v5, v14, 16, v13
+; GFX9-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
 ; GFX9-NEXT:    v_lshl_or_b32 v7, v16, 16, v15
 ; GFX9-NEXT:    v_lshl_or_b32 v8, v9, 16, v8
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v3, v6, v5
@@ -2294,44 +2294,44 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v3, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v4, v0, s[6:7]
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_bfe_u32 v0, v3, 24, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v6, v3, 16, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v8, v3, 8, 4
+; GFX9-DL-NEXT:    v_and_b32_e32 v10, 15, v3
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_bfe_u32 v11, v4, 24, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v13, v4, 16, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v15, v4, 8, 4
 ; GFX9-DL-NEXT:    v_and_b32_e32 v17, 15, v4
-; GFX9-DL-NEXT:    v_and_b32_e32 v10, 15, v3
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
-; GFX9-DL-NEXT:    v_and_b32_e32 v0, v2, v0
 ; GFX9-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
-; GFX9-DL-NEXT:    v_and_b32_e32 v6, v2, v6
 ; GFX9-DL-NEXT:    v_bfe_u32 v9, v3, 12, 4
-; GFX9-DL-NEXT:    v_and_b32_e32 v8, v2, v8
 ; GFX9-DL-NEXT:    v_bfe_u32 v3, v3, 4, 4
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 28, v4
 ; GFX9-DL-NEXT:    v_bfe_u32 v14, v4, 20, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v16, v4, 12, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v4, v4, 4, 4
-; GFX9-DL-NEXT:    v_and_b32_e32 v17, v2, v17
 ; GFX9-DL-NEXT:    v_and_b32_e32 v11, v2, v11
+; GFX9-DL-NEXT:    v_and_b32_e32 v0, v2, v0
 ; GFX9-DL-NEXT:    v_and_b32_e32 v13, v2, v13
+; GFX9-DL-NEXT:    v_and_b32_e32 v6, v2, v6
 ; GFX9-DL-NEXT:    v_and_b32_e32 v15, v2, v15
+; GFX9-DL-NEXT:    v_and_b32_e32 v8, v2, v8
+; GFX9-DL-NEXT:    v_and_b32_e32 v17, v2, v17
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, v2, v10
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v17
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v4
 ; GFX9-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
-; GFX9-DL-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v5, v14, 16, v13
+; GFX9-DL-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v7, v16, 16, v15
 ; GFX9-DL-NEXT:    v_lshl_or_b32 v8, v9, 16, v8
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, v6, v5
@@ -2395,17 +2395,17 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_add_nc_u16 v3, v6, v3
 ; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 20, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, v4, v7
 ; GFX10-DL-NEXT:    v_and_b32_e32 v11, v4, v11
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v9, v9, v10
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
 ; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v12
-; GFX10-DL-NEXT:    v_and_b32_e32 v7, v4, v7
 ; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 24, 4
-; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v1, 16, v11
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v9
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
+; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v1, 16, v11
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v9
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v9
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v9, v4, v10
 ; GFX10-DL-NEXT:    v_and_b32_e32 v4, v4, v5
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v6
@@ -2469,10 +2469,10 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GFX7-NEXT:    s_add_u32 s12, s12, s3
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    s_mov_b32 s10, 0
 ; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
@@ -2487,43 +2487,43 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 28, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 4, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 4, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 12, v2
 ; GFX7-NEXT:    v_bfe_u32 v1, v2, 8, 4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 4, v2
 ; GFX7-NEXT:    v_and_b32_e32 v5, 15, v2
 ; GFX7-NEXT:    v_bfe_u32 v7, v2, 16, 4
-; GFX7-NEXT:    v_alignbit_b32 v2, v6, v2, 24
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 12, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 4, v0
+; GFX7-NEXT:    v_alignbit_b32 v2, v6, v2, 24
+; GFX7-NEXT:    v_and_b32_e32 v6, s4, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 4, v0
+; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
 ; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX7-NEXT:    v_and_b32_e32 v6, v3, v9
-; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 4
 ; GFX7-NEXT:    v_and_b32_e32 v3, v3, v11
+; GFX7-NEXT:    v_and_b32_e32 v12, 15, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v10, v3
-; GFX7-NEXT:    v_and_b32_e32 v12, 15, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 28, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v6, v12, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, s5, v2
-; GFX7-NEXT:    v_or_b32_e32 v3, v6, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
 ; GFX7-NEXT:    v_bfe_u32 v14, v0, 16, 4
-; GFX7-NEXT:    v_alignbit_b32 v0, v13, v0, 24
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
 ; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
+; GFX7-NEXT:    v_and_b32_e32 v2, s5, v2
+; GFX7-NEXT:    v_alignbit_b32 v0, v13, v0, 24
 ; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v15
-; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX7-NEXT:    v_or_b32_e32 v3, v6, v3
 ; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX7-NEXT:    v_and_b32_e32 v4, s4, v15
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 15, v1
 ; GFX7-NEXT:    v_and_b32_e32 v12, 15, v3
-; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_or_b32_e32 v4, v14, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 4
 ; GFX7-NEXT:    v_bfe_u32 v13, v3, 8, 4
@@ -2531,8 +2531,8 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_mad_u32_u24 v6, v6, v12, v16
 ; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 24, v3
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 24, v3
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 4
 ; GFX7-NEXT:    v_mad_u32_u24 v6, v7, v13, v6
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, v6
@@ -2543,8 +2543,8 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v14, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 4
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v15, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
@@ -2577,23 +2577,23 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
 ; GFX8-NEXT:    v_bfe_u32 v3, v4, 20, 4
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_bfe_u32 v13, v2, 20, 4
 ; GFX8-NEXT:    v_bfe_u32 v7, v4, 24, 4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 28, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 28, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_u32 v13, v2, 20, 4
 ; GFX8-NEXT:    v_bfe_u32 v14, v2, 24, 4
-; GFX8-NEXT:    v_mul_lo_u16_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 28, v2
 ; GFX8-NEXT:    v_bfe_u32 v6, v4, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v12, v2, 16, 4
 ; GFX8-NEXT:    v_bfe_u32 v9, v4, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v16, v2, 8, 4
 ; GFX8-NEXT:    v_bfe_u32 v10, v4, 12, 4
 ; GFX8-NEXT:    v_and_b32_e32 v11, 15, v4
+; GFX8-NEXT:    v_bfe_u32 v4, v4, 4, 4
+; GFX8-NEXT:    v_bfe_u32 v12, v2, 16, 4
+; GFX8-NEXT:    v_bfe_u32 v16, v2, 8, 4
 ; GFX8-NEXT:    v_bfe_u32 v17, v2, 12, 4
 ; GFX8-NEXT:    v_and_b32_e32 v18, 15, v2
-; GFX8-NEXT:    v_bfe_u32 v4, v4, 4, 4
 ; GFX8-NEXT:    v_bfe_u32 v2, v2, 4, 4
+; GFX8-NEXT:    v_mul_lo_u16_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v13, v7, v14
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v19, v6, v12
@@ -2602,12 +2602,12 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v11, v11, v18
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v4, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v8, v13, v8
+; GFX8-NEXT:    v_or_b32_e32 v3, v19, v3
 ; GFX8-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX8-NEXT:    v_or_b32_e32 v10, v11, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
-; GFX8-NEXT:    v_or_b32_e32 v3, v19, v3
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v9
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 24, v[2:3]
@@ -2636,44 +2636,44 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    global_load_ubyte v4, v3, s[2:3]
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_bfe_u32 v0, v1, 20, 4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_bfe_u32 v12, v2, 20, 4
 ; GFX9-NEXT:    v_bfe_u32 v6, v1, 24, 4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 28, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 28, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_bfe_u32 v12, v2, 20, 4
 ; GFX9-NEXT:    v_bfe_u32 v13, v2, 24, 4
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 28, v2
 ; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v11, v2, 16, 4
 ; GFX9-NEXT:    v_bfe_u32 v8, v1, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v15, v2, 8, 4
 ; GFX9-NEXT:    v_bfe_u32 v9, v1, 12, 4
 ; GFX9-NEXT:    v_and_b32_e32 v10, 15, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v1, 4, 4
+; GFX9-NEXT:    v_bfe_u32 v11, v2, 16, 4
+; GFX9-NEXT:    v_bfe_u32 v15, v2, 8, 4
 ; GFX9-NEXT:    v_bfe_u32 v16, v2, 12, 4
 ; GFX9-NEXT:    v_and_b32_e32 v17, 15, v2
-; GFX9-NEXT:    v_bfe_u32 v1, v1, 4, 4
 ; GFX9-NEXT:    v_bfe_u32 v2, v2, 4, 4
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v12, v6, v13
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v18, v5, v11
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v8, v8, v15
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v10, v10, v17
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v7, v12, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX9-NEXT:    v_or_b32_e32 v1, v18, v0
+; GFX9-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX9-NEXT:    v_or_b32_e32 v9, v10, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
@@ -2702,44 +2702,44 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    global_load_ubyte v4, v3, s[2:3]
+; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-DL-NEXT:    v_bfe_u32 v0, v1, 20, 4
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 20, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 24, 4
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 28, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 28, v2
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 20, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 24, 4
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 28, v2
 ; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v11, v2, 16, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 8, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 12, 4
 ; GFX9-DL-NEXT:    v_and_b32_e32 v10, 15, v1
+; GFX9-DL-NEXT:    v_bfe_u32 v1, v1, 4, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v11, v2, 16, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 8, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 12, 4
 ; GFX9-DL-NEXT:    v_and_b32_e32 v17, 15, v2
-; GFX9-DL-NEXT:    v_bfe_u32 v1, v1, 4, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v2, v2, 4, 4
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v12, v6, v13
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v18, v5, v11
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v8, v8, v15
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v10, v10, v17
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_or_b32_e32 v7, v12, v7
-; GFX9-DL-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX9-DL-NEXT:    v_or_b32_e32 v1, v18, v0
+; GFX9-DL-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX9-DL-NEXT:    v_or_b32_e32 v9, v10, v2
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
-; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
+; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_or_b32_e32 v2, v2, v0
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
 ; GFX9-DL-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
@@ -2784,16 +2784,16 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v14, 28, v2
 ; GFX10-DL-NEXT:    v_mul_lo_u16 v9, v9, v10
 ; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 16, 4
-; GFX10-DL-NEXT:    v_mul_lo_u16 v8, v8, v13
 ; GFX10-DL-NEXT:    v_bfe_u32 v0, v1, 20, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 24, 4
 ; GFX10-DL-NEXT:    v_and_b32_e32 v11, 15, v1
-; GFX10-DL-NEXT:    v_lshlrev_b16 v9, 8, v9
 ; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 4, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v15, v2, 4, 4
-; GFX10-DL-NEXT:    v_mul_lo_u16 v7, v7, v14
+; GFX10-DL-NEXT:    v_mul_lo_u16 v8, v8, v13
+; GFX10-DL-NEXT:    v_lshlrev_b16 v9, 8, v9
 ; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 20, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v13, v2, 24, 4
+; GFX10-DL-NEXT:    v_mul_lo_u16 v7, v7, v14
 ; GFX10-DL-NEXT:    v_bfe_u32 v12, v2, 16, 4
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-DL-NEXT:    v_mul_lo_u16 v1, v1, v15
@@ -2801,12 +2801,12 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mul_lo_u16 v9, v0, v10
 ; GFX10-DL-NEXT:    v_mul_lo_u16 v10, v6, v13
 ; GFX10-DL-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX10-DL-NEXT:    v_mul_lo_u16 v2, v11, v2
 ; GFX10-DL-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
+; GFX10-DL-NEXT:    v_mul_lo_u16 v2, v11, v2
 ; GFX10-DL-NEXT:    v_mul_lo_u16 v11, v5, v12
-; GFX10-DL-NEXT:    v_or_b32_e32 v7, v10, v7
 ; GFX10-DL-NEXT:    v_lshlrev_b16 v9, 8, v9
+; GFX10-DL-NEXT:    v_or_b32_e32 v7, v10, v7
 ; GFX10-DL-NEXT:    v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX10-DL-NEXT:    v_or_b32_e32 v2, v11, v9
@@ -2883,8 +2883,8 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    buffer_load_ubyte v16, off, s[0:3], 0
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
@@ -2930,8 +2930,8 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v4, v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
@@ -2945,18 +2945,18 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_bfe_u32 v5, v4, 4, 4
 ; GFX8-NEXT:    v_bfe_u32 v6, v4, 8, 4
 ; GFX8-NEXT:    v_bfe_u32 v7, v4, 12, 4
+; GFX8-NEXT:    v_bfe_u32 v8, v4, 16, 4
+; GFX8-NEXT:    v_bfe_u32 v9, v4, 20, 4
+; GFX8-NEXT:    v_bfe_u32 v10, v4, 24, 4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 28, v4
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v11, 15, v0
 ; GFX8-NEXT:    v_bfe_u32 v12, v0, 4, 4
 ; GFX8-NEXT:    v_bfe_u32 v13, v0, 8, 4
 ; GFX8-NEXT:    v_bfe_u32 v14, v0, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v8, v4, 16, 4
 ; GFX8-NEXT:    v_bfe_u32 v15, v0, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v9, v4, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v10, v4, 24, 4
 ; GFX8-NEXT:    v_bfe_u32 v16, v0, 20, 4
 ; GFX8-NEXT:    v_bfe_u32 v17, v0, 24, 4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 28, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v0, v4, v0
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v4, v10, v17
@@ -2991,27 +2991,27 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 15, v2
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v10, 15, v3
 ; GFX9-NEXT:    v_bfe_u32 v4, v2, 4, 4
-; GFX9-NEXT:    v_bfe_u32 v11, v3, 4, 4
 ; GFX9-NEXT:    v_bfe_u32 v5, v2, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v12, v3, 8, 4
 ; GFX9-NEXT:    v_bfe_u32 v6, v2, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v13, v3, 12, 4
 ; GFX9-NEXT:    v_bfe_u32 v7, v2, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v14, v3, 16, 4
 ; GFX9-NEXT:    v_bfe_u32 v8, v2, 20, 4
 ; GFX9-NEXT:    v_bfe_u32 v9, v2, 24, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v10, 15, v3
+; GFX9-NEXT:    v_bfe_u32 v11, v3, 4, 4
+; GFX9-NEXT:    v_bfe_u32 v12, v3, 8, 4
+; GFX9-NEXT:    v_bfe_u32 v13, v3, 12, 4
+; GFX9-NEXT:    v_bfe_u32 v14, v3, 16, 4
 ; GFX9-NEXT:    v_bfe_u32 v15, v3, 20, 4
 ; GFX9-NEXT:    v_bfe_u32 v16, v3, 24, 4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v2, v3
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v9, v16
@@ -3046,27 +3046,27 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_and_b32_e32 v0, 15, v2
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_and_b32_e32 v10, 15, v3
 ; GFX9-DL-NEXT:    v_bfe_u32 v4, v2, 4, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v11, v3, 4, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v5, v2, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v12, v3, 8, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v13, v3, 12, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v7, v2, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v14, v3, 16, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v8, v2, 20, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v9, v2, 24, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_and_b32_e32 v10, 15, v3
+; GFX9-DL-NEXT:    v_bfe_u32 v11, v3, 4, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v12, v3, 8, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v13, v3, 12, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v14, v3, 16, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v15, v3, 20, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v16, v3, 24, 4
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
 ; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, v2, v3
 ; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v3, v9, v16
@@ -3232,6 +3232,7 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -3240,22 +3241,21 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_and_b32_e32 v1, 15, v3
 ; GFX8-NEXT:    v_bfe_u32 v4, v3, 4, 4
 ; GFX8-NEXT:    v_bfe_u32 v6, v3, 8, 4
 ; GFX8-NEXT:    v_bfe_u32 v8, v3, 12, 4
+; GFX8-NEXT:    v_bfe_u32 v10, v3, 16, 4
+; GFX8-NEXT:    v_bfe_u32 v12, v3, 20, 4
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v0
 ; GFX8-NEXT:    v_bfe_u32 v5, v0, 4, 4
 ; GFX8-NEXT:    v_bfe_u32 v7, v0, 8, 4
 ; GFX8-NEXT:    v_bfe_u32 v9, v0, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v10, v3, 16, 4
 ; GFX8-NEXT:    v_bfe_u32 v11, v0, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v12, v3, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v14, v3, 24, 4
 ; GFX8-NEXT:    v_bfe_u32 v13, v0, 20, 4
+; GFX8-NEXT:    v_bfe_u32 v14, v3, 24, 4
 ; GFX8-NEXT:    v_bfe_u32 v15, v0, 24, 4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
@@ -3296,8 +3296,8 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
 ; GFX9-NEXT:    v_bfe_u32 v11, v1, 16, 4
 ; GFX9-NEXT:    v_bfe_u32 v12, v2, 16, 4
 ; GFX9-NEXT:    v_bfe_u32 v13, v1, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v15, v1, 24, 4
 ; GFX9-NEXT:    v_bfe_u32 v14, v2, 20, 4
+; GFX9-NEXT:    v_bfe_u32 v15, v1, 24, 4
 ; GFX9-NEXT:    v_bfe_u32 v16, v2, 24, 4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 28, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 62278db9952ba..0c37f43f6a76a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -454,8 +454,8 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)*
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 1
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v3, s8
@@ -478,8 +478,8 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)*
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_lg_u32 s4, 1
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v3, s8
@@ -507,12 +507,12 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)*
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 2
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s10
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 1
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v4, s8
@@ -535,12 +535,12 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)*
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_lg_u32 s4, 2
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s10
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_lg_u32 s4, 1
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v4, s8
@@ -568,28 +568,28 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)*
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 2
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v4, v0, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 1
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v0, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v0, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 7
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v5, s15
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 6
 ; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v5, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v5, s14
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 5
 ; SI-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v5, s13
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 4
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc
 ; SI-NEXT:    v_mov_b32_e32 v8, s12
@@ -613,28 +613,28 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)*
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_lg_u32 s4, 2
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v0, vcc
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s10
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_lg_u32 s4, 1
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v0, vcc
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s9
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v0, vcc
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_lg_u32 s4, 7
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s15
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_lg_u32 s4, 6
 ; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v5, vcc
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s14
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_lg_u32 s4, 5
 ; VI-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s13
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_lg_u32 s4, 4
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc
 ; VI-NEXT:    v_mov_b32_e32 v8, s12
@@ -774,8 +774,8 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 1
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
@@ -820,16 +820,16 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s11
 ; SI-NEXT:    s_cmp_eq_u32 s6, 3
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_eq_u32 s6, 2
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_eq_u32 s6, 1
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_eq_u32 s6, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
@@ -880,28 +880,28 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 2
 ; SI-NEXT:    v_cndmask_b32_e32 v3, 5, v0, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 1
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 7
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s15
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 6
 ; SI-NEXT:    v_cndmask_b32_e32 v7, 5, v4, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s14
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 5
 ; SI-NEXT:    v_cndmask_b32_e32 v6, 5, v4, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s13
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 4
 ; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v4, s12
@@ -1164,8 +1164,8 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %ou
 ; SI-NEXT:    s_andn2_b32 s5, s6, s4
 ; SI-NEXT:    s_and_b32 s4, s4, 0x5050505
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s5
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:2
@@ -1184,8 +1184,8 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %ou
 ; VI-NEXT:    s_andn2_b32 s5, s6, s4
 ; VI-NEXT:    s_and_b32 s4, s4, 0x5050505
 ; VI-NEXT:    s_or_b32 s4, s4, s5
-; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    s_lshr_b32 s5, s4, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s5
 ; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:2
@@ -1306,28 +1306,28 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_lshr_b32 s6, s11, 8
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
 ; SI-NEXT:    s_movk_i32 s5, 0xff
-; SI-NEXT:    s_cmp_lg_u32 s4, 13
+; SI-NEXT:    s_lshr_b32 s6, s11, 8
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; SI-NEXT:    v_and_b32_e32 v1, s5, v1
+; SI-NEXT:    s_cmp_lg_u32 s4, 13
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s6
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 12
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; SI-NEXT:    v_and_b32_e32 v2, s5, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    s_mov_b32 s6, 0xffff
 ; SI-NEXT:    s_lshr_b32 s7, s10, 24
-; SI-NEXT:    s_cmp_lg_u32 s4, 11
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_and_b32_e32 v1, s6, v1
+; SI-NEXT:    s_cmp_lg_u32 s4, 11
 ; SI-NEXT:    v_or_b32_e32 v3, v1, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
@@ -1338,24 +1338,24 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
 ; SI-NEXT:    s_lshr_b32 s7, s10, 8
-; SI-NEXT:    s_cmp_lg_u32 s4, 9
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; SI-NEXT:    v_and_b32_e32 v1, s5, v1
+; SI-NEXT:    s_cmp_lg_u32 s4, 9
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 8
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; SI-NEXT:    v_and_b32_e32 v2, s5, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    s_lshr_b32 s7, s9, 24
-; SI-NEXT:    s_cmp_lg_u32 s4, 7
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_and_b32_e32 v1, s6, v1
+; SI-NEXT:    s_cmp_lg_u32 s4, 7
 ; SI-NEXT:    v_or_b32_e32 v2, v1, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
@@ -1366,24 +1366,24 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
 ; SI-NEXT:    s_lshr_b32 s7, s9, 8
-; SI-NEXT:    s_cmp_lg_u32 s4, 5
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; SI-NEXT:    v_and_b32_e32 v1, s5, v1
+; SI-NEXT:    s_cmp_lg_u32 s4, 5
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 4
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s9
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; SI-NEXT:    v_and_b32_e32 v4, s5, v4
 ; SI-NEXT:    v_or_b32_e32 v1, v4, v1
 ; SI-NEXT:    s_lshr_b32 s7, s8, 24
-; SI-NEXT:    s_cmp_lg_u32 s4, 3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_and_b32_e32 v1, s6, v1
+; SI-NEXT:    s_cmp_lg_u32 s4, 3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
@@ -1394,12 +1394,12 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
 ; SI-NEXT:    s_lshr_b32 s7, s8, 8
-; SI-NEXT:    s_cmp_lg_u32 s4, 1
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; SI-NEXT:    v_and_b32_e32 v4, s5, v4
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
 ; SI-NEXT:    v_or_b32_e32 v0, v4, v0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s7
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v5, s8
@@ -1436,15 +1436,15 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
 ; VI-NEXT:    s_cmp_lg_u32 s4, 13
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_lg_u32 s4, 12
 ; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s11
-; VI-NEXT:    s_lshr_b32 s5, s10, 24
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
+; VI-NEXT:    s_lshr_b32 s5, s10, 24
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_cmp_lg_u32 s4, 11
 ; VI-NEXT:    v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -1460,15 +1460,15 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
 ; VI-NEXT:    s_cmp_lg_u32 s4, 9
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_lg_u32 s4, 8
 ; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s10
-; VI-NEXT:    s_lshr_b32 s5, s9, 24
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
+; VI-NEXT:    s_lshr_b32 s5, s9, 24
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_cmp_lg_u32 s4, 7
 ; VI-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -1484,15 +1484,15 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
 ; VI-NEXT:    s_cmp_lg_u32 s4, 5
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_lg_u32 s4, 4
 ; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v4, s9
-; VI-NEXT:    s_lshr_b32 s5, s8, 24
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
+; VI-NEXT:    s_lshr_b32 s5, s8, 24
 ; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_cmp_lg_u32 s4, 3
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -1508,8 +1508,8 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
 ; VI-NEXT:    s_cmp_lg_u32 s4, 1
 ; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v4, s5
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
 ; VI-NEXT:    v_mov_b32_e32 v5, s8
@@ -1660,8 +1660,8 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s6, 1
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s11
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v0, s10
 ; SI-NEXT:    s_cmp_eq_u32 s6, 0
@@ -1715,8 +1715,8 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %
 ; SI-NEXT:    v_mov_b32_e32 v0, s10
 ; SI-NEXT:    s_cmp_eq_u32 s12, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    s_cmp_eq_u32 s12, 2
@@ -1775,14 +1775,14 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)
 ; SI-NEXT:    v_mov_b32_e32 v0, s10
 ; SI-NEXT:    s_cmp_eq_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    s_cmp_eq_u32 s4, 3
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v5, s15
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v5, s14
 ; SI-NEXT:    s_cmp_eq_u32 s4, 2

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 3b545d2d5faa1..50b75095b0567 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -583,8 +583,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    s_lshr_b32 s0, s4, 16
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -603,8 +603,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dword v3, v[0:1]
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
 ; CI-NEXT:    s_lshr_b32 s0, s4, 16
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -706,9 +706,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, 0x3e70000
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, 0x3e70000
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -764,9 +764,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, 0xfff10000
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, 0xfff10000
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -940,9 +940,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, 0x45000000
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, 0x45000000
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -998,9 +998,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspac
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, 0x230000
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, 0x230000
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -1190,8 +1190,8 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
@@ -1199,8 +1199,8 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
 ; VI-NEXT:    flat_load_dword v2, v[2:3]
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    s_mov_b32 s2, 0xffff
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
 ; VI-NEXT:    s_mov_b32 s0, 0x12341234
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -1225,8 +1225,8 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_load_dword v2, v[2:3]
 ; CI-NEXT:    flat_load_dword v3, v[0:1]
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v4
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v4
 ; CI-NEXT:    s_mov_b32 s0, 0x12341234
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(1)
@@ -1272,8 +1272,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    s_mov_b32 s0, 0xffff
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1292,8 +1292,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
 ; CI-NEXT:    s_mov_b32 s0, 0xffff
 ; CI-NEXT:    v_mov_b32_e32 v4, s4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1357,8 +1357,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
 ; CI-NEXT:    s_lshl_b32 s0, s4, 16
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1402,8 +1402,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    s_mov_b32 s0, 0xffff
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1422,8 +1422,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
 ; CI-NEXT:    s_mov_b32 s0, 0xffff
 ; CI-NEXT:    v_mov_b32_e32 v4, s4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1487,8 +1487,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
 ; CI-NEXT:    s_lshl_b32 s0, s4, 16
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1532,8 +1532,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    s_mov_b32 s0, 0xffff
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1552,8 +1552,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
 ; CI-NEXT:    s_mov_b32 s0, 0xffff
 ; CI-NEXT:    v_mov_b32_e32 v4, s4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1668,9 +1668,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s6, s6
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s4, s7, 4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s6, s6
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s5

diff  --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index c7c67e2207970..e6fa62056d79d 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -1035,8 +1035,8 @@ define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3
 ; VI-NEXT:    s_add_u32 s4, s2, 4
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_mov_b32_e32 v5, s0
@@ -1721,8 +1721,8 @@ define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x
 ; VI-NEXT:    s_addc_u32 s1, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_byte v[2:3], v4
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    flat_store_byte v[2:3], v4
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -2198,8 +2198,8 @@ define amdgpu_kernel void @v5f32_arg(<5 x float> addrspace(1)* nocapture %out, <
 ; VI-NEXT:    s_addc_u32 s1, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s1
 ; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    flat_store_dword v[1:2], v3
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    flat_store_dword v[1:2], v3
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
@@ -2308,12 +2308,12 @@ define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5
 ; VI-NEXT:    v_mov_b32_e32 v5, s9
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    v_mov_b32_e32 v5, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_mov_b32_e32 v4, s2
+; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
@@ -2451,12 +2451,12 @@ define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out,
 ; VI-NEXT:    v_mov_b32_e32 v5, s9
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    v_mov_b32_e32 v5, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_mov_b32_e32 v4, s2
+; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
@@ -5873,10 +5873,10 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    s_add_u32 s2, s0, 42
-; VI-NEXT:    s_addc_u32 s3, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v5, s3
+; VI-NEXT:    s_addc_u32 s3, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v5, s3
 ; VI-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-NEXT:    flat_load_ushort v0, v[0:1]
 ; VI-NEXT:    flat_load_ushort v1, v[2:3]

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
index 9dbef047293c8..7ddcfeb05d775 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
@@ -616,15 +616,15 @@ define amdgpu_kernel void @lds_ds_fmin_f64(double addrspace(5)* %out, double add
 ; SI-NEXT:    s_mov_b32 s0, 0
 ; SI-NEXT:    s_mov_b32 s1, 0x40450000
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v4, s5
 ; SI-NEXT:    ds_min_f64 v4, v[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    s_add_i32 s0, s2, 4
 ; SI-NEXT:    s_waitcnt lgkmcnt(1)
 ; SI-NEXT:    ds_min_rtn_f64 v[0:1], v0, v[2:3]
+; SI-NEXT:    s_add_i32 s0, s2, 4
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dword v1, v2, s[8:11], 0 offen
@@ -656,9 +656,9 @@ define amdgpu_kernel void @lds_ds_fmin_f64(double addrspace(5)* %out, double add
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX7-NEXT:    ds_min_f64 v4, v[0:1] offset:64
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s5
-; GFX7-NEXT:    s_add_i32 s0, s4, 4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    ds_min_rtn_f64 v[0:1], v0, v[2:3]
+; GFX7-NEXT:    s_add_i32 s0, s4, 4
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -689,9 +689,9 @@ define amdgpu_kernel void @lds_ds_fmin_f64(double addrspace(5)* %out, double add
 ; VI-NEXT:    v_mov_b32_e32 v4, s1
 ; VI-NEXT:    ds_min_f64 v4, v[0:1] offset:64
 ; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    s_add_i32 s0, s4, 4
 ; VI-NEXT:    s_waitcnt lgkmcnt(1)
 ; VI-NEXT:    ds_min_rtn_f64 v[0:1], v0, v[2:3]
+; VI-NEXT:    s_add_i32 s0, s4, 4
 ; VI-NEXT:    v_mov_b32_e32 v3, s0
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -777,8 +777,8 @@ define amdgpu_kernel void @lds_ds_fmin_f64(double addrspace(5)* %out, double add
 ; G_SI-NEXT:    s_lshl_b32 s5, s2, 3
 ; G_SI-NEXT:    s_mov_b32 s1, 0x40450000
 ; G_SI-NEXT:    v_mov_b32_e32 v0, s0
-; G_SI-NEXT:    v_mov_b32_e32 v1, s1
 ; G_SI-NEXT:    v_mov_b32_e32 v2, s5
+; G_SI-NEXT:    v_mov_b32_e32 v1, s1
 ; G_SI-NEXT:    s_mov_b32 m0, -1
 ; G_SI-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1]
 ; G_SI-NEXT:    s_lshl_b32 s2, s2, 4
@@ -822,8 +822,8 @@ define amdgpu_kernel void @lds_ds_fmin_f64(double addrspace(5)* %out, double add
 ; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX7-NEXT:    v_mov_b32_e32 v0, s7
 ; G_GFX7-NEXT:    ds_min_rtn_f64 v[0:1], v0, v[2:3]
-; G_GFX7-NEXT:    s_add_u32 s0, s6, 4
 ; G_GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; G_GFX7-NEXT:    s_add_u32 s0, s6, 4
 ; G_GFX7-NEXT:    v_mov_b32_e32 v3, s0
 ; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX7-NEXT:    buffer_store_dword v0, v2, s[8:11], 0 offen
@@ -856,8 +856,8 @@ define amdgpu_kernel void @lds_ds_fmin_f64(double addrspace(5)* %out, double add
 ; G_VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_VI-NEXT:    v_mov_b32_e32 v0, s7
 ; G_VI-NEXT:    ds_min_rtn_f64 v[0:1], v0, v[2:3]
-; G_VI-NEXT:    s_add_u32 s0, s6, 4
 ; G_VI-NEXT:    v_mov_b32_e32 v2, s6
+; G_VI-NEXT:    s_add_u32 s0, s6, 4
 ; G_VI-NEXT:    v_mov_b32_e32 v3, s0
 ; G_VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_VI-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
@@ -867,16 +867,16 @@ define amdgpu_kernel void @lds_ds_fmin_f64(double addrspace(5)* %out, double add
 ; G_GFX9-LABEL: lds_ds_fmin_f64:
 ; G_GFX9:       ; %bb.0:
 ; G_GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; G_GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; G_GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
 ; G_GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; G_GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; G_GFX9-NEXT:    s_mov_b32 s10, -1
 ; G_GFX9-NEXT:    s_mov_b32 s11, 0xe00000
 ; G_GFX9-NEXT:    s_add_u32 s8, s8, s3
 ; G_GFX9-NEXT:    s_mov_b32 s0, 0
+; G_GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; G_GFX9-NEXT:    s_mov_b32 s1, 0x40450000
 ; G_GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; G_GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; G_GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX9-NEXT:    s_add_i32 s0, s2, 4
@@ -885,8 +885,8 @@ define amdgpu_kernel void @lds_ds_fmin_f64(double addrspace(5)* %out, double add
 ; G_GFX9-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1]
 ; G_GFX9-NEXT:    s_lshl_b32 s0, s0, 4
 ; G_GFX9-NEXT:    v_mov_b32_e32 v5, s0
-; G_GFX9-NEXT:    ds_min_rtn_f64 v[0:1], v5, v[0:1]
 ; G_GFX9-NEXT:    v_mov_b32_e32 v4, s7
+; G_GFX9-NEXT:    ds_min_rtn_f64 v[0:1], v5, v[0:1]
 ; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX9-NEXT:    ds_min_rtn_f64 v[0:1], v4, v[2:3]
 ; G_GFX9-NEXT:    v_mov_b32_e32 v2, s6
@@ -959,15 +959,15 @@ define amdgpu_kernel void @lds_ds_fmax_f64(double addrspace(5)* %out, double add
 ; SI-NEXT:    s_mov_b32 s0, 0
 ; SI-NEXT:    s_mov_b32 s1, 0x40450000
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v4, s5
 ; SI-NEXT:    ds_max_f64 v4, v[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    s_add_i32 s0, s2, 4
 ; SI-NEXT:    s_waitcnt lgkmcnt(1)
 ; SI-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[2:3]
+; SI-NEXT:    s_add_i32 s0, s2, 4
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dword v1, v2, s[8:11], 0 offen
@@ -999,9 +999,9 @@ define amdgpu_kernel void @lds_ds_fmax_f64(double addrspace(5)* %out, double add
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX7-NEXT:    ds_max_f64 v4, v[0:1] offset:64
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s5
-; GFX7-NEXT:    s_add_i32 s0, s4, 4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[2:3]
+; GFX7-NEXT:    s_add_i32 s0, s4, 4
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1032,9 +1032,9 @@ define amdgpu_kernel void @lds_ds_fmax_f64(double addrspace(5)* %out, double add
 ; VI-NEXT:    v_mov_b32_e32 v4, s1
 ; VI-NEXT:    ds_max_f64 v4, v[0:1] offset:64
 ; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    s_add_i32 s0, s4, 4
 ; VI-NEXT:    s_waitcnt lgkmcnt(1)
 ; VI-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[2:3]
+; VI-NEXT:    s_add_i32 s0, s4, 4
 ; VI-NEXT:    v_mov_b32_e32 v3, s0
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1120,8 +1120,8 @@ define amdgpu_kernel void @lds_ds_fmax_f64(double addrspace(5)* %out, double add
 ; G_SI-NEXT:    s_lshl_b32 s5, s2, 3
 ; G_SI-NEXT:    s_mov_b32 s1, 0x40450000
 ; G_SI-NEXT:    v_mov_b32_e32 v0, s0
-; G_SI-NEXT:    v_mov_b32_e32 v1, s1
 ; G_SI-NEXT:    v_mov_b32_e32 v2, s5
+; G_SI-NEXT:    v_mov_b32_e32 v1, s1
 ; G_SI-NEXT:    s_mov_b32 m0, -1
 ; G_SI-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1]
 ; G_SI-NEXT:    s_lshl_b32 s2, s2, 4
@@ -1165,8 +1165,8 @@ define amdgpu_kernel void @lds_ds_fmax_f64(double addrspace(5)* %out, double add
 ; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX7-NEXT:    v_mov_b32_e32 v0, s7
 ; G_GFX7-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[2:3]
-; G_GFX7-NEXT:    s_add_u32 s0, s6, 4
 ; G_GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; G_GFX7-NEXT:    s_add_u32 s0, s6, 4
 ; G_GFX7-NEXT:    v_mov_b32_e32 v3, s0
 ; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX7-NEXT:    buffer_store_dword v0, v2, s[8:11], 0 offen
@@ -1199,8 +1199,8 @@ define amdgpu_kernel void @lds_ds_fmax_f64(double addrspace(5)* %out, double add
 ; G_VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_VI-NEXT:    v_mov_b32_e32 v0, s7
 ; G_VI-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[2:3]
-; G_VI-NEXT:    s_add_u32 s0, s6, 4
 ; G_VI-NEXT:    v_mov_b32_e32 v2, s6
+; G_VI-NEXT:    s_add_u32 s0, s6, 4
 ; G_VI-NEXT:    v_mov_b32_e32 v3, s0
 ; G_VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_VI-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
@@ -1210,16 +1210,16 @@ define amdgpu_kernel void @lds_ds_fmax_f64(double addrspace(5)* %out, double add
 ; G_GFX9-LABEL: lds_ds_fmax_f64:
 ; G_GFX9:       ; %bb.0:
 ; G_GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; G_GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; G_GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
 ; G_GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; G_GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; G_GFX9-NEXT:    s_mov_b32 s10, -1
 ; G_GFX9-NEXT:    s_mov_b32 s11, 0xe00000
 ; G_GFX9-NEXT:    s_add_u32 s8, s8, s3
 ; G_GFX9-NEXT:    s_mov_b32 s0, 0
+; G_GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; G_GFX9-NEXT:    s_mov_b32 s1, 0x40450000
 ; G_GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; G_GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; G_GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX9-NEXT:    s_add_i32 s0, s2, 4
@@ -1228,8 +1228,8 @@ define amdgpu_kernel void @lds_ds_fmax_f64(double addrspace(5)* %out, double add
 ; G_GFX9-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1]
 ; G_GFX9-NEXT:    s_lshl_b32 s0, s0, 4
 ; G_GFX9-NEXT:    v_mov_b32_e32 v5, s0
-; G_GFX9-NEXT:    ds_max_rtn_f64 v[0:1], v5, v[0:1]
 ; G_GFX9-NEXT:    v_mov_b32_e32 v4, s7
+; G_GFX9-NEXT:    ds_max_rtn_f64 v[0:1], v5, v[0:1]
 ; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX9-NEXT:    ds_max_rtn_f64 v[0:1], v4, v[2:3]
 ; G_GFX9-NEXT:    v_mov_b32_e32 v2, s6

diff  --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index c8a7cf27e3240..3e19a864f1120 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -371,7 +371,7 @@
 ; GCN-O1-NEXT:        SI post-RA bundler
 ; GCN-O1-NEXT:        MachineDominator Tree Construction
 ; GCN-O1-NEXT:        Machine Natural Loop Construction
-; GCN-O1-NEXT:        Post RA top-down list latency scheduler
+; GCN-O1-NEXT:        PostRA Machine Instruction Scheduler
 ; GCN-O1-NEXT:        Machine Block Frequency Analysis
 ; GCN-O1-NEXT:        MachinePostDominator Tree Construction
 ; GCN-O1-NEXT:        Branch Probability Basic Block Placement
@@ -656,7 +656,7 @@
 ; GCN-O1-OPTS-NEXT:        SI post-RA bundler
 ; GCN-O1-OPTS-NEXT:        MachineDominator Tree Construction
 ; GCN-O1-OPTS-NEXT:        Machine Natural Loop Construction
-; GCN-O1-OPTS-NEXT:        Post RA top-down list latency scheduler
+; GCN-O1-OPTS-NEXT:        PostRA Machine Instruction Scheduler
 ; GCN-O1-OPTS-NEXT:        Machine Block Frequency Analysis
 ; GCN-O1-OPTS-NEXT:        MachinePostDominator Tree Construction
 ; GCN-O1-OPTS-NEXT:        Branch Probability Basic Block Placement
@@ -943,7 +943,7 @@
 ; GCN-O2-NEXT:        SI post-RA bundler
 ; GCN-O2-NEXT:        MachineDominator Tree Construction
 ; GCN-O2-NEXT:        Machine Natural Loop Construction
-; GCN-O2-NEXT:        Post RA top-down list latency scheduler
+; GCN-O2-NEXT:        PostRA Machine Instruction Scheduler
 ; GCN-O2-NEXT:        Machine Block Frequency Analysis
 ; GCN-O2-NEXT:        MachinePostDominator Tree Construction
 ; GCN-O2-NEXT:        Branch Probability Basic Block Placement
@@ -1243,7 +1243,7 @@
 ; GCN-O3-NEXT:        SI post-RA bundler
 ; GCN-O3-NEXT:        MachineDominator Tree Construction
 ; GCN-O3-NEXT:        Machine Natural Loop Construction
-; GCN-O3-NEXT:        Post RA top-down list latency scheduler
+; GCN-O3-NEXT:        PostRA Machine Instruction Scheduler
 ; GCN-O3-NEXT:        Machine Block Frequency Analysis
 ; GCN-O3-NEXT:        MachinePostDominator Tree Construction
 ; GCN-O3-NEXT:        Branch Probability Basic Block Placement

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
index 6ee84aa4704fd..2094e35997711 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
@@ -557,8 +557,8 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX9-NEXT:    v_and_b32_e32 v4, v6, v4
 ; GFX9-NEXT:    v_and_b32_e32 v2, v6, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, v6, v0
-; GFX9-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
+; GFX9-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v1, 16, v0
 ; GFX9-NEXT:    image_sample_d v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -639,12 +639,12 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX9-LABEL: sample_c_d_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX9-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, v9, v5
-; GFX9-NEXT:    v_and_b32_e32 v1, v9, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v6, 16, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, v9, v7
+; GFX9-NEXT:    v_and_b32_e32 v1, v9, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v4, 16, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v8, 16, v1
 ; GFX9-NEXT:    image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16
@@ -798,8 +798,8 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    v_and_b32_e32 v4, v6, v4
 ; GFX9-NEXT:    v_and_b32_e32 v2, v6, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, v6, v0
-; GFX9-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
+; GFX9-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v1, 16, v0
 ; GFX9-NEXT:    image_sample_cd v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -843,12 +843,12 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX9-LABEL: sample_c_cd_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX9-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, v9, v5
-; GFX9-NEXT:    v_and_b32_e32 v1, v9, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v6, 16, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, v9, v7
+; GFX9-NEXT:    v_and_b32_e32 v1, v9, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v4, 16, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v8, 16, v1
 ; GFX9-NEXT:    image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
index 0bc7f5093b0e2..d8a61668cb4f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -1718,8 +1718,8 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
 ; VERDE-NEXT:    s_mov_b32 s15, 0xf000
 ; VERDE-NEXT:    s_mov_b32 s14, -1
 ; VERDE-NEXT:    s_waitcnt vmcnt(0)
-; VERDE-NEXT:    v_mov_b32_e32 v0, v9
 ; VERDE-NEXT:    buffer_store_dword v10, off, s[12:15], 0
+; VERDE-NEXT:    v_mov_b32_e32 v0, v9
 ; VERDE-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; VERDE-NEXT:    ; return to shader part epilog
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
index b507d9fc0049c..d23caa43bbb62 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
@@ -548,12 +548,12 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_3d(<8 x i32> inreg %rsrc, <4 x
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, v2
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v10, v3
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v11, 0xffff
-; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10GISEL-NEXT:    s_lshl_b32 s12, s0, 16
 ; GFX10GISEL-NEXT:    v_and_or_b32 v3, v9, v11, s12
-; GFX10GISEL-NEXT:    v_and_or_b32 v4, v10, v11, v4
 ; GFX10GISEL-NEXT:    v_and_or_b32 v2, v0, v11, v1
+; GFX10GISEL-NEXT:    v_and_or_b32 v4, v10, v11, v4
 ; GFX10GISEL-NEXT:    v_and_or_b32 v5, v5, v11, s12
 ; GFX10GISEL-NEXT:    image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -929,14 +929,14 @@ define amdgpu_ps float @sample_g16_noa16_c_d_o_2darray_V1(<8 x i32> inreg %rsrc,
 ; GFX10GISEL:       ; %bb.0: ; %main_body
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v10, v2
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v11, v4
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v11, v4
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff
-; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v9
-; GFX10GISEL-NEXT:    v_and_or_b32 v5, v11, v0, v5
+; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10GISEL-NEXT:    v_and_or_b32 v4, v10, v0, v1
+; GFX10GISEL-NEXT:    v_and_or_b32 v5, v11, v0, v5
 ; GFX10GISEL-NEXT:    image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -965,14 +965,14 @@ define amdgpu_ps <2 x float> @sample_g16_noa16_c_d_o_2darray_V2(<8 x i32> inreg
 ; GFX10GISEL:       ; %bb.0: ; %main_body
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v10, v2
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v11, v4
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v11, v4
 ; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff
-; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v9
-; GFX10GISEL-NEXT:    v_and_or_b32 v5, v11, v0, v5
+; GFX10GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10GISEL-NEXT:    v_and_or_b32 v4, v10, v0, v1
+; GFX10GISEL-NEXT:    v_and_or_b32 v5, v11, v0, v5
 ; GFX10GISEL-NEXT:    image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 4f5ace0a40d66..6227d51e3c9e2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -49,8 +49,8 @@ define amdgpu_kernel void @set_inactive_scc(i32 addrspace(1)* %out, i32 %in, <4
 ; GCN-NEXT:    s_load_dword s6, s[0:1], 0x2c
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, 42
 ; GCN-NEXT:    s_not_b64 exec, exec

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll
index 70d6c2c173c11..3b1a8030c5e89 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll
@@ -107,9 +107,9 @@ define amdgpu_kernel void @fma_f16_imm_c(
 ; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
 
 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 ; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
 ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index 535e6710855de..88fd38acf189b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -134,9 +134,9 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
 ; GFX10: buffer_load_dword v[[C_V2_F16:[0-9]+]]
 
 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 ; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 
 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
 ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 20d86f53547cf..f5cb3bd0fe8a7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -102,11 +102,11 @@ define amdgpu_kernel void @maxnum_f16(
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s12, s6
 ; GFX10-NEXT:    s_mov_b32 s13, s7
-; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    s_mov_b32 s1, s5
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
@@ -320,12 +320,12 @@ define amdgpu_kernel void @maxnum_v2f16(
 ; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v2, v3, v2
-; SI-NEXT:    v_max_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_max_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_mov_b32 s0, s4
-; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -344,8 +344,8 @@ define amdgpu_kernel void @maxnum_v2f16(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v1, s4, s4
 ; VI-NEXT:    v_max_f16_e64 v0, s5, s5
-; VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; VI-NEXT:    v_max_f16_e32 v0, v1, v0
 ; VI-NEXT:    v_max_f16_e64 v1, s5, s5
 ; VI-NEXT:    v_max_f16_e64 v2, s4, s4
@@ -361,9 +361,9 @@ define amdgpu_kernel void @maxnum_v2f16(
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_load_dword s10, s[6:7], 0x0
 ; GFX9-NEXT:    s_load_dword s11, s[8:9], 0x0
+; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, s10, s10
@@ -568,14 +568,14 @@ define amdgpu_kernel void @maxnum_v3f16(
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
 ; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
+; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshr_b32 s1, s6, 16
 ; SI-NEXT:    s_lshr_b32 s4, s8, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, s8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s7
@@ -592,8 +592,8 @@ define amdgpu_kernel void @maxnum_v3f16(
 ; SI-NEXT:    v_max_f32_e32 v0, v0, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
@@ -613,8 +613,8 @@ define amdgpu_kernel void @maxnum_v3f16(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v1, s4, s4
 ; VI-NEXT:    v_max_f16_e64 v0, s6, s6
-; VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; VI-NEXT:    s_lshr_b32 s6, s6, 16
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; VI-NEXT:    v_max_f16_e32 v0, v1, v0
 ; VI-NEXT:    v_max_f16_e64 v1, s6, s6
 ; VI-NEXT:    v_max_f16_e64 v2, s4, s4
@@ -641,8 +641,8 @@ define amdgpu_kernel void @maxnum_v3f16(
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
 ; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
-; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
 ; GFX9-NEXT:    v_pk_max_f16 v2, s11, s11
+; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
 ; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v2
 ; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
@@ -693,8 +693,8 @@ define amdgpu_kernel void @maxnum_v4f16(
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
 ; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
 ; SI-NEXT:    s_lshr_b32 s5, s5, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, s5
@@ -743,17 +743,17 @@ define amdgpu_kernel void @maxnum_v4f16(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v1, s5, s5
 ; VI-NEXT:    v_max_f16_e64 v0, s7, s7
-; VI-NEXT:    s_lshr_b32 s5, s5, 16
 ; VI-NEXT:    s_lshr_b32 s7, s7, 16
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
 ; VI-NEXT:    v_max_f16_e32 v0, v1, v0
-; VI-NEXT:    v_max_f16_e64 v2, s5, s5
 ; VI-NEXT:    v_max_f16_e64 v1, s7, s7
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
 ; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v1, v0, v1
-; VI-NEXT:    v_max_f16_e64 v2, s4, s4
 ; VI-NEXT:    v_max_f16_e64 v0, s6, s6
-; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
 ; VI-NEXT:    s_lshr_b32 s5, s6, 16
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; VI-NEXT:    v_max_f16_e32 v0, v2, v0
 ; VI-NEXT:    v_max_f16_e64 v2, s5, s5
 ; VI-NEXT:    v_max_f16_e64 v3, s4, s4
@@ -776,8 +776,8 @@ define amdgpu_kernel void @maxnum_v4f16(
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
 ; GFX9-NEXT:    v_pk_max_f16 v0, s11, s11
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v0
 ; GFX9-NEXT:    v_pk_max_f16 v2, s10, s10
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v0
 ; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
 ; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -838,9 +838,9 @@ define amdgpu_kernel void @fmax_v4f16_imm_a(
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_max_f32_e32 v3, 2.0, v3
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT:    v_max_f32_e32 v0, 0x41000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_max_f32_e32 v0, 0x41000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -864,8 +864,8 @@ define amdgpu_kernel void @fmax_v4f16_imm_a(
 ; VI-NEXT:    s_lshr_b32 s5, s5, 16
 ; VI-NEXT:    v_max_f16_e64 v3, s5, s5
 ; VI-NEXT:    v_max_f16_e64 v2, s4, s4
-; VI-NEXT:    v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_max_f16_e32 v1, 0x4200, v1
+; VI-NEXT:    v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v0
 ; VI-NEXT:    v_max_f16_e32 v0, 0x4800, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 02130936cd82e..36a8741b11ebe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -102,11 +102,11 @@ define amdgpu_kernel void @minnum_f16_ieee(
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s12, s6
 ; GFX10-NEXT:    s_mov_b32 s13, s7
-; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    s_mov_b32 s1, s5
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
@@ -348,12 +348,12 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
 ; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_min_f32_e32 v2, v3, v2
-; SI-NEXT:    v_min_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_min_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_mov_b32 s0, s4
-; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -372,8 +372,8 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v1, s4, s4
 ; VI-NEXT:    v_max_f16_e64 v0, s5, s5
-; VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; VI-NEXT:    v_min_f16_e32 v0, v1, v0
 ; VI-NEXT:    v_max_f16_e64 v1, s5, s5
 ; VI-NEXT:    v_max_f16_e64 v2, s4, s4
@@ -389,9 +389,9 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_load_dword s10, s[6:7], 0x0
 ; GFX9-NEXT:    s_load_dword s11, s[8:9], 0x0
+; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, s10, s10
@@ -631,14 +631,14 @@ define amdgpu_kernel void @minnum_v3f16(
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
 ; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
+; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshr_b32 s1, s6, 16
 ; SI-NEXT:    s_lshr_b32 s4, s8, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, s8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s7
@@ -655,8 +655,8 @@ define amdgpu_kernel void @minnum_v3f16(
 ; SI-NEXT:    v_min_f32_e32 v0, v0, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
@@ -676,8 +676,8 @@ define amdgpu_kernel void @minnum_v3f16(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v1, s4, s4
 ; VI-NEXT:    v_max_f16_e64 v0, s6, s6
-; VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; VI-NEXT:    s_lshr_b32 s6, s6, 16
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; VI-NEXT:    v_min_f16_e32 v0, v1, v0
 ; VI-NEXT:    v_max_f16_e64 v1, s6, s6
 ; VI-NEXT:    v_max_f16_e64 v2, s4, s4
@@ -704,8 +704,8 @@ define amdgpu_kernel void @minnum_v3f16(
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
 ; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
-; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
 ; GFX9-NEXT:    v_pk_max_f16 v2, s11, s11
+; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
 ; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
 ; GFX9-NEXT:    v_pk_min_f16 v1, v1, v2
 ; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
@@ -756,8 +756,8 @@ define amdgpu_kernel void @minnum_v4f16(
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
 ; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
 ; SI-NEXT:    s_lshr_b32 s5, s5, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, s5
@@ -806,17 +806,17 @@ define amdgpu_kernel void @minnum_v4f16(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v1, s5, s5
 ; VI-NEXT:    v_max_f16_e64 v0, s7, s7
-; VI-NEXT:    s_lshr_b32 s5, s5, 16
 ; VI-NEXT:    s_lshr_b32 s7, s7, 16
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
 ; VI-NEXT:    v_min_f16_e32 v0, v1, v0
-; VI-NEXT:    v_max_f16_e64 v2, s5, s5
 ; VI-NEXT:    v_max_f16_e64 v1, s7, s7
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
 ; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v1, v0, v1
-; VI-NEXT:    v_max_f16_e64 v2, s4, s4
 ; VI-NEXT:    v_max_f16_e64 v0, s6, s6
-; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
 ; VI-NEXT:    s_lshr_b32 s5, s6, 16
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; VI-NEXT:    v_min_f16_e32 v0, v2, v0
 ; VI-NEXT:    v_max_f16_e64 v2, s5, s5
 ; VI-NEXT:    v_max_f16_e64 v3, s4, s4
@@ -839,8 +839,8 @@ define amdgpu_kernel void @minnum_v4f16(
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
 ; GFX9-NEXT:    v_pk_max_f16 v0, s11, s11
-; GFX9-NEXT:    v_pk_min_f16 v1, v1, v0
 ; GFX9-NEXT:    v_pk_max_f16 v2, s10, s10
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v0
 ; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
 ; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -901,9 +901,9 @@ define amdgpu_kernel void @fmin_v4f16_imm_a(
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_min_f32_e32 v3, 2.0, v3
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT:    v_min_f32_e32 v0, 0x41000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_min_f32_e32 v0, 0x41000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -927,8 +927,8 @@ define amdgpu_kernel void @fmin_v4f16_imm_a(
 ; VI-NEXT:    s_lshr_b32 s5, s5, 16
 ; VI-NEXT:    v_max_f16_e64 v3, s5, s5
 ; VI-NEXT:    v_max_f16_e64 v2, s4, s4
-; VI-NEXT:    v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_min_f16_e32 v1, 0x4200, v1
+; VI-NEXT:    v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v0
 ; VI-NEXT:    v_min_f16_e32 v0, 0x4800, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index a945847e8db37..fafbb4c54142c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -68,8 +68,8 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v9, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v1
-; GFX10-NEXT:    v_add3_u32 v1, v6, v5, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v4, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v1, v6, v5, v8
 ; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[3:4]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -127,8 +127,8 @@ define { i64, i1 } @smulo_i64_s_s(i64 %x, i64 %y) {
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v6, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v9, v7
 ; GFX9-NEXT:    v_mul_hi_i32 v10, v1, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v9, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v1, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
@@ -147,8 +147,8 @@ define { i64, i1 } @smulo_i64_s_s(i64 %x, i64 %y) {
 ; GFX9-NEXT:    v_add3_u32 v1, v6, v5, v7
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[3:4], v[5:6]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -179,10 +179,10 @@ define { i64, i1 } @smulo_i64_s_s(i64 %x, i64 %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc_lo
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v6, v0
-; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v9, vcc_lo, 0, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v4
+; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[4:5]
@@ -350,8 +350,8 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX9-NEXT:    s_add_u32 s6, s10, s9
 ; GFX9-NEXT:    s_mul_i32 s8, s1, s2
 ; GFX9-NEXT:    s_addc_u32 s5, 0, s5
-; GFX9-NEXT:    s_add_u32 s6, s6, s8
 ; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
+; GFX9-NEXT:    s_add_u32 s6, s6, s8
 ; GFX9-NEXT:    s_mul_hi_i32 s7, s1, s3
 ; GFX9-NEXT:    s_addc_u32 s4, s5, s4
 ; GFX9-NEXT:    s_addc_u32 s5, s7, 0
@@ -387,14 +387,14 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX10-NEXT:    s_mul_i32 s9, s0, s3
 ; GFX10-NEXT:    s_mul_hi_u32 s10, s0, s2
 ; GFX10-NEXT:    s_mul_hi_u32 s5, s0, s3
-; GFX10-NEXT:    s_add_u32 s11, s10, s9
 ; GFX10-NEXT:    s_mul_i32 s8, s1, s2
-; GFX10-NEXT:    s_addc_u32 s5, 0, s5
+; GFX10-NEXT:    s_add_u32 s11, s10, s9
 ; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
-; GFX10-NEXT:    s_add_u32 s11, s11, s8
+; GFX10-NEXT:    s_addc_u32 s5, 0, s5
 ; GFX10-NEXT:    s_mul_hi_i32 s6, s1, s3
-; GFX10-NEXT:    s_addc_u32 s4, s5, s4
+; GFX10-NEXT:    s_add_u32 s11, s11, s8
 ; GFX10-NEXT:    s_mul_i32 s7, s1, s3
+; GFX10-NEXT:    s_addc_u32 s4, s5, s4
 ; GFX10-NEXT:    s_addc_u32 s5, s6, 0
 ; GFX10-NEXT:    s_add_u32 s4, s4, s7
 ; GFX10-NEXT:    s_addc_u32 s5, 0, s5
@@ -487,8 +487,8 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v0
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
 ; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 6b5b6d9efe404..f6568f98485c1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -17,10 +17,10 @@ define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 {
 ; SI-NEXT:    s_andn2_b64 s[2:3], s[10:11], s[0:1]
 ; SI-NEXT:    s_and_b32 s0, s11, 0x80000000
 ; SI-NEXT:    s_cmp_lt_i32 s5, 0
+; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_gt_i32 s5, 51
-; SI-NEXT:    v_mov_b32_e32 v1, s0
-; SI-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s11
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -79,8 +79,8 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_mov_b32 s0, -1
 ; SI-NEXT:    s_movk_i32 s7, 0xfc01
+; SI-NEXT:    s_mov_b32 s0, -1
 ; SI-NEXT:    s_mov_b32 s1, 0xfffff
 ; SI-NEXT:    s_brev_b32 s6, -2
 ; SI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
@@ -92,8 +92,8 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
 ; SI-NEXT:    v_not_b32_e32 v4, v4
 ; SI-NEXT:    v_not_b32_e32 v5, v5
 ; SI-NEXT:    v_and_b32_e32 v5, v3, v5
-; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v6
 ; SI-NEXT:    v_and_b32_e32 v4, v2, v4
+; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v6
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 51, v6
@@ -158,9 +158,9 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou
 ; SI-NEXT:    s_andn2_b64 s[12:13], s[10:11], s[0:1]
 ; SI-NEXT:    s_and_b32 s0, s11, s15
 ; SI-NEXT:    s_cmp_lt_i32 s14, 0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s13
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_gt_i32 s14, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s11
@@ -173,11 +173,11 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou
 ; SI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
 ; SI-NEXT:    s_bfe_u32 s0, s9, 0xb0014
 ; SI-NEXT:    s_add_i32 s7, s0, s7
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s7
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
 ; SI-NEXT:    s_brev_b32 s10, -2
 ; SI-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
 ; SI-NEXT:    v_mov_b32_e32 v4, s11
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
+; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s7
 ; SI-NEXT:    v_bfi_b32 v4, s10, v6, v4
 ; SI-NEXT:    s_andn2_b64 s[2:3], s[8:9], s[0:1]
 ; SI-NEXT:    s_and_b32 s0, s9, s15
@@ -185,10 +185,10 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_cmp_lt_i32 s7, 0
 ; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
+; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_gt_i32 s7, 51
-; SI-NEXT:    v_mov_b32_e32 v1, s0
-; SI-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -257,9 +257,9 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
 ; SI-NEXT:    s_andn2_b64 s[16:17], s[6:7], s[0:1]
 ; SI-NEXT:    s_and_b32 s0, s7, s20
 ; SI-NEXT:    s_cmp_lt_i32 s19, 0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s17
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_gt_i32 s19, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s7
@@ -272,10 +272,10 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
 ; SI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
 ; SI-NEXT:    s_bfe_u32 s0, s5, 0xb0014
 ; SI-NEXT:    s_add_i32 s17, s0, s18
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
 ; SI-NEXT:    s_brev_b32 s16, -2
 ; SI-NEXT:    v_mov_b32_e32 v12, 0x3ff00000
 ; SI-NEXT:    v_mov_b32_e32 v4, s7
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
 ; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s17
 ; SI-NEXT:    v_bfi_b32 v4, s16, v12, v4
 ; SI-NEXT:    s_andn2_b64 s[6:7], s[4:5], s[0:1]
@@ -284,9 +284,9 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_cmp_lt_i32 s17, 0
 ; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_gt_i32 s17, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
@@ -297,19 +297,19 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
 ; SI-NEXT:    v_mov_b32_e32 v4, s4
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
 ; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
-; SI-NEXT:    s_add_i32 s6, s0, s18
 ; SI-NEXT:    v_add_f64 v[4:5], s[4:5], -v[0:1]
+; SI-NEXT:    s_add_i32 s6, s0, s18
 ; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s6
 ; SI-NEXT:    v_mov_b32_e32 v6, s5
-; SI-NEXT:    s_andn2_b64 s[4:5], s[10:11], s[0:1]
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; SI-NEXT:    s_andn2_b64 s[4:5], s[10:11], s[0:1]
 ; SI-NEXT:    s_and_b32 s0, s11, s20
 ; SI-NEXT:    v_bfi_b32 v6, s16, v12, v6
 ; SI-NEXT:    s_cmp_lt_i32 s6, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v9, 0, v6, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s5
 ; SI-NEXT:    v_mov_b32_e32 v5, s0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_gt_i32 s6, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; SI-NEXT:    v_mov_b32_e32 v5, s11
@@ -323,8 +323,8 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
 ; SI-NEXT:    s_bfe_u32 s0, s9, 0xb0014
 ; SI-NEXT:    s_add_i32 s4, s0, s18
 ; SI-NEXT:    v_mov_b32_e32 v10, s11
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s4
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
+; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s4
 ; SI-NEXT:    v_bfi_b32 v10, s16, v12, v10
 ; SI-NEXT:    s_andn2_b64 s[2:3], s[8:9], s[0:1]
 ; SI-NEXT:    s_and_b32 s0, s9, s20
@@ -332,9 +332,9 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
 ; SI-NEXT:    v_mov_b32_e32 v6, 0
 ; SI-NEXT:    s_cmp_lt_i32 s4, 0
 ; SI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s3
 ; SI-NEXT:    v_mov_b32_e32 v5, s0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_gt_i32 s4, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; SI-NEXT:    v_mov_b32_e32 v5, s9
@@ -423,9 +423,9 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    s_andn2_b64 s[24:25], s[6:7], s[2:3]
 ; SI-NEXT:    s_and_b32 s2, s7, s28
 ; SI-NEXT:    s_cmp_lt_i32 s26, 0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s25
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_gt_i32 s26, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s7
@@ -438,10 +438,10 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
 ; SI-NEXT:    s_bfe_u32 s2, s5, 0xb0014
 ; SI-NEXT:    s_add_i32 s24, s2, s23
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
 ; SI-NEXT:    s_brev_b32 s29, -2
 ; SI-NEXT:    v_mov_b32_e32 v14, 0x3ff00000
 ; SI-NEXT:    v_mov_b32_e32 v4, s7
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
 ; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s24
 ; SI-NEXT:    v_bfi_b32 v4, s29, v14, v4
 ; SI-NEXT:    s_andn2_b64 s[6:7], s[4:5], s[2:3]
@@ -450,9 +450,9 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_cmp_lt_i32 s24, 0
 ; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_gt_i32 s24, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
@@ -475,9 +475,9 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-NEXT:    s_cmp_lt_i32 s6, 0
 ; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s5
 ; SI-NEXT:    v_mov_b32_e32 v5, s2
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_gt_i32 s6, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; SI-NEXT:    v_mov_b32_e32 v5, s11
@@ -500,9 +500,9 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_mov_b32_e32 v6, 0
 ; SI-NEXT:    s_cmp_lt_i32 s6, 0
 ; SI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s5
 ; SI-NEXT:    v_mov_b32_e32 v5, s2
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_cmp_gt_i32 s6, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; SI-NEXT:    v_mov_b32_e32 v5, s9
@@ -513,8 +513,8 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_mov_b32_e32 v8, s8
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[2:3]
 ; SI-NEXT:    s_bfe_u32 s2, s15, 0xb0014
-; SI-NEXT:    s_add_i32 s4, s2, s23
 ; SI-NEXT:    v_add_f64 v[8:9], s[8:9], -v[4:5]
+; SI-NEXT:    s_add_i32 s4, s2, s23
 ; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s4
 ; SI-NEXT:    v_mov_b32_e32 v10, s9
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5
@@ -549,13 +549,13 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_mov_b32_e32 v8, s27
 ; SI-NEXT:    s_cmp_lt_i32 s25, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v17, v8, v9, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v8, s11
 ; SI-NEXT:    v_mov_b32_e32 v9, s8
 ; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; SI-NEXT:    v_mov_b32_e32 v8, s11
 ; SI-NEXT:    s_cmp_gt_i32 s25, 51
 ; SI-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[8:9]
-; SI-NEXT:    v_mov_b32_e32 v10, s10
 ; SI-NEXT:    v_mov_b32_e32 v9, s19
+; SI-NEXT:    v_mov_b32_e32 v10, s10
 ; SI-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v9, v8, v9, s[10:11]
 ; SI-NEXT:    v_cndmask_b32_e64 v8, v10, 0, s[8:9]
@@ -567,9 +567,9 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    s_andn2_b64 s[20:21], s[16:17], s[8:9]
 ; SI-NEXT:    s_and_b32 s8, s17, s28
 ; SI-NEXT:    s_cmp_lt_i32 s10, 0
+; SI-NEXT:    v_mov_b32_e32 v10, s21
 ; SI-NEXT:    v_mov_b32_e32 v11, s8
 ; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; SI-NEXT:    v_mov_b32_e32 v10, s21
 ; SI-NEXT:    s_cmp_gt_i32 s10, 51
 ; SI-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v11, s17
@@ -592,9 +592,9 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_add_f64 v[10:11], v[8:9], v[10:11]
 ; SI-NEXT:    v_cndmask_b32_e64 v9, 0, v19, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v8, 0
+; SI-NEXT:    v_mov_b32_e32 v16, s15
 ; SI-NEXT:    v_add_f64 v[8:9], v[12:13], v[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v12, s24
-; SI-NEXT:    v_mov_b32_e32 v16, s15
 ; SI-NEXT:    v_cndmask_b32_e64 v13, v15, v16, s[2:3]
 ; SI-NEXT:    v_cndmask_b32_e64 v12, v12, 0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v15, s14
@@ -605,8 +605,8 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_mov_b32_e32 v18, s13
 ; SI-NEXT:    v_cndmask_b32_e64 v15, v15, 0, s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v16, s12
-; SI-NEXT:    v_cndmask_b32_e64 v16, v15, v16, s[6:7]
 ; SI-NEXT:    v_cndmask_b32_e64 v17, v17, v18, s[6:7]
+; SI-NEXT:    v_cndmask_b32_e64 v16, v15, v16, s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v15, s13
 ; SI-NEXT:    v_bfi_b32 v18, s29, v14, v15
 ; SI-NEXT:    v_add_f64 v[14:15], s[12:13], -v[16:17]
@@ -687,9 +687,9 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; CI-NEXT:    v_cndmask_b32_e32 v15, 0, v17, vcc
 ; CI-NEXT:    v_mov_b32_e32 v14, 0
 ; CI-NEXT:    v_mov_b32_e32 v17, s9
-; CI-NEXT:    v_bfi_b32 v19, s18, v16, v17
 ; CI-NEXT:    v_add_f64 v[8:9], v[8:9], v[14:15]
 ; CI-NEXT:    v_add_f64 v[14:15], s[8:9], -v[12:13]
+; CI-NEXT:    v_bfi_b32 v19, s18, v16, v17
 ; CI-NEXT:    v_trunc_f64_e32 v[16:17], s[10:11]
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
 ; CI-NEXT:    v_add_f64 v[14:15], s[10:11], -v[16:17]

diff  --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index ffe5784af1855..f1b7373814c0d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -1075,8 +1075,8 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, s8
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, s8
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s6
@@ -1256,8 +1256,8 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s5, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s7, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, s8
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s7, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, s8
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s7
@@ -1329,8 +1329,8 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_ashr_i64 s[2:3], s[0:1], 48
 ; GCN-HSA-NEXT:    s_ashr_i32 s4, s0, 16
+; GCN-HSA-NEXT:    s_ashr_i64 s[2:3], s[0:1], 48
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s1, s1
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s0, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1436,14 +1436,14 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_lshr_b32 s2, s7, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s3, s6, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s9, s5, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s10, s4, 16
-; GCN-HSA-NEXT:    s_and_b32 s7, s7, s8
-; GCN-HSA-NEXT:    s_and_b32 s6, s6, s8
+; GCN-HSA-NEXT:    s_lshr_b32 s2, s7, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s3, s6, 16
 ; GCN-HSA-NEXT:    s_and_b32 s5, s5, s8
 ; GCN-HSA-NEXT:    s_and_b32 s4, s4, s8
+; GCN-HSA-NEXT:    s_and_b32 s7, s7, s8
+; GCN-HSA-NEXT:    s_and_b32 s6, s6, s8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1474,12 +1474,12 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s11, s7, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s12, s6, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s7, s7, s8
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s12, s6, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s6, s6, s8
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s9, s5, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, s8
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, s8
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s12
@@ -1570,10 +1570,10 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_ashr_i32 s2, s7, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s3, s6, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s8, s5, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s9, s4, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s2, s7, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s3, s6, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1584,9 +1584,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s4, s4
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s9
@@ -1724,22 +1724,22 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspa
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_lshr_b32 s2, s11, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s3, s10, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s13, s5, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s14, s4, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s15, s7, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s16, s6, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s17, s9, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s18, s8, 16
-; GCN-HSA-NEXT:    s_and_b32 s11, s11, s12
-; GCN-HSA-NEXT:    s_and_b32 s10, s10, s12
+; GCN-HSA-NEXT:    s_lshr_b32 s2, s11, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s3, s10, 16
 ; GCN-HSA-NEXT:    s_and_b32 s5, s5, s12
 ; GCN-HSA-NEXT:    s_and_b32 s4, s4, s12
 ; GCN-HSA-NEXT:    s_and_b32 s7, s7, s12
 ; GCN-HSA-NEXT:    s_and_b32 s6, s6, s12
 ; GCN-HSA-NEXT:    s_and_b32 s9, s9, s12
 ; GCN-HSA-NEXT:    s_and_b32 s8, s8, s12
+; GCN-HSA-NEXT:    s_and_b32 s11, s11, s12
+; GCN-HSA-NEXT:    s_and_b32 s10, s10, s12
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1788,31 +1788,32 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspa
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s19, s11, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s20, s10, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s11, s11, s12
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s20, s10, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s10, s10, s12
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s17, s9, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s18, s8, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s9, s9, s12
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s18, s8, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s8, s8, s12
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s20
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s19
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s15, s7, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s16, s6, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s7, s7, s12
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s16, s6, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s6, s6, s12
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s13, s5, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s18
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s17
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s13, s5, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s14, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, s12
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s14, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, s12
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
@@ -1940,14 +1941,14 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspa
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_ashr_i32 s2, s11, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s3, s10, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s12, s5, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s13, s4, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s14, s7, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s15, s6, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s16, s9, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s17, s8, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s2, s11, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s3, s10, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1971,18 +1972,18 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s7, s7
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s6, s6
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s14
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s4, s4
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s13
@@ -2014,20 +2015,21 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspa
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s19
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s18
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s14, s7, 16
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s15, s6, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s7, s7
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s6, s6
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s12, s5, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s17
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s16
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s12, s5, 16
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s13, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s4, s4
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s15
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
@@ -2203,9 +2205,6 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspa
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_and_b32 s34, s17, s20
-; GCN-HSA-NEXT:    s_and_b32 s35, s16, s20
-; GCN-HSA-NEXT:    s_and_b32 s36, s19, s20
 ; GCN-HSA-NEXT:    s_and_b32 s21, s5, s20
 ; GCN-HSA-NEXT:    s_and_b32 s22, s4, s20
 ; GCN-HSA-NEXT:    s_and_b32 s23, s7, s20
@@ -2218,11 +2217,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspa
 ; GCN-HSA-NEXT:    s_and_b32 s30, s12, s20
 ; GCN-HSA-NEXT:    s_and_b32 s31, s15, s20
 ; GCN-HSA-NEXT:    s_and_b32 s33, s14, s20
+; GCN-HSA-NEXT:    s_and_b32 s34, s17, s20
+; GCN-HSA-NEXT:    s_and_b32 s35, s16, s20
+; GCN-HSA-NEXT:    s_and_b32 s36, s19, s20
 ; GCN-HSA-NEXT:    s_and_b32 s20, s18, s20
-; GCN-HSA-NEXT:    s_lshr_b32 s17, s17, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s16, s16, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s19, s19, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s18, s18, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s5, s5, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s4, s4, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s7, s7, 16
@@ -2235,6 +2233,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspa
 ; GCN-HSA-NEXT:    s_lshr_b32 s12, s12, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s15, s15, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s14, s14, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s17, s17, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s16, s16, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s19, s19, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s18, s18, 16
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
@@ -2319,67 +2321,68 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspa
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[4:19], s[6:7], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s36, s19, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s37, s18, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s19, s19, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s37, s18, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s18, s18, s20
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s34, s17, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s35, s16, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s17, s17, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s35, s16, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s16, s16, s20
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s18
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s19
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s36
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s31, s15, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s33, s14, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s15, s15, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s33, s14, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s14, s14, s20
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s29, s13, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s35
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s17
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s34
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s29, s13, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s30, s12, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s13, s13, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s30, s12, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s12, s12, s20
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s27, s11, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s33
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s15
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s31
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s27, s11, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s28, s10, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s11, s11, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s28, s10, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s10, s10, s20
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s25, s9, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s30
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s13
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s29
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s25, s9, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s26, s8, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s9, s9, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s26, s8, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s8, s8, s20
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s23, s7, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s28
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s27
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s23, s7, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s24, s6, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s7, s7, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s24, s6, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s6, s6, s20
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s21, s5, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s26
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s25
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s21, s5, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s22, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s22, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, s20
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s24
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
@@ -2591,10 +2594,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspa
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_ashr_i32 s33, s17, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s34, s16, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s35, s19, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s36, s18, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s20, s5, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s21, s4, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s22, s7, 16
@@ -2607,6 +2606,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspa
 ; GCN-HSA-NEXT:    s_ashr_i32 s29, s12, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s30, s15, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s31, s14, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s33, s17, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s34, s16, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s35, s19, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s36, s18, 16
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
@@ -2674,18 +2677,18 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s24
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s7, s7
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s6, s6
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s23
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s22
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s4, s4
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s21
@@ -2717,56 +2720,57 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspa
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s36
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s19
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s35
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s30, s15, 16
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s31, s14, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s15, s15
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s14, s14
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s28, s13, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s17
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s33
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s28, s13, 16
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s29, s12, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s13, s13
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s12, s12
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s26, s11, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s31
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s15
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s30
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s26, s11, 16
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s27, s10, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s11, s11
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s10, s10
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s24, s9, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s29
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s13
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s28
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s24, s9, 16
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s25, s8, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s9, s9
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s8, s8
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s22, s7, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s27
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s26
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s22, s7, 16
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s23, s6, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s7, s7
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s6, s6
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s20, s5, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s25
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s24
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s20, s5, 16
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s21, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s4, s4
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s23
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
@@ -3100,6 +3104,11 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    s_and_b32 s53, s14, s37
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x10
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    s_and_b32 s18, s1, s37
+; GCN-HSA-NEXT:    s_and_b32 s19, s0, s37
+; GCN-HSA-NEXT:    s_and_b32 s54, s3, s37
+; GCN-HSA-NEXT:    s_and_b32 s55, s2, s37
+; GCN-HSA-NEXT:    s_and_b32 s56, s5, s37
 ; GCN-HSA-NEXT:    s_and_b32 s57, s4, s37
 ; GCN-HSA-NEXT:    s_and_b32 s58, s7, s37
 ; GCN-HSA-NEXT:    s_and_b32 s59, s6, s37
@@ -3110,10 +3119,13 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    s_and_b32 s64, s13, s37
 ; GCN-HSA-NEXT:    s_and_b32 s65, s12, s37
 ; GCN-HSA-NEXT:    s_and_b32 s66, s15, s37
-; GCN-HSA-NEXT:    s_and_b32 s54, s3, s37
-; GCN-HSA-NEXT:    s_and_b32 s55, s2, s37
-; GCN-HSA-NEXT:    s_and_b32 s56, s5, s37
+; GCN-HSA-NEXT:    s_and_b32 s37, s14, s37
+; GCN-HSA-NEXT:    s_lshr_b32 s67, s1, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s68, s0, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s3, s3, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s2, s2, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s5, s5, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s4, s4, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s7, s7, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s6, s6, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s9, s9, 16
@@ -3123,15 +3135,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    s_lshr_b32 s13, s13, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s12, s12, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s15, s15, 16
-; GCN-HSA-NEXT:    s_and_b32 s18, s1, s37
-; GCN-HSA-NEXT:    s_and_b32 s19, s0, s37
-; GCN-HSA-NEXT:    s_and_b32 s37, s14, s37
 ; GCN-HSA-NEXT:    s_lshr_b32 s14, s14, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s67, s1, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s68, s0, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s3, s3, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s2, s2, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s4, s4, 16
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xf0
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s1
@@ -3182,12 +3186,11 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s14
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s66
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s63
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s10
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s62
@@ -3197,8 +3200,9 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s60
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s9
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s57
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s55
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s55
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s56
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s2
@@ -3208,9 +3212,9 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s68
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s18
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s53
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s67
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s51
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s36
@@ -3284,18 +3288,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x40
 ; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[36:51], s[18:19], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s70, s15, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s60, s5, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s61, s4, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s62, s7, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s63, s6, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s64, s9, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s65, s8, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s66, s11, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s67, s10, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s68, s13, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s69, s12, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s15, s15, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s53, s1, s20
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s18, s37, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s19, s37, s20
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s21, s36, 16
@@ -3328,15 +3321,26 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s49, s51, s20
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s51, s50, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s50, s50, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s53, s1, s20
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s55, s0, s20
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s57, s3, s20
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s59, s2, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s60, s5, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s61, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s62, s7, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s7, s7, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s63, s6, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s6, s6, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s64, s9, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s65, s8, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s66, s11, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s67, s10, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s68, s13, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s69, s12, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s70, s15, s20
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s20, s14, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s15, s15, 16
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s14, s14, 16
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s52, s1, 16
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s54, s0, 16
@@ -3352,16 +3356,17 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s14
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s70
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s11, s11, 16
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s10, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s9, s9, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s69
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s12
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s68
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s13
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s9, s9, 16
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s8, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s67
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s10
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s66
@@ -3841,7 +3846,12 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s52, s18
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x10
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    s_ashr_i32 s53, s5, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s54, s4, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s55, s7, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s56, s6, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s57, s9, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s58, s8, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s59, s11, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s60, s10, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s61, s13, 16
@@ -3852,11 +3862,6 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    s_ashr_i32 s66, s16, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s67, s19, 16
 ; GCN-HSA-NEXT:    s_ashr_i32 s68, s18, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s53, s5, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s54, s4, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s55, s7, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s56, s6, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s58, s8, 16
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
@@ -3913,21 +3918,20 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s18, s18
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s4, s4
+; GCN-HSA-NEXT:    s_sext_i32_i16 s7, s7
+; GCN-HSA-NEXT:    s_sext_i32_i16 s6, s6
+; GCN-HSA-NEXT:    s_sext_i32_i16 s9, s9
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s8, s8
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s13, s13
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s12, s12
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s15, s15
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s14, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT:    s_sext_i32_i16 s7, s7
-; GCN-HSA-NEXT:    s_sext_i32_i16 s6, s6
-; GCN-HSA-NEXT:    s_sext_i32_i16 s9, s9
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s18
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s68
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s19
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s67
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s14
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s64
@@ -3938,8 +3942,9 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s13
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s61
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s58
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s9
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s57
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s56
@@ -3949,9 +3954,9 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s54
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s52
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s53
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s50
@@ -4028,8 +4033,6 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s70, s14, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s15, s15
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s14, s14
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s67, s13, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s68, s12, 16
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s18, s37, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s20, s37
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s22, s39, 16
@@ -4047,13 +4050,15 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s47, s51, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s49, s51
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s51, s1, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s53, s1
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s52, s0, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s53, s1
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s54, s0
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s55, s3, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s57, s3
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s56, s2, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s57, s3
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s58, s2
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s67, s13, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s68, s12, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s13, s13
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s12, s12
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
@@ -4064,56 +4069,55 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s70
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s15
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s69
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s65, s11, 16
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s66, s10, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s11, s11
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s10, s10
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s63, s9, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s68
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s13
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s67
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s63, s9, 16
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s64, s8, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s9, s9
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s8, s8
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s61, s7, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s66
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s65
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s61, s7, 16
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s62, s6, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s7, s7
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s6, s6
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s59, s5, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s64
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s63
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s59, s5, 16
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s60, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s4, s4
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s19, s36, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s62
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s61
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s19, s36, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s21, s36
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s60
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s59
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s21, s36
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s23, s38, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s58
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s56
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s57
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s55
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s23, s38, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s25, s38
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s27, s40, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s29, s40
@@ -4127,6 +4131,8 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s46, s48
 ; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s48, s50, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s50, s50
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s54
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s52
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s53
@@ -5014,10 +5020,10 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_mov_b32 s4, s3
 ; GCN-HSA-NEXT:    s_lshr_b32 s6, s2, 16
-; GCN-HSA-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[8:9], s[2:3], 0x100000
 ; GCN-HSA-NEXT:    s_ashr_i64 s[2:3], s[2:3], 48
 ; GCN-HSA-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
@@ -5048,10 +5054,10 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s5
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s5, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[6:7], s[4:5], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s4, 16
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s9
@@ -5158,10 +5164,10 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(
 ; GCN-HSA-NEXT:    s_lshr_b32 s2, s7, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s10, s6, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s11, s4, 16
-; GCN-HSA-NEXT:    s_and_b32 s3, s7, s8
 ; GCN-HSA-NEXT:    s_and_b32 s4, s4, s8
 ; GCN-HSA-NEXT:    s_and_b32 s6, s6, s8
 ; GCN-HSA-NEXT:    s_and_b32 s5, s5, s8
+; GCN-HSA-NEXT:    s_and_b32 s3, s7, s8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s3
@@ -5211,12 +5217,13 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s6, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s5, s5, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s11
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s4, 16
+; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
@@ -5333,14 +5340,14 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(
 ; GCN-HSA-NEXT:    s_mov_b32 s8, s5
 ; GCN-HSA-NEXT:    s_lshr_b32 s10, s6, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s12, s4, 16
-; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[14:15], s[4:5], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[16:17], s[6:7], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[6:7], s[6:7], 48
 ; GCN-HSA-NEXT:    s_ashr_i64 s[4:5], s[4:5], 48
+; GCN-HSA-NEXT:    s_ashr_i64 s[6:7], s[6:7], 48
 ; GCN-HSA-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x100000
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5400,17 +5407,18 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s19
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s7
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[8:9], s[4:5], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x100000
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s15
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s17
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s11
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s12
@@ -5559,7 +5567,6 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspa
 ; GCN-HSA-NEXT:    s_lshr_b32 s17, s10, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s18, s6, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s19, s4, 16
-; GCN-HSA-NEXT:    s_and_b32 s3, s9, s12
 ; GCN-HSA-NEXT:    s_and_b32 s4, s4, s12
 ; GCN-HSA-NEXT:    s_and_b32 s6, s6, s12
 ; GCN-HSA-NEXT:    s_and_b32 s10, s10, s12
@@ -5567,6 +5574,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspa
 ; GCN-HSA-NEXT:    s_and_b32 s5, s5, s12
 ; GCN-HSA-NEXT:    s_and_b32 s7, s7, s12
 ; GCN-HSA-NEXT:    s_and_b32 s11, s11, s12
+; GCN-HSA-NEXT:    s_and_b32 s3, s9, s12
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s3
@@ -5648,28 +5656,29 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspa
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s10, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s9, s9, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s8, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s19
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s10
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s8, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s7, s7, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s18
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s7, s7, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s6, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s17
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s8
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s6, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s5, s5, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s5, s5, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s15
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s4, 16
+; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
@@ -5859,18 +5868,17 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspa
 ; GCN-HSA-NEXT:    s_mov_b32 s16, s5
 ; GCN-HSA-NEXT:    s_lshr_b32 s18, s10, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s20, s8, 16
-; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[34:35], s[10:11], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[10:11], s[10:11], 48
 ; GCN-HSA-NEXT:    s_lshr_b32 s22, s6, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s24, s4, 16
-; GCN-HSA-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[34:35], s[10:11], 0x100000
+; GCN-HSA-NEXT:    s_ashr_i64 s[10:11], s[10:11], 48
+; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[26:27], s[4:5], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[28:29], s[6:7], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[30:31], s[8:9], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[8:9], s[8:9], 48
 ; GCN-HSA-NEXT:    s_ashr_i64 s[4:5], s[4:5], 48
 ; GCN-HSA-NEXT:    s_ashr_i64 s[6:7], s[6:7], 48
+; GCN-HSA-NEXT:    s_ashr_i64 s[8:9], s[8:9], 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s10
@@ -5881,6 +5889,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspa
 ; GCN-HSA-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x100000
 ; GCN-HSA-NEXT:    s_add_u32 s22, s0, 0x70
 ; GCN-HSA-NEXT:    s_addc_u32 s23, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s8
@@ -5888,8 +5897,8 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s22
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s9
 ; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s9
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s23
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s9
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s12
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s13
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s8
@@ -5910,28 +5919,28 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x60
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x60
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 64
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s35
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s18
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 64
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s30
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s31
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s20
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s21
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
@@ -5964,13 +5973,13 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspa
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s10, 16
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[34:35], s[10:11], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s11
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[36:37], s[10:11], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s11, 16
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[24:25], s[8:9], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s8, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[36:37], s[10:11], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s11, 16
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[26:27], s[8:9], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s9
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[18:19], s[6:7], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s6, 16
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[28:29], s[8:9], 0x100000
@@ -5979,45 +5988,46 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspa
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s10
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s11
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[20:21], s[6:7], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, s7
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[22:23], s[6:7], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s30
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s31
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s34
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s35
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[22:23], s[6:7], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s7, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s28
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s29
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s8
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s14, s5
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s24
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s25
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s26
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s27
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s14, s5
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s16, s5, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[12:13], s[4:5], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s22
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s23
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s7
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[12:13], s[4:5], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x100000
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s18
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s19
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s20
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s21
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s15
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s16
@@ -6257,7 +6267,6 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-HSA-NEXT:    s_and_b32 s35, s15, s20
 ; GCN-HSA-NEXT:    s_and_b32 s36, s17, s20
 ; GCN-HSA-NEXT:    s_and_b32 s20, s19, s20
-; GCN-HSA-NEXT:    s_lshr_b32 s19, s19, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s5, s5, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s7, s7, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s9, s9, 16
@@ -6265,6 +6274,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-HSA-NEXT:    s_lshr_b32 s13, s13, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s15, s15, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s17, s17, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s19, s19, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s18, s18, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s16, s16, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s14, s14, 16
@@ -6289,9 +6299,9 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s20
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s19
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s36
@@ -6302,9 +6312,9 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s15
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s13
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s33
@@ -6418,60 +6428,61 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s18, s18, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s20
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s19
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s17, s17, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s16, s16, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s36
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s18
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s16, s16, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s15, s15, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s35
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s17
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s15, s15, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s14, s14, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s14, s14, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s13, s13, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s33
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s15
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s13, s13, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s12, s12, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s31
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s14
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s12, s12, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s11, s11, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s30
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s13
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s11, s11, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s10, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s29
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s12
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s10, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s9, s9, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s28
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s9, s9, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s8, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s27
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s10
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s8, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s7, s7, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s26
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s7, s7, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s6, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s25
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s8
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s6, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s5, s5, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s24
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s5, s5, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s4, 16
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s23
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s4, s4, 16
+; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s22
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
@@ -6816,27 +6827,22 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-HSA-NEXT:    s_lshr_b32 s66, s2, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s68, s0, 16
 ; GCN-HSA-NEXT:    s_bfe_i64 s[18:19], s[0:1], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[36:37], s[0:1], 48
-; GCN-HSA-NEXT:    s_ashr_i64 s[0:1], s[14:15], 48
 ; GCN-HSA-NEXT:    s_bfe_i64 s[20:21], s[2:3], 0x100000
+; GCN-HSA-NEXT:    s_ashr_i64 s[36:37], s[0:1], 48
 ; GCN-HSA-NEXT:    s_ashr_i64 s[70:71], s[2:3], 48
+; GCN-HSA-NEXT:    s_ashr_i64 s[0:1], s[14:15], 48
 ; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[38:39], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[74:75], s[6:7], 48
-; GCN-HSA-NEXT:    s_ashr_i64 s[76:77], s[8:9], 48
-; GCN-HSA-NEXT:    s_ashr_i64 s[78:79], s[10:11], 48
-; GCN-HSA-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[22:23], s[4:5], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[24:25], s[6:7], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[26:27], s[8:9], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[28:29], s[10:11], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[30:31], s[12:13], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[12:13], s[12:13], 48
 ; GCN-HSA-NEXT:    s_bfe_i64 s[34:35], s[14:15], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[22:23], s[4:5], 0x100000
 ; GCN-HSA-NEXT:    s_ashr_i64 s[72:73], s[4:5], 48
+; GCN-HSA-NEXT:    s_ashr_i64 s[74:75], s[6:7], 48
+; GCN-HSA-NEXT:    s_ashr_i64 s[76:77], s[8:9], 48
+; GCN-HSA-NEXT:    s_ashr_i64 s[78:79], s[10:11], 48
+; GCN-HSA-NEXT:    s_ashr_i64 s[12:13], s[12:13], 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s0
@@ -6851,6 +6857,11 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-HSA-NEXT:    s_bfe_i64 s[38:39], s[54:55], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[52:53], s[52:53], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[50:51], s[50:51], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x100000
 ; GCN-HSA-NEXT:    s_add_u32 s54, s16, 0xf0
 ; GCN-HSA-NEXT:    s_addc_u32 s55, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s12
@@ -6897,10 +6908,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s13
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s12
 ; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0xc0
-; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s55
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s13
+; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s42
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s43
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s78
@@ -6911,19 +6920,21 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s77
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s49
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s50
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s72
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s50
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s73
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s51
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s70
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s71
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s13
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s52
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s53
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s36
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s37
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s34
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s35
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s30
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s38
@@ -7024,27 +7035,27 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[22:23], s[4:5], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[68:69], s[14:15], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s14, s14, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[4:5], s[24:25], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[24:25], s[28:29], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[28:29], s[34:35], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[34:35], s[38:39], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[38:39], s[42:43], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[42:43], s[48:49], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[48:49], s[54:55], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[54:55], s[60:61], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[60:61], s[66:67], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[66:67], s[72:73], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s16
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s17
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[4:5], s[24:25], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[16:17], s[26:27], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[24:25], s[28:29], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[26:27], s[30:31], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[28:29], s[34:35], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[30:31], s[36:37], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[34:35], s[38:39], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[36:37], s[40:41], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[38:39], s[42:43], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[40:41], s[46:47], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[42:43], s[48:49], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[46:47], s[52:53], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[48:49], s[54:55], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[52:53], s[58:59], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[54:55], s[60:61], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[58:59], s[64:65], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[60:61], s[66:67], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[64:65], s[70:71], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[66:67], s[72:73], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[20:21], s[2:3], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
@@ -7053,54 +7064,55 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s65
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s66
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s67
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[62:63], s[12:13], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s12, s12, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s68
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s69
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s14
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s15
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[56:57], s[10:11], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s58
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s59
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s60
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s61
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[56:57], s[10:11], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s10, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s62
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s63
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s12
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s13
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[50:51], s[8:9], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s52
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s53
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s54
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s55
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[50:51], s[8:9], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s8, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s56
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s57
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s10
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s11
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[44:45], s[6:7], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s46
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s47
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s48
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s49
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[44:45], s[6:7], 0x100000
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s6, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s50
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s51
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s8
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s40
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s41
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s42

diff  --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 161f03c4b8795..481ab8d39647d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -525,8 +525,8 @@ define amdgpu_kernel void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
@@ -1725,8 +1725,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-HSA-NEXT:    v_ashr_i64 v[7:8], v[3:4], 48
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v3
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v3, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v2, v4, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v0, v3, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v7
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[5:6], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
@@ -2177,8 +2177,8 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s1
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    s_mov_b32 s4, 0xffff
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
@@ -2189,16 +2189,16 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v9, s4, v7
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v9, s4, v7
 ; GCN-HSA-NEXT:    v_and_b32_e32 v7, s4, v6
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[11:12], v[7:10]
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, s4, v3
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, s4, v1
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, s4, v1
 ; GCN-HSA-NEXT:    v_and_b32_e32 v6, s4, v0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v12, s4, v3
 ; GCN-HSA-NEXT:    v_and_b32_e32 v10, s4, v2
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
@@ -2415,22 +2415,22 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
-; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s1
+; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v0
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v0, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v10, v1, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v8, v0, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v3
-; GCN-HSA-NEXT:    v_bfe_i32 v14, v3, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 16, v2
+; GCN-HSA-NEXT:    v_bfe_i32 v14, v3, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v12, v2, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 16, v1
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
@@ -2438,8 +2438,8 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v5
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 16, v7
-; GCN-HSA-NEXT:    v_bfe_i32 v9, v7, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 16, v6
+; GCN-HSA-NEXT:    v_bfe_i32 v9, v7, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v7, v6, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v4
 ; GCN-HSA-NEXT:    v_bfe_i32 v2, v5, 0, 16
@@ -2469,12 +2469,12 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 16, v6
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v7, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v6, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v1, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 16, v0
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v1, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v0, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v15, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v3, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v13, 16, v2
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v3, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v12, v2, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v5
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v4
@@ -2682,12 +2682,12 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 48
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
@@ -2726,41 +2726,41 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v18, s14, v7
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, s14, v7
 ; GCN-HSA-NEXT:    v_and_b32_e32 v16, s14, v6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s11
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[16:19]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[16:19]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s3
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v13
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, s14, v13
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v12
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, s14, v13
 ; GCN-HSA-NEXT:    v_and_b32_e32 v4, s14, v12
+; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s2
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v15
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, s14, v15
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v14
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, s14, v15
 ; GCN-HSA-NEXT:    v_and_b32_e32 v15, s14, v14
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[19:20], v[15:18]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[19:20], v[15:18]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v8
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, s14, v9
 ; GCN-HSA-NEXT:    v_and_b32_e32 v15, s14, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s1
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, s14, v9
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, s14, v3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, s14, v3
 ; GCN-HSA-NEXT:    v_and_b32_e32 v3, s14, v2
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v11
-; GCN-HSA-NEXT:    v_and_b32_e32 v13, s14, v11
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v12, 16, v10
+; GCN-HSA-NEXT:    v_and_b32_e32 v13, s14, v11
 ; GCN-HSA-NEXT:    v_and_b32_e32 v11, s14, v10
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[15:18]
@@ -3092,8 +3092,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 32
@@ -3102,8 +3102,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
@@ -3124,8 +3124,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v19, 16, v3
-; GCN-HSA-NEXT:    v_bfe_i32 v18, v3, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 16, v2
+; GCN-HSA-NEXT:    v_bfe_i32 v18, v3, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v16, v2, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
@@ -3138,8 +3138,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v5, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v4
+; GCN-HSA-NEXT:    v_bfe_i32 v2, v5, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v4, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
@@ -3149,8 +3149,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v7
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v7, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v6
+; GCN-HSA-NEXT:    v_bfe_i32 v2, v7, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v6, 0, 16
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v9
@@ -3165,9 +3165,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v10
 ; GCN-HSA-NEXT:    v_bfe_i32 v2, v11, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v10, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[4:7]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(6)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v13
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v12
@@ -3200,8 +3200,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v19, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v3, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 16, v2
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v3, 0, 16
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v35, 16, v13
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v33, 16, v12
@@ -3209,24 +3209,24 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v32, v12, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v2, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v23, 16, v1
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v22, v1, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v21, 16, v0
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v22, v1, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v20, v0, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v7
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v7, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v6
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v7, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v6, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v27, 16, v5
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v26, v5, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v25, 16, v4
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v26, v5, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v24, v4, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 16, v11
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v11, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v5, 16, v10
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v11, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v10, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v31, 16, v9
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v30, v9, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v29, 16, v8
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v30, v9, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v28, v8, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 16, v15
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 16, v14
@@ -3617,16 +3617,16 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    s_add_u32 s6, s2, 48
-; GCN-HSA-NEXT:    s_addc_u32 s7, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s3
+; GCN-HSA-NEXT:    s_addc_u32 s7, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 64
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s3
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s2
@@ -3658,8 +3658,8 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_addc_u32 s13, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v26, s17, v3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v26, s17, v3
 ; GCN-HSA-NEXT:    v_and_b32_e32 v24, s17, v2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s13
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
@@ -3673,8 +3673,8 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s10
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v7
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v7
 ; GCN-HSA-NEXT:    v_and_b32_e32 v0, s17, v6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s11
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -3682,27 +3682,27 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v9
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v9
 ; GCN-HSA-NEXT:    v_and_b32_e32 v0, s17, v8
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s1
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v11
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v11
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v10
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v11
 ; GCN-HSA-NEXT:    v_and_b32_e32 v0, s17, v10
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s7
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s5
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s4
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(6)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v33
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v32
 ; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v33
 ; GCN-HSA-NEXT:    v_and_b32_e32 v0, s17, v32
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s4
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v35
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v34
 ; GCN-HSA-NEXT:    v_and_b32_e32 v6, s17, v35
@@ -3715,12 +3715,12 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, s15
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v29
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v28
 ; GCN-HSA-NEXT:    v_and_b32_e32 v10, s17, v29
 ; GCN-HSA-NEXT:    v_and_b32_e32 v8, s17, v28
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, s15
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v30
@@ -3733,26 +3733,26 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, s16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v21
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v20
 ; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v21
 ; GCN-HSA-NEXT:    v_and_b32_e32 v0, s17, v20
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v22
 ; GCN-HSA-NEXT:    v_and_b32_e32 v6, s17, v23
 ; GCN-HSA-NEXT:    v_and_b32_e32 v4, s17, v22
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v15
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v15
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v14
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, s17, v14
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, s17, v15
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v13
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, s17, v13
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v12
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, s17, v14
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, s17, v13
 ; GCN-HSA-NEXT:    v_and_b32_e32 v4, s17, v12
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v15, 16, v17
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v16
@@ -3767,15 +3767,15 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
 ; GCN-HSA-NEXT:    v_and_b32_e32 v10, s17, v19
 ; GCN-HSA-NEXT:    v_and_b32_e32 v8, s17, v18
-; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
-; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; GCN-HSA-NEXT:    s_nop 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
@@ -3785,8 +3785,8 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s90, -1
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s91, 0xe80000
@@ -3878,8 +3878,8 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v58, s0, v58
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v57
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, s0, v57
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s0, v56
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v56
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s0, v56
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:240
@@ -4404,8 +4404,8 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 64
@@ -4446,8 +4446,8 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v27, 16, v3
-; GCN-HSA-NEXT:    v_bfe_i32 v26, v3, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v25, 16, v2
+; GCN-HSA-NEXT:    v_bfe_i32 v26, v3, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v24, v2, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
@@ -4462,11 +4462,11 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v7
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v7, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v6
+; GCN-HSA-NEXT:    v_bfe_i32 v2, v7, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v6, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
@@ -4475,8 +4475,8 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_bfe_i32 v2, v13, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v12, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v9
@@ -4484,27 +4484,27 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_bfe_i32 v2, v9, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v8, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v15
-; GCN-HSA-NEXT:    v_bfe_i32 v6, v15, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v14
+; GCN-HSA-NEXT:    v_bfe_i32 v6, v15, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v4, v14, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v14, 16, v11
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v14, 16, v11
-; GCN-HSA-NEXT:    v_bfe_i32 v13, v11, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v12, 16, v10
+; GCN-HSA-NEXT:    v_bfe_i32 v13, v11, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v11, v10, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[11:14]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(11)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v17
@@ -4531,13 +4531,13 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, s10
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(12)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 16, v23
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v22
 ; GCN-HSA-NEXT:    v_bfe_i32 v10, v23, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v8, v22, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, s10
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(12)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v29
@@ -4550,17 +4550,17 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v30
 ; GCN-HSA-NEXT:    v_bfe_i32 v10, v31, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v8, v30, 0, 16
-; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v21
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v20
 ; GCN-HSA-NEXT:    v_bfe_i32 v2, v21, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v20, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
+; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(14)
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v33
@@ -4581,8 +4581,8 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ;
 ; GCN-NOHSA-VI-LABEL: global_sextload_v64i16_to_v64i32:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s90, -1
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s91, 0xe80000
@@ -4611,8 +4611,8 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v51, 16, v9
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v31, 16, v15
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v30, v15, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v29, 16, v14
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v30, v15, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v28, v14, 0, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dword v28, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
@@ -4620,54 +4620,54 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dword v30, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dword v31, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[60:63], off, s[8:11], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v50, v9, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v58, v1, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v56, v0, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v31, 16, v19
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v30, v19, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v29, 16, v18
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v30, v19, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v28, v18, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v39, 16, v17
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v38, v17, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v37, 16, v16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v38, v17, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v36, v16, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v19, 16, v23
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v23, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 16, v22
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v23, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v22, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v43, 16, v21
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v42, v21, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v41, 16, v20
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v42, v21, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v40, v20, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v23, 16, v27
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v22, v27, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v21, 16, v26
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v22, v27, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v20, v26, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v47, 16, v25
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v46, v25, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v45, 16, v24
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v46, v25, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v44, v24, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v27, 16, v11
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v26, v11, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v25, 16, v10
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v26, v11, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v24, v10, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v49, 16, v8
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v50, v9, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v48, v8, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v3, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v3, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v58, v1, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v56, v0, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v35, 16, v13
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v34, v13, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v33, 16, v12
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v34, v13, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v32, v12, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v15, 16, v7
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v7, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v13, 16, v6
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v7, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v12, v6, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v55, 16, v5
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v54, v5, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v53, 16, v4
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v54, v5, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v52, v4, 0, 16
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v61
@@ -5765,8 +5765,8 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, v1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, v1
@@ -5792,8 +5792,8 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx2 v[8:9], off, s[8:11], 0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, 0xffff
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, 0
@@ -5952,8 +5952,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v4, v2
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v5, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v4, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v5, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v1, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v3, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
@@ -6080,8 +6080,8 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)
 ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-HSA-NEXT:    s_mov_b32 s4, 0xffff
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v12
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v12
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6093,11 +6093,11 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
+; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v12
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
@@ -6134,8 +6134,8 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, 0xffff
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v17, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v19, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v17, 0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v7, 0
@@ -6314,21 +6314,21 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
-; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT:    v_bfe_i32 v4, v1, 0, 16
 ; GCN-HSA-NEXT:    v_ashr_i64 v[6:7], v[0:1], 48
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v1, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v11, v3
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v2, 0, 16
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GCN-HSA-NEXT:    v_bfe_i32 v4, v0, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v8, v2, 0, 16
 ; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[2:3], 48
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v11, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v6, v1, 0, 16
@@ -6359,15 +6359,15 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v11, v3
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v12, v11, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v3, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v2, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v0, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v1, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v5, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v2, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v7, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v10, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
@@ -6554,8 +6554,8 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-HSA-NEXT:    s_mov_b32 s6, 0xffff
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6585,39 +6585,39 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x70
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
 ; GCN-HSA-NEXT:    v_and_b32_e32 v9, s6, v1
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s3
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s4
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
 ; GCN-HSA-NEXT:    v_and_b32_e32 v9, s6, v3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s3
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GCN-HSA-NEXT:    v_and_b32_e32 v7, s6, v7
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, 0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
 ; GCN-HSA-NEXT:    v_and_b32_e32 v11, s6, v6
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[15:16], v[7:10]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[17:18], v[11:14]
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s1
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 0x60
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, v8
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v6, s6, v0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
 ; GCN-HSA-NEXT:    v_and_b32_e32 v11, s6, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, s6, v0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
 ; GCN-HSA-NEXT:    v_and_b32_e32 v0, s6, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
@@ -6642,8 +6642,8 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, 0xffff
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v28, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v30, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v28, 0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v25, v28
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v21, v28
@@ -6657,8 +6657,8 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, s0, v0
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s0, v2
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, s0, v1
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, s0, v2
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, s0, v3
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v3
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
@@ -6671,14 +6671,15 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v5
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v4, v28
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v27, 0
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v28
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
@@ -6955,8 +6956,8 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v1, 0, 16
 ; GCN-HSA-NEXT:    v_ashr_i64 v[10:11], v[0:1], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v8, v1, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
@@ -6966,42 +6967,42 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GCN-HSA-NEXT:    v_bfe_i32 v8, v2, 0, 16
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v10, v1, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT:    v_bfe_i32 v10, v1, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v5, 0, 16
-; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[4:5], 48
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
+; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[4:5], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v0, v5, 0, 16
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, v7
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, v7
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
 ; GCN-HSA-NEXT:    v_bfe_i32 v10, v8, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v8, v6, 0, 16
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v4, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v4, v11, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v8, v6, 0, 16
 ; GCN-HSA-NEXT:    v_ashr_i64 v[6:7], v[6:7], 48
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v1, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v11, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s1
+; GCN-HSA-NEXT:    v_bfe_i32 v2, v1, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s0
@@ -7060,8 +7061,8 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v1, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v1, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v4, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v1, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v0, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v2, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v5, 0, 16
@@ -7412,8 +7413,8 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-HSA-NEXT:    s_mov_b32 s16, 0xffff
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7474,28 +7475,28 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s12
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[2:5]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s4
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v19
 ; GCN-HSA-NEXT:    v_and_b32_e32 v2, s16, v19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s4
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[2:5]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s7
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v17
 ; GCN-HSA-NEXT:    v_and_b32_e32 v2, s16, v17
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s6
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[2:5]
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s9
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v15
 ; GCN-HSA-NEXT:    v_and_b32_e32 v2, s16, v15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s8
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[2:5]
-; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0xe0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v18
 ; GCN-HSA-NEXT:    v_and_b32_e32 v2, s16, v18
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s1
+; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0xe0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, v7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s0
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
@@ -7530,19 +7531,19 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s6
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[9:10], v[2:5]
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v21
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[15:16], v[0:3]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-HSA-NEXT:    v_and_b32_e32 v9, s16, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v20
 ; GCN-HSA-NEXT:    v_and_b32_e32 v4, s16, v20
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, v1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s5
-; GCN-HSA-NEXT:    s_add_u32 s0, s0, 64
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT:    s_add_u32 s0, s0, 64
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v8
 ; GCN-HSA-NEXT:    v_and_b32_e32 v12, s16, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, v1
@@ -7590,18 +7591,18 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v36, s0, v37
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v37, 0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v1, s0, v3
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, s0, v0
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v1, s0, v3
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, s0, v2
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, s0, v5
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, s0, v4
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v4
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, s0, v6
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, s0, v7
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, s0, v6
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v39, s0, v32
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v41, 16, v32
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v32, s0, v34
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v42, s0, v31
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v32, s0, v34
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v45, s0, v33
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v51, s0, v35
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v55, v37
@@ -7619,33 +7620,33 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v48, 0
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v7
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v44, 16, v31
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v45, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v43, v37
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v45, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v5, v37
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v2
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v2
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v35, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v33, v37
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v42, 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v40, v37
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v42, 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v31, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v29, v37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, v37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v13, v37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v9, v37
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v39, 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v25, v37
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v39, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v21, v37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v17, v37
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
@@ -8114,8 +8115,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 32
@@ -8124,8 +8125,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
@@ -8135,8 +8136,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s4
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
-; GCN-HSA-NEXT:    v_bfe_i32 v16, v1, 0, 16
 ; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[0:1], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v16, v1, 0, 16
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
@@ -8151,13 +8152,13 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
 ; GCN-HSA-NEXT:    s_add_u32 s10, s0, 0x70
 ; GCN-HSA-NEXT:    s_addc_u32 s11, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s12, s0, 0x50
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-HSA-NEXT:    s_add_u32 s12, s0, 0x50
 ; GCN-HSA-NEXT:    v_bfe_i32 v16, v1, 0, 16
-; GCN-HSA-NEXT:    s_addc_u32 s13, s1, 0
 ; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[2:3], 48
-; GCN-HSA-NEXT:    s_add_u32 s14, s0, 32
+; GCN-HSA-NEXT:    s_addc_u32 s13, s1, 0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GCN-HSA-NEXT:    s_add_u32 s14, s0, 32
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
 ; GCN-HSA-NEXT:    s_addc_u32 s15, s1, 0
@@ -8170,16 +8171,16 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[1:2], v[16:19]
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v1, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v2, v1, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s5
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(6)
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v5, 0, 16
 ; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[4:5], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v0, v5, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s4
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
@@ -8192,8 +8193,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s9
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v9, 0, 16
 ; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[8:9], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v0, v9, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s8
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
@@ -8206,19 +8207,19 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s13
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v13, 0, 16
 ; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[12:13], 48
+; GCN-HSA-NEXT:    v_bfe_i32 v0, v13, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s12
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v15
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[14:15], 48
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s10
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
@@ -8236,18 +8237,18 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_bfe_i32 v2, v7, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v14, 0, 16
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v12
 ; GCN-HSA-NEXT:    v_bfe_i32 v0, v12, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v4, v14, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v14, v15, 0, 16
 ; GCN-HSA-NEXT:    v_bfe_i32 v12, v10, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v14, v15, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
@@ -8255,19 +8256,19 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v8
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
-; GCN-HSA-NEXT:    v_bfe_i32 v10, v11, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
 ; GCN-HSA-NEXT:    v_bfe_i32 v8, v8, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT:    v_bfe_i32 v10, v11, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GCN-HSA-NEXT:    s_add_u32 s0, s0, 64
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_bfe_i32 v6, v6, 0, 16
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT:    s_add_u32 s0, s0, 64
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
 ; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
@@ -8300,9 +8301,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v14
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v16, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v14, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v14, v15
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v14, v15
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v14, 0, 16
@@ -8322,13 +8323,13 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v14, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v16, v11
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v10
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v11
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v10, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v10
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v11, v16, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v12, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v10, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v17, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
@@ -8378,10 +8379,10 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v20, v0, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v12, v1, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v11, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v2, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v1, v4, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v3, v3, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v11, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v2, 0, 16
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 31, v10

diff  --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
index 6aa75718031c3..288bb96b6cd85 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
@@ -131,9 +131,9 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_or_b32_e32 v0, v9, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v6
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v0
@@ -161,7 +161,6 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX6-NEXT:    ds_read_u8 v8, v0
 ; GFX6-NEXT:    v_add_i32_e32 v9, vcc, 14, v0
 ; GFX6-NEXT:    v_add_i32_e32 v10, vcc, 3, v0
-; GFX6-NEXT:    v_add_i32_e32 v11, vcc, 2, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -176,6 +175,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 13, v0
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
 ; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 15, v0
+; GFX6-NEXT:    v_add_i32_e32 v11, vcc, 2, v0
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX6-NEXT:    ds_read_u8 v4, v4
 ; GFX6-NEXT:    ds_read_u8 v5, v5
@@ -187,13 +187,13 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX6-NEXT:    ds_read_u8 v0, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v7
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v9
 ; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX6-NEXT:    v_or_b32_e32 v4, v4, v9
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
@@ -324,8 +324,8 @@ define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
index 0d8c3424cb837..85857f0f93b7e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
@@ -106,12 +106,11 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index ddd40b0cc343f..217ad227e04c0 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -113,8 +113,8 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out) {
 ; MUBUF-NEXT:    s_and_b32 s33, s33, 0xfff80000
 ; MUBUF-NEXT:    v_lshrrev_b32_e64 v3, 6, s33
 ; MUBUF-NEXT:    v_add_u32_e32 v3, 0x1000, v3
-; MUBUF-NEXT:    v_mov_b32_e32 v4, 0
 ; MUBUF-NEXT:    v_add_u32_e32 v2, 64, v3
+; MUBUF-NEXT:    v_mov_b32_e32 v4, 0
 ; MUBUF-NEXT:    s_mov_b32 s4, 0
 ; MUBUF-NEXT:    s_add_i32 s32, s32, 0x180000
 ; MUBUF-NEXT:    buffer_store_dword v4, off, s[0:3], s33
@@ -224,9 +224,9 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(<3 x i64> addrspace(1
 ; MUBUF-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    v_or_b32_e32 v2, 0x12d0, v0
+; MUBUF-NEXT:    v_or_b32_e32 v1, 0x12c0, v0
 ; MUBUF-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:    v_or_b32_e32 v1, 0x12c0, v0
 ; MUBUF-NEXT:    v_or_b32_e32 v2, 0x12c4, v0
 ; MUBUF-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
index 2ac06d9240d22..7c2607103fd5b 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
@@ -113,10 +113,10 @@ define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 ad
 ; GCN-LABEL: muli24_shl64:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; GCN-NEXT:    buffer_load_dword v1, v[1:2], s[0:3], 0 addr64

diff  --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index d4fa0b3386b29..4a35b3fd54811 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -27,9 +27,9 @@ define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    s_lshr_b32 s1, s5, 16
 ; VI-NEXT:    s_lshr_b32 s6, s0, 16
-; VI-NEXT:    s_lshr_b32 s1, s1, s6
 ; VI-NEXT:    s_and_b32 s5, s5, s4
 ; VI-NEXT:    s_and_b32 s0, s0, s4
+; VI-NEXT:    s_lshr_b32 s1, s1, s6
 ; VI-NEXT:    s_lshr_b32 s0, s5, s0
 ; VI-NEXT:    s_lshl_b32 s1, s1, 16
 ; VI-NEXT:    s_or_b32 s0, s0, s1
@@ -49,9 +49,9 @@ define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s1, s2, 16
 ; CI-NEXT:    s_lshr_b32 s8, s0, 16
-; CI-NEXT:    s_lshr_b32 s1, s1, s8
 ; CI-NEXT:    s_and_b32 s2, s2, s3
 ; CI-NEXT:    s_and_b32 s0, s0, s3
+; CI-NEXT:    s_lshr_b32 s1, s1, s8
 ; CI-NEXT:    s_lshr_b32 s0, s2, s0
 ; CI-NEXT:    s_lshl_b32 s1, s1, 16
 ; CI-NEXT:    s_or_b32 s0, s0, s1
@@ -125,9 +125,9 @@ define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; CI-NEXT:    v_and_b32_e32 v2, s0, v2
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; CI-NEXT:    v_and_b32_e32 v2, s0, v2
 ; CI-NEXT:    v_and_b32_e32 v3, s0, v3
 ; CI-NEXT:    v_lshr_b32_e32 v2, v2, v3
 ; CI-NEXT:    v_lshr_b32_e32 v3, v4, v5
@@ -344,8 +344,8 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b16_e64 v2, v3, 8
@@ -517,13 +517,13 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
 ; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; CI-NEXT:    v_and_b32_e32 v2, s0, v2
 ; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; CI-NEXT:    v_and_b32_e32 v3, s0, v3
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; CI-NEXT:    v_and_b32_e32 v2, s0, v2
 ; CI-NEXT:    v_and_b32_e32 v4, s0, v4
-; CI-NEXT:    v_and_b32_e32 v3, s0, v3
+; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
 ; CI-NEXT:    v_and_b32_e32 v5, s0, v5
 ; CI-NEXT:    v_lshr_b32_e32 v3, v3, v5
 ; CI-NEXT:    v_lshr_b32_e32 v5, v7, v9

diff  --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index 28e4cdf8dd467..b806ec200d8f2 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -122,8 +122,8 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_load_dword v7, v[2:3]
 ; VI-NEXT:    flat_load_ushort v8, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index 12eef07032da5..777ee633a9fb8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -99,8 +99,6 @@ define amdgpu_kernel void @local_nontemporal_load_0(
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
-;
-;
     i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
@@ -203,8 +201,6 @@ define amdgpu_kernel void @local_nontemporal_load_1(
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
-;
-;
     i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -247,8 +243,8 @@ define amdgpu_kernel void @local_nontemporal_store_0(
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
@@ -260,8 +256,8 @@ define amdgpu_kernel void @local_nontemporal_store_0(
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
@@ -285,8 +281,8 @@ define amdgpu_kernel void @local_nontemporal_store_0(
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
@@ -297,14 +293,12 @@ define amdgpu_kernel void @local_nontemporal_store_0(
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
-;
-;
     i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
 entry:
   %val = load i32, i32 addrspace(1)* %in, align 4
@@ -347,8 +341,8 @@ define amdgpu_kernel void @local_nontemporal_store_1(
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
@@ -360,8 +354,8 @@ define amdgpu_kernel void @local_nontemporal_store_1(
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
@@ -386,8 +380,8 @@ define amdgpu_kernel void @local_nontemporal_store_1(
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
@@ -398,14 +392,12 @@ define amdgpu_kernel void @local_nontemporal_store_1(
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
-;
-;
     i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index 6112b70a4b86a..3e7e8db05e299 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -193,8 +193,8 @@ define amdgpu_kernel void @local_volatile_store_0(
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
@@ -206,8 +206,8 @@ define amdgpu_kernel void @local_volatile_store_0(
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
@@ -267,8 +267,8 @@ define amdgpu_kernel void @local_volatile_store_1(
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
@@ -280,8 +280,8 @@ define amdgpu_kernel void @local_volatile_store_1(
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index a1a6b64391b07..7bc0fcf4dd742 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -85,9 +85,9 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    s_add_u32 s8, s8, s3
 ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s9, s9, 0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -125,8 +125,6 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
-;
-;
     i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0
@@ -215,9 +213,9 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; SKIP-CACHE-INV-NEXT:    s_add_u32 s8, s8, s3
 ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s9, s9, 0
-; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -255,8 +253,6 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
-;
-;
     i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -382,8 +378,6 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
-;
-;
     i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
 entry:
   %val = load i32, i32 addrspace(1)* %in, align 4
@@ -400,9 +394,9 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; GFX6-NEXT:    s_add_u32 s8, s8, s7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX6-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
@@ -417,9 +411,9 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; GFX7-NEXT:    s_add_u32 s8, s8, s7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
@@ -510,8 +504,6 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
-;
-;
     i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index fe4d6d65248a3..a8dac1389ebb9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -85,9 +85,9 @@ define amdgpu_kernel void @private_volatile_load_0(
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    s_add_u32 s8, s8, s3
 ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s9, s9, 0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
@@ -183,9 +183,9 @@ define amdgpu_kernel void @private_volatile_load_1(
 ; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; SKIP-CACHE-INV-NEXT:    s_add_u32 s8, s8, s3
 ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s9, s9, 0
-; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
@@ -327,9 +327,9 @@ define amdgpu_kernel void @private_volatile_store_1(
 ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; GFX7-NEXT:    s_add_u32 s8, s8, s7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0

diff  --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index a0cab6b8d7140..ab39fae82d37e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -81,11 +81,11 @@ define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocaptu
 ; GCN-NEXT:    s_load_dwordx4 s[12:15], s[16:17], 0x30
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-NEXT:    v_mov_b32_e32 v8, s8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-NEXT:    v_mov_b32_e32 v8, s8
 ; GCN-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-NEXT:    v_mov_b32_e32 v6, s6
 ; GCN-NEXT:    v_mov_b32_e32 v7, s7
@@ -116,18 +116,18 @@ define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocaptu
 ; GCN-SCRATCH-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x30
 ; GCN-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v4, s4
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v3, s3
-; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v8, s8
+; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v4, s4
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v6, s6
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v7, s7
-; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v12, s12
+; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v8, s8
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v9, s9
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v10, s10
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v11, s11
+; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v12, s12
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v13, s13
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v14, s14
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v15, s15

diff  --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 4f720fff6c666..741b33b13667d 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -110,9 +110,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, [8
 ; GCN: s_load_dword s
 ; GCN: s_load_dword s
 
-; SI: s_ashr_i32
 ; SI: s_ashr_i32
 ; SI: s_sext_i32_i16
+; SI: s_ashr_i32
 ; SI: s_sext_i32_i16
 ; SI: s_min_i32
 ; SI: s_min_i32

diff  --git a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
index 2144b4b4cb220..22614903d8c36 100644
--- a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
@@ -9,11 +9,11 @@
 ; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 
 ; GCN-NOT: v_mov_b32
-; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
 ; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
-; GCN-NOT: v_mov_b32
+; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
 ; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
 ; GCN-NOT: v_mov_b32
+; GCN-NOT: v_mov_b32
 
 ; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]]
 ; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]

diff  --git a/llvm/test/CodeGen/AMDGPU/mul.i16.ll b/llvm/test/CodeGen/AMDGPU/mul.i16.ll
index e393686996bbc..bc42cc7fd0f80 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.i16.ll
@@ -80,9 +80,9 @@ define <3 x i16> @v_mul_v3i16(<3 x i16> %a, <3 x i16> %b) {
 ; SI: v_mul_u32_u24
 
 ; VI: v_mul_lo_u16_sdwa
-; VI: v_mul_lo_u16_e32
 ; VI: v_mul_lo_u16_sdwa
 ; VI: v_mul_lo_u16_e32
+; VI: v_mul_lo_u16_e32
 ; VI: v_or_b32_e32
 ; VI: v_or_b32_e32
 

diff  --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index e8da96158adc3..d1389784cf9ef 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -82,8 +82,8 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v8, v15
 ; GFX9-NEXT:    v_sub_u32_e32 v19, v9, v18
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[6:7], v19, v14
-; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GFX9-NEXT:    v_sub_u32_e32 v12, v12, v18
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GFX9-NEXT:    v_add_u32_e32 v8, v12, v8
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
@@ -151,8 +151,8 @@ define void @slsr1_0(i32 %b.arg, i32 %s.arg) #0 {
 ; GFX9-LABEL: slsr1_0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v0, v1
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v1
+; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v0, v1
 ; GFX9-NEXT:    global_store_dword v[0:1], v3, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mad_u32_u24 v0, v0, v1, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index 41dd57aec6091..1df10fd22edab 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -142,10 +142,10 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s6, v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
@@ -268,10 +268,10 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addr
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s6, v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -367,9 +367,9 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrsp
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7]
 ; GFX9-NEXT:    global_load_ubyte v3, v1, s[8:9]
+; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v2, v3
@@ -534,8 +534,8 @@ define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %
 ; VI-NEXT:    s_and_b32 s5, s6, s4
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s4, s7, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s7
 ; VI-NEXT:    s_mul_i32 s5, s5, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s7
 ; VI-NEXT:    v_mul_hi_u32_u24_e32 v1, s6, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s5
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -708,10 +708,10 @@ define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s3, s2, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_and_b32 s1, s0, s1
-; VI-NEXT:    v_mul_hi_u32_u24_e32 v0, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_mul_i32 s3, s3, s1
+; VI-NEXT:    v_mul_hi_u32_u24_e32 v0, s2, v0
 ; VI-NEXT:    v_and_b32_e32 v1, 1, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index eae7e1e649b4d..035dab4e70be8 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -30,8 +30,8 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
 ; MUBUF-NEXT:    s_lshl_b32 s7, s10, 2
 ; MUBUF-NEXT:    s_mov_b32 s32, s6
-; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
 ; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
+; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
 ; MUBUF-NEXT:    v_mov_b32_e32 v3, 1
 ; MUBUF-NEXT:    s_add_i32 s6, s6, s7
 ; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -63,8 +63,8 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; FLATSCR-NEXT:    s_cmp_lg_u32 s5, 0
 ; FLATSCR-NEXT:    s_cbranch_scc1 BB0_3
 ; FLATSCR-NEXT:  ; %bb.2: ; %bb.1
-; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
 ; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
+; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
 ; FLATSCR-NEXT:    v_mov_b32_e32 v2, 1
 ; FLATSCR-NEXT:    s_lshl_b32 s3, s6, 2
 ; FLATSCR-NEXT:    s_mov_b32 s32, s2
@@ -130,8 +130,8 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; MUBUF-NEXT:    s_and_b32 s6, s6, 0xfffff000
 ; MUBUF-NEXT:    s_lshl_b32 s7, s7, 2
 ; MUBUF-NEXT:    s_mov_b32 s32, s6
-; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
 ; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
+; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
 ; MUBUF-NEXT:    v_mov_b32_e32 v3, 1
 ; MUBUF-NEXT:    s_add_i32 s6, s6, s7
 ; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -161,8 +161,8 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; FLATSCR-NEXT:    s_cbranch_scc1 BB1_2
 ; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
 ; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
-; FLATSCR-NEXT:    s_and_b32 s2, s2, 0xfffff000
 ; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
+; FLATSCR-NEXT:    s_and_b32 s2, s2, 0xfffff000
 ; FLATSCR-NEXT:    v_mov_b32_e32 v2, 1
 ; FLATSCR-NEXT:    s_lshl_b32 s3, s3, 2
 ; FLATSCR-NEXT:    s_mov_b32 s32, s2
@@ -356,8 +356,8 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
 ; FLATSCR-NEXT:    s_and_b32 s2, s2, 0xfffff000
 ; FLATSCR-NEXT:    v_mov_b32_e32 v5, 0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v6, 1
-; FLATSCR-NEXT:    v_lshl_add_u32 v2, v3, 2, s2
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[5:6], s2
+; FLATSCR-NEXT:    v_lshl_add_u32 v2, v3, 2, s2
 ; FLATSCR-NEXT:    scratch_load_dword v2, v2, off
 ; FLATSCR-NEXT:    v_and_b32_e32 v3, 0x3ff, v4
 ; FLATSCR-NEXT:    s_mov_b32 s32, s2

diff  --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index 9aed54b12c948..1be7b34cfae92 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -42,10 +42,10 @@ define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s6
 ; VI-NEXT:    s_add_u32 s2, s6, s0
-; VI-NEXT:    s_addc_u32 s3, s7, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s7
-; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
+; VI-NEXT:    s_addc_u32 s3, s7, s1
 ; VI-NEXT:    v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
+; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    s_xor_b64 s[0:1], s[8:9], vcc
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
@@ -66,8 +66,8 @@ define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b
 ; GFX9-NEXT:    s_add_u32 s0, s6, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    s_addc_u32 s1, s7, s3
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[8:9], s[2:3], 0
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_xor_b64 s[2:3], s[8:9], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
@@ -116,8 +116,8 @@ define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 ; SI-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; SI-NEXT:    s_cmp_lt_i32 s12, s8
 ; SI-NEXT:    s_mov_b32 s1, s5
-; SI-NEXT:    v_mov_b32_e32 v0, s12
 ; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; SI-NEXT:    v_mov_b32_e32 v0, s12
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_xor_b64 s[0:1], s[10:11], s[8:9]
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -143,9 +143,9 @@ define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
-; VI-NEXT:    flat_store_dword v[0:1], v4
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    flat_store_dword v[0:1], v4
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
@@ -306,9 +306,9 @@ define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 ; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; SI-NEXT:    s_mov_b32 s0, s2
 ; SI-NEXT:    s_mov_b32 s1, s3
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; SI-NEXT:    s_mov_b32 s2, s10
 ; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
@@ -319,14 +319,14 @@ define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    s_add_u32 s0, s4, s6
+; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_addc_u32 s1, s5, s7
 ; VI-NEXT:    v_mov_b32_e32 v5, s5
-; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
 ; VI-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
@@ -345,9 +345,9 @@ define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    s_addc_u32 s9, s5, s7
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[10:11], s[6:7], 0
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[10:11], s[6:7], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], vcc
@@ -508,9 +508,9 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
 ; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], 0, v3
 ; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], v5, v1
-; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
 ; SI-NEXT:    v_cmp_lt_i32_e64 s[2:3], v4, v0
+; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[2:3]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -537,9 +537,9 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, v0, v2
 ; VI-NEXT:    v_cmp_gt_i32_e64 s[0:1], 0, v3
 ; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], v9, v1
-; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
 ; VI-NEXT:    v_cmp_lt_i32_e64 s[2:3], v8, v0
+; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; VI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[2:3]
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -558,8 +558,8 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
 ; GFX9-NEXT:    v_add_u32_e32 v5, v1, v3
 ; GFX9-NEXT:    v_add_i32 v1, v1, v3 clamp
 ; GFX9-NEXT:    v_add_u32_e32 v4, v0, v2
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v1
 ; GFX9-NEXT:    v_add_i32 v0, v0, v2 clamp
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
index 827d23dbd4637..8d77d86cb278e 100644
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -153,8 +153,8 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_add_u16_e32 v4, v3, v2
-; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v4, v3
+; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v4
 ; GFX8-NEXT:    s_movk_i32 s6, 0x8000
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
@@ -200,16 +200,16 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
 ; GFX6-NEXT:    s_movk_i32 s5, 0x8000
 ; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
 ; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
 ; GFX6-NEXT:    v_min_i32_e32 v2, s4, v2
-; GFX6-NEXT:    v_max_i32_e32 v3, s5, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_max_i32_e32 v3, s5, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, 0xffff0000, v3
 ; GFX6-NEXT:    v_alignbit_b32 v1, v3, v1, 16
@@ -221,8 +221,8 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_add_u16_e32 v6, v5, v4
-; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
+; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
 ; GFX8-NEXT:    s_movk_i32 s6, 0x8000
 ; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
@@ -287,9 +287,9 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, s6, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v7
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_min_i32_e32 v2, s4, v2
 ; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
@@ -305,8 +305,8 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_add_u16_e32 v6, v5, v4
-; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
+; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
 ; GFX8-NEXT:    s_movk_i32 s6, 0x8000
 ; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
@@ -318,14 +318,14 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_add_u16_e32 v5, v4, v2
-; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v5, v4
+; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v5
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
@@ -373,8 +373,8 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v1, v3
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
 ; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
@@ -393,8 +393,8 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v1, v3
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
 ; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
@@ -467,8 +467,8 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[2:3]
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
 ; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index a978119e12a33..a62afdfbc744c 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -34,12 +34,12 @@ define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v1
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v1
 ; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v5
-; GCN-NEXT:    v_xor_b32_e32 v2, v5, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GCN-NEXT:    v_xor_b32_e32 v2, v5, v2
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
@@ -48,8 +48,8 @@ define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v4, v0
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
-; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v1, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v1, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
@@ -79,12 +79,12 @@ define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v3, v1
 ; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, 0, v1
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
 ; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v5
-; TONGA-NEXT:    v_xor_b32_e32 v2, v5, v2
 ; TONGA-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; TONGA-NEXT:    v_xor_b32_e32 v2, v5, v2
 ; TONGA-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; TONGA-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
@@ -93,8 +93,8 @@ define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v3
 ; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v4, v0
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
-; TONGA-NEXT:    v_subrev_u32_e32 v4, vcc, v1, v0
 ; TONGA-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; TONGA-NEXT:    v_subrev_u32_e32 v4, vcc, v1, v0
 ; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
 ; TONGA-NEXT:    v_add_u32_e32 v4, vcc, 1, v3
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
@@ -414,44 +414,44 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; GCN-NEXT:    v_xor_b32_e32 v2, v2, v5
 ; GCN-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; GCN-NEXT:    v_xor_b32_e32 v8, v4, v5
-; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v2
 ; GCN-NEXT:    v_xor_b32_e32 v9, v6, v7
+; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v2
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v7, v3
 ; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 0, v2
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GCN-NEXT:    v_sub_i32_e32 v11, vcc, 0, v3
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; GCN-NEXT:    v_sub_i32_e32 v11, vcc, 0, v3
 ; GCN-NEXT:    v_mul_f32_e32 v5, s2, v5
-; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GCN-NEXT:    v_mul_f32_e32 v7, s2, v7
+; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v5
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GCN-NEXT:    v_mul_lo_u32 v11, v11, v7
-; GCN-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
+; GCN-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v4, v5, v10
+; GCN-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GCN-NEXT:    v_mul_hi_u32 v6, v7, v11
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, v4
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
+; GCN-NEXT:    v_mul_hi_u32 v4, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GCN-NEXT:    v_mul_lo_u32 v6, v4, v2
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
 ; GCN-NEXT:    v_mul_lo_u32 v10, v5, v3
-; GCN-NEXT:    v_add_i32_e32 v11, vcc, 1, v5
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v6, v0
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
 ; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v10, v1
+; GCN-NEXT:    v_add_i32_e32 v11, vcc, 1, v5
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
-; GCN-NEXT:    v_subrev_i32_e32 v6, vcc, v2, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v7, vcc, v3, v1
+; GCN-NEXT:    v_subrev_i32_e32 v6, vcc, v2, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
+; GCN-NEXT:    v_subrev_i32_e32 v7, vcc, v3, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
@@ -488,44 +488,44 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v5
 ; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; TONGA-NEXT:    v_xor_b32_e32 v8, v4, v5
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v5, v2
 ; TONGA-NEXT:    v_xor_b32_e32 v9, v6, v7
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v5, v2
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v7, v3
 ; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, 0, v2
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, 0, v3
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
+; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, 0, v3
 ; TONGA-NEXT:    v_mul_f32_e32 v5, s2, v5
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; TONGA-NEXT:    v_mul_f32_e32 v7, s2, v7
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v6, v1
+; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
 ; TONGA-NEXT:    v_mul_lo_u32 v10, v10, v5
-; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; TONGA-NEXT:    v_mul_lo_u32 v11, v11, v7
-; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v6
+; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v6, v1
+; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; TONGA-NEXT:    v_mul_hi_u32 v4, v5, v10
+; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; TONGA-NEXT:    v_mul_hi_u32 v6, v7, v11
 ; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
-; TONGA-NEXT:    v_mul_hi_u32 v4, v0, v4
 ; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v6, v7
+; TONGA-NEXT:    v_mul_hi_u32 v4, v0, v4
 ; TONGA-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; TONGA-NEXT:    v_mul_lo_u32 v6, v4, v2
-; TONGA-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
 ; TONGA-NEXT:    v_mul_lo_u32 v10, v5, v3
-; TONGA-NEXT:    v_add_u32_e32 v11, vcc, 1, v5
+; TONGA-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
 ; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v6, v0
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
 ; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v10, v1
+; TONGA-NEXT:    v_add_u32_e32 v11, vcc, 1, v5
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
-; TONGA-NEXT:    v_subrev_u32_e32 v6, vcc, v2, v0
 ; TONGA-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; TONGA-NEXT:    v_subrev_u32_e32 v7, vcc, v3, v1
+; TONGA-NEXT:    v_subrev_u32_e32 v6, vcc, v2, v0
 ; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
+; TONGA-NEXT:    v_subrev_u32_e32 v7, vcc, v3, v1
 ; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
-; TONGA-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
 ; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
+; TONGA-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
 ; TONGA-NEXT:    v_add_u32_e32 v7, vcc, 1, v5
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; TONGA-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
@@ -593,16 +593,16 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v9
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v8, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
 ; GFX9-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v3
 ; GFX9-NEXT:    v_sub_u32_e32 v9, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[0:1]
 ; GFX9-NEXT:    v_add_u32_e32 v8, 1, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[0:1]
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v9, 1, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
@@ -830,20 +830,20 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
 ; GCN-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v8
-; GCN-NEXT:    v_mul_f32_e32 v9, s2, v9
 ; GCN-NEXT:    v_xor_b32_e32 v6, v6, v13
+; GCN-NEXT:    v_mul_f32_e32 v9, s2, v9
 ; GCN-NEXT:    v_xor_b32_e32 v16, v10, v11
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v11, v6
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v9, v9
 ; GCN-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
-; GCN-NEXT:    v_mul_f32_e32 v8, s2, v8
 ; GCN-NEXT:    v_xor_b32_e32 v17, v12, v13
 ; GCN-NEXT:    v_xor_b32_e32 v2, v2, v12
+; GCN-NEXT:    v_mul_f32_e32 v8, s2, v8
 ; GCN-NEXT:    v_sub_i32_e32 v12, vcc, 0, v5
+; GCN-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v11, v11
 ; GCN-NEXT:    v_mul_lo_u32 v12, v12, v9
-; GCN-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v10
 ; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 0, v4
@@ -857,15 +857,15 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GCN-NEXT:    v_mul_lo_u32 v12, v12, v11
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; GCN-NEXT:    v_mul_hi_u32 v8, v0, v8
-; GCN-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
 ; GCN-NEXT:    v_mul_hi_u32 v12, v11, v12
+; GCN-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
 ; GCN-NEXT:    v_xor_b32_e32 v7, v7, v14
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v10, v7
 ; GCN-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; GCN-NEXT:    v_mul_lo_u32 v12, v8, v4
-; GCN-NEXT:    v_mul_hi_u32 v9, v1, v9
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v10, v10
+; GCN-NEXT:    v_mul_hi_u32 v9, v1, v9
 ; GCN-NEXT:    v_mul_hi_u32 v11, v2, v11
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v12
 ; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
@@ -873,27 +873,27 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
 ; GCN-NEXT:    v_sub_i32_e32 v12, vcc, v0, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[0:1]
+; GCN-NEXT:    v_mul_f32_e32 v10, s2, v10
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
 ; GCN-NEXT:    v_mul_lo_u32 v0, v9, v5
-; GCN-NEXT:    v_mul_f32_e32 v10, s2, v10
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v10
 ; GCN-NEXT:    v_mul_lo_u32 v10, v11, v6
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, 1, v9
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v0, v5
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v0, v5
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, 1, v11
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[2:3]
 ; GCN-NEXT:    v_sub_i32_e32 v9, vcc, v0, v5
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, 1, v11
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
 ; GCN-NEXT:    v_cndmask_b32_e64 v10, v11, v10, s[4:5]
 ; GCN-NEXT:    v_sub_i32_e32 v11, vcc, v2, v6
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[2:3]
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, 1, v1
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v9, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v9, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v1, v8, v15
 ; GCN-NEXT:    v_xor_b32_e32 v5, v0, v16
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v15
@@ -902,8 +902,8 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v4
 ; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
-; GCN-NEXT:    v_xor_b32_e32 v3, v3, v9
 ; GCN-NEXT:    v_mul_hi_u32 v5, v4, v5
+; GCN-NEXT:    v_xor_b32_e32 v3, v3, v9
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v11, s[4:5]
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v10
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
@@ -962,20 +962,20 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; TONGA-NEXT:    v_add_u32_e32 v6, vcc, v13, v6
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v8, v8
-; TONGA-NEXT:    v_mul_f32_e32 v9, s2, v9
 ; TONGA-NEXT:    v_xor_b32_e32 v6, v6, v13
+; TONGA-NEXT:    v_mul_f32_e32 v9, s2, v9
 ; TONGA-NEXT:    v_xor_b32_e32 v16, v10, v11
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v11, v6
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v9, v9
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
 ; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v12, v2
-; TONGA-NEXT:    v_mul_f32_e32 v8, s2, v8
 ; TONGA-NEXT:    v_xor_b32_e32 v17, v12, v13
 ; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v12
+; TONGA-NEXT:    v_mul_f32_e32 v8, s2, v8
 ; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, 0, v5
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v11, v11
 ; TONGA-NEXT:    v_mul_lo_u32 v12, v12, v9
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v10, v1
 ; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v10
 ; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, 0, v4
@@ -989,15 +989,15 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; TONGA-NEXT:    v_mul_lo_u32 v12, v12, v11
 ; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v10, v8
 ; TONGA-NEXT:    v_mul_hi_u32 v8, v0, v8
-; TONGA-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
 ; TONGA-NEXT:    v_mul_hi_u32 v12, v11, v12
+; TONGA-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
 ; TONGA-NEXT:    v_add_u32_e32 v7, vcc, v14, v7
 ; TONGA-NEXT:    v_xor_b32_e32 v7, v7, v14
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v10, v7
 ; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v12, v11
 ; TONGA-NEXT:    v_mul_lo_u32 v12, v8, v4
-; TONGA-NEXT:    v_mul_hi_u32 v9, v1, v9
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v10, v10
+; TONGA-NEXT:    v_mul_hi_u32 v9, v1, v9
 ; TONGA-NEXT:    v_mul_hi_u32 v11, v2, v11
 ; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v12
 ; TONGA-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
@@ -1005,27 +1005,27 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; TONGA-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
 ; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, v0, v4
 ; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[0:1]
+; TONGA-NEXT:    v_mul_f32_e32 v10, s2, v10
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
 ; TONGA-NEXT:    v_mul_lo_u32 v0, v9, v5
-; TONGA-NEXT:    v_mul_f32_e32 v10, s2, v10
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v4, v10
 ; TONGA-NEXT:    v_mul_lo_u32 v10, v11, v6
+; TONGA-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
 ; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v1, v0
 ; TONGA-NEXT:    v_add_u32_e32 v1, vcc, 1, v9
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v0, v5
 ; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, v2, v10
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v0, v5
+; TONGA-NEXT:    v_add_u32_e32 v10, vcc, 1, v11
 ; TONGA-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[2:3]
 ; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, v0, v5
-; TONGA-NEXT:    v_add_u32_e32 v10, vcc, 1, v11
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
-; TONGA-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
 ; TONGA-NEXT:    v_cndmask_b32_e64 v10, v11, v10, s[4:5]
 ; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, v2, v6
 ; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[2:3]
 ; TONGA-NEXT:    v_add_u32_e32 v9, vcc, 1, v1
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
-; TONGA-NEXT:    v_cndmask_b32_e32 v0, v1, v9, vcc
 ; TONGA-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
+; TONGA-NEXT:    v_cndmask_b32_e32 v0, v1, v9, vcc
 ; TONGA-NEXT:    v_xor_b32_e32 v1, v8, v15
 ; TONGA-NEXT:    v_xor_b32_e32 v5, v0, v16
 ; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v15, v1
@@ -1034,8 +1034,8 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; TONGA-NEXT:    v_mul_lo_u32 v5, v5, v4
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
 ; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v9, v3
-; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v9
 ; TONGA-NEXT:    v_mul_hi_u32 v5, v4, v5
+; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v9
 ; TONGA-NEXT:    v_cndmask_b32_e64 v2, v2, v11, s[4:5]
 ; TONGA-NEXT:    v_add_u32_e32 v8, vcc, 1, v10
 ; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
@@ -1085,8 +1085,8 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v9
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v11
 ; GFX9-NEXT:    v_xor_b32_e32 v16, v8, v9
+; GFX9-NEXT:    v_add_u32_e32 v5, v5, v11
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v4
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
@@ -1099,21 +1099,21 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GFX9-NEXT:    v_add_u32_e32 v7, v7, v15
 ; GFX9-NEXT:    v_xor_b32_e32 v17, v10, v11
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v10
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, v5
 ; GFX9-NEXT:    v_xor_b32_e32 v6, v6, v13
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, v5
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v14
 ; GFX9-NEXT:    v_xor_b32_e32 v18, v12, v13
 ; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v12
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v12, v6
 ; GFX9-NEXT:    v_xor_b32_e32 v7, v7, v15
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v8
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v12, v6
 ; GFX9-NEXT:    v_xor_b32_e32 v19, v14, v15
 ; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v14
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v14, v7
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v8
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v10, v10
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v12, v12
-; GFX9-NEXT:    v_mul_f32_e32 v8, s2, v8
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v14, v14
+; GFX9-NEXT:    v_mul_f32_e32 v8, s2, v8
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; GFX9-NEXT:    v_mul_f32_e32 v10, s2, v10
 ; GFX9-NEXT:    v_mul_f32_e32 v12, s2, v12
@@ -1121,8 +1121,8 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GFX9-NEXT:    v_sub_u32_e32 v9, 0, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v14, s2, v14
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v12
-; GFX9-NEXT:    v_mul_lo_u32 v9, v9, v8
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v14, v14
+; GFX9-NEXT:    v_mul_lo_u32 v9, v9, v8
 ; GFX9-NEXT:    v_sub_u32_e32 v11, 0, v5
 ; GFX9-NEXT:    v_sub_u32_e32 v13, 0, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v11, v11, v10
@@ -1140,45 +1140,45 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v9
 ; GFX9-NEXT:    v_add_u32_e32 v11, v14, v15
 ; GFX9-NEXT:    v_mul_hi_u32 v10, v2, v10
-; GFX9-NEXT:    v_mul_lo_u32 v12, v8, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v3, v11
+; GFX9-NEXT:    v_mul_lo_u32 v12, v8, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v14, v9, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v15, v10, v6
+; GFX9-NEXT:    v_add_u32_e32 v13, 1, v8
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v12
 ; GFX9-NEXT:    v_mul_lo_u32 v12, v11, v7
 ; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v14
-; GFX9-NEXT:    v_add_u32_e32 v13, 1, v8
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_add_u32_e32 v14, 1, v9
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v15
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v13, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v13, v0, v4
-; GFX9-NEXT:    v_add_u32_e32 v14, 1, v9
 ; GFX9-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
+; GFX9-NEXT:    v_add_u32_e32 v15, 1, v10
 ; GFX9-NEXT:    v_sub_u32_e32 v3, v3, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v14, s[0:1]
 ; GFX9-NEXT:    v_sub_u32_e32 v14, v1, v5
-; GFX9-NEXT:    v_add_u32_e32 v15, 1, v10
 ; GFX9-NEXT:    v_cmp_ge_u32_e64 s[2:3], v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
+; GFX9-NEXT:    v_add_u32_e32 v12, 1, v11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v15, s[2:3]
 ; GFX9-NEXT:    v_sub_u32_e32 v15, v2, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v14, s[0:1]
-; GFX9-NEXT:    v_add_u32_e32 v12, 1, v11
 ; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v7
 ; GFX9-NEXT:    v_add_u32_e32 v13, 1, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v14, s[0:1]
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v13, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
 ; GFX9-NEXT:    v_sub_u32_e32 v12, v3, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v15, s[2:3]
 ; GFX9-NEXT:    v_add_u32_e32 v14, 1, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v15, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v13, vcc
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v14, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v12, s[4:5]
 ; GFX9-NEXT:    v_add_u32_e32 v15, 1, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v12, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v14, vcc
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v15, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v12, 1, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v15, vcc
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v12, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v16
@@ -1990,13 +1990,13 @@ define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)*
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v2
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
 ; GCN-NEXT:    v_bfe_i32 v5, v0, 0, 25
-; GCN-NEXT:    v_bfe_i32 v0, v0, 24, 1
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GCN-NEXT:    v_bfe_i32 v0, v0, 24, 1
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
-; GCN-NEXT:    v_xor_b32_e32 v5, v5, v0
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GCN-NEXT:    v_xor_b32_e32 v5, v5, v0
+; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
@@ -2038,13 +2038,13 @@ define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)*
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v3, v2
 ; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, 0, v2
 ; TONGA-NEXT:    v_bfe_i32 v5, v0, 0, 25
-; TONGA-NEXT:    v_bfe_i32 v0, v0, 24, 1
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; TONGA-NEXT:    v_bfe_i32 v0, v0, 24, 1
 ; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
-; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v0
-; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; TONGA-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v0
+; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; TONGA-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; TONGA-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v4, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 104f0335e34a8..bebacfa27d15f 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -29,10 +29,10 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
 ; GCN-NEXT:    v_mul_lo_u32 v6, s5, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s4, v0
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
@@ -50,8 +50,8 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v5, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v3
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v3
 ; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v2, v4, s[0:1]
 ; GCN-NEXT:    v_mul_lo_u32 v5, s4, v3
 ; GCN-NEXT:    v_mul_hi_u32 v6, s4, v0
@@ -61,8 +61,8 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v6, s4, v0
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GCN-NEXT:    v_mul_hi_u32 v11, v0, v6
+; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GCN-NEXT:    v_mul_hi_u32 v9, v3, v6
 ; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
 ; GCN-NEXT:    v_mul_hi_u32 v8, v3, v5
@@ -77,10 +77,10 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v2, v5, s[0:1]
 ; GCN-NEXT:    s_add_u32 s0, s10, s14
-; GCN-NEXT:    s_addc_u32 s1, s11, s14
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
+; GCN-NEXT:    s_addc_u32 s1, s11, s14
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
 ; GCN-NEXT:    v_mul_lo_u32 v3, s10, v2
 ; GCN-NEXT:    v_mul_hi_u32 v4, s10, v0
 ; GCN-NEXT:    v_mul_hi_u32 v5, s10, v2
@@ -150,8 +150,8 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_mov_b32 s1, s0
 ; GCN-IR-NEXT:    s_ashr_i32 s2, s9, 31
 ; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[0:1], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s10, s6, s0
 ; GCN-IR-NEXT:    s_mov_b32 s3, s2
+; GCN-IR-NEXT:    s_sub_u32 s10, s6, s0
 ; GCN-IR-NEXT:    s_subb_u32 s11, s7, s0
 ; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[2:3], s[8:9]
 ; GCN-IR-NEXT:    s_sub_u32 s6, s6, s2
@@ -194,14 +194,14 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_addc_u32 s21, s7, -1
 ; GCN-IR-NEXT:    s_not_b64 s[8:9], s[14:15]
 ; GCN-IR-NEXT:    s_add_u32 s10, s8, s16
-; GCN-IR-NEXT:    s_addc_u32 s11, s9, s15
 ; GCN-IR-NEXT:    s_mov_b32 s17, s15
+; GCN-IR-NEXT:    s_addc_u32 s11, s9, s15
 ; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-IR-NEXT:    s_mov_b32 s9, 0
 ; GCN-IR-NEXT:  BB0_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s8, s13, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[18:19], 1
+; GCN-IR-NEXT:    s_lshr_b32 s8, s13, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
 ; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[8:9]
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
@@ -255,15 +255,15 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
-; GCN-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GCN-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GCN-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v2
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v6, v3
 ; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 0, v2
 ; GCN-NEXT:    v_subb_u32_e32 v8, vcc, 0, v3, vcc
-; GCN-NEXT:    v_mov_b32_e32 v15, 0
 ; GCN-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; GCN-NEXT:    v_rcp_f32_e32 v5, v5
+; GCN-NEXT:    v_mov_b32_e32 v15, 0
 ; GCN-NEXT:    v_mov_b32_e32 v14, 0
 ; GCN-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GCN-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
@@ -278,8 +278,8 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v10, v7, v5
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; GCN-NEXT:    v_mul_lo_u32 v12, v5, v9
-; GCN-NEXT:    v_mul_hi_u32 v11, v5, v9
 ; GCN-NEXT:    v_mul_hi_u32 v13, v5, v10
+; GCN-NEXT:    v_mul_hi_u32 v11, v5, v9
 ; GCN-NEXT:    v_mul_hi_u32 v16, v6, v9
 ; GCN-NEXT:    v_mul_lo_u32 v9, v6, v9
 ; GCN-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
@@ -290,8 +290,8 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v11, v10, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v16, v14, vcc
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GCN-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
 ; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v15, v11, vcc
+; GCN-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
 ; GCN-NEXT:    v_addc_u32_e64 v9, vcc, v6, v10, s[4:5]
 ; GCN-NEXT:    v_mul_lo_u32 v11, v7, v9
 ; GCN-NEXT:    v_mul_hi_u32 v12, v7, v5
@@ -304,8 +304,8 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_hi_u32 v17, v5, v8
 ; GCN-NEXT:    v_mul_hi_u32 v12, v9, v7
 ; GCN-NEXT:    v_mul_lo_u32 v7, v9, v7
-; GCN-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
 ; GCN-NEXT:    v_mul_hi_u32 v11, v9, v8
+; GCN-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
 ; GCN-NEXT:    v_addc_u32_e32 v16, vcc, v15, v17, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v8, v9, v8
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v13, v7
@@ -352,9 +352,9 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v3
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v9, v11, v10, s[4:5]
 ; GCN-NEXT:    v_add_i32_e64 v10, s[4:5], 2, v5
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
 ; GCN-NEXT:    v_addc_u32_e64 v11, s[4:5], 0, v6, s[4:5]
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GCN-NEXT:    v_add_i32_e64 v12, s[4:5], 1, v5
@@ -369,8 +369,8 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v12, v10, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v6, v9, vcc
-; GCN-NEXT:    v_xor_b32_e32 v2, v7, v4
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GCN-NEXT:    v_xor_b32_e32 v2, v7, v4
 ; GCN-NEXT:    v_xor_b32_e32 v3, v0, v2
 ; GCN-NEXT:    v_xor_b32_e32 v0, v1, v2
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
@@ -383,12 +383,12 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v4, v0
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v5, 31, v3
-; GCN-IR-NEXT:    v_sub_i32_e32 v9, vcc, v0, v4
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v4, v1
+; GCN-IR-NEXT:    v_sub_i32_e32 v9, vcc, v0, v4
 ; GCN-IR-NEXT:    v_subb_u32_e32 v10, vcc, v1, v4, vcc
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v5, v2
-; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, v1, v5
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v5, v3
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, v1, v5
 ; GCN-IR-NEXT:    v_subb_u32_e32 v3, vcc, v0, v5, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[9:10]
@@ -421,8 +421,8 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v16, vcc, 0, v8, vcc
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[15:16], v[7:8]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v7, s[4:5], 63, v7
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[9:10], v7
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -434,8 +434,8 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_not_b32_e32 v0, v0
 ; GCN-IR-NEXT:    v_lshr_b64 v[15:16], v[9:10], v15
 ; GCN-IR-NEXT:    v_not_b32_e32 v10, v17
-; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, v0, v13
 ; GCN-IR-NEXT:    v_mov_b32_e32 v17, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, v0, v13
 ; GCN-IR-NEXT:    v_mov_b32_e32 v18, 0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, v10, v14, vcc
 ; GCN-IR-NEXT:  BB1_3: ; %udiv-do-while
@@ -448,15 +448,15 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, v20, v14, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v7, v17, v7
 ; GCN-IR-NEXT:    v_add_i32_e32 v17, vcc, 1, v9
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v13, 31, v11
 ; GCN-IR-NEXT:    v_or_b32_e32 v8, v18, v8
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v13, 31, v11
 ; GCN-IR-NEXT:    v_addc_u32_e32 v18, vcc, 0, v10, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[17:18], v[9:10]
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v17
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT:    v_and_b32_e32 v11, 1, v13
 ; GCN-IR-NEXT:    v_and_b32_e32 v16, v13, v3
 ; GCN-IR-NEXT:    v_and_b32_e32 v13, v13, v2
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[17:18], v[9:10]
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v17
 ; GCN-IR-NEXT:    v_sub_i32_e64 v15, s[4:5], v0, v13
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, v18
 ; GCN-IR-NEXT:    v_mov_b32_e32 v18, v12
@@ -475,8 +475,8 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:  BB1_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v5, v4
-; GCN-IR-NEXT:    v_xor_b32_e32 v3, v11, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GCN-IR-NEXT:    v_xor_b32_e32 v3, v11, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v2, v12, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v3, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
@@ -1008,11 +1008,11 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_sext_i32_i16 s7, s0
 ; GCN-IR-NEXT:    s_ashr_i32 s0, s3, 31
 ; GCN-IR-NEXT:    s_mov_b32 s1, s0
-; GCN-IR-NEXT:    s_ashr_i32 s2, s7, 31
 ; GCN-IR-NEXT:    s_ashr_i64 s[12:13], s[6:7], 24
+; GCN-IR-NEXT:    s_ashr_i32 s2, s7, 31
 ; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[0:1], s[8:9]
-; GCN-IR-NEXT:    s_sub_u32 s10, s6, s0
 ; GCN-IR-NEXT:    s_mov_b32 s3, s2
+; GCN-IR-NEXT:    s_sub_u32 s10, s6, s0
 ; GCN-IR-NEXT:    s_subb_u32 s11, s7, s0
 ; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[2:3], s[12:13]
 ; GCN-IR-NEXT:    s_sub_u32 s6, s6, s2
@@ -1055,14 +1055,14 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_addc_u32 s21, s7, -1
 ; GCN-IR-NEXT:    s_not_b64 s[8:9], s[14:15]
 ; GCN-IR-NEXT:    s_add_u32 s10, s8, s16
-; GCN-IR-NEXT:    s_addc_u32 s11, s9, s15
 ; GCN-IR-NEXT:    s_mov_b32 s17, s15
+; GCN-IR-NEXT:    s_addc_u32 s11, s9, s15
 ; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-IR-NEXT:    s_mov_b32 s9, 0
 ; GCN-IR-NEXT:  BB9_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s8, s13, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[18:19], 1
+; GCN-IR-NEXT:    s_lshr_b32 s8, s13, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
 ; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[8:9]
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
@@ -1120,8 +1120,8 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_ashr_i32 s2, s7, 31
 ; GCN-NEXT:    s_add_u32 s0, s6, s2
-; GCN-NEXT:    s_addc_u32 s1, s7, s2
 ; GCN-NEXT:    s_mov_b32 s3, s2
+; GCN-NEXT:    s_addc_u32 s1, s7, s2
 ; GCN-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
@@ -1135,10 +1135,10 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, s3, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s3, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s3, v0
 ; GCN-NEXT:    v_mul_lo_u32 v7, s6, v0
 ; GCN-NEXT:    v_mul_lo_u32 v6, s3, v0
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
@@ -1148,16 +1148,16 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_hi_u32 v9, v0, v4
 ; GCN-NEXT:    v_mul_lo_u32 v8, v3, v6
 ; GCN-NEXT:    v_mul_hi_u32 v6, v3, v6
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GCN-NEXT:    v_mul_hi_u32 v10, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v2, v9, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
 ; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
 ; GCN-NEXT:    v_mul_lo_u32 v6, s3, v4
 ; GCN-NEXT:    v_mul_hi_u32 v7, s3, v0
@@ -1167,8 +1167,8 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v7, s3, v0
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
+; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
 ; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
 ; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
@@ -1207,8 +1207,8 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
 ; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
 ; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v2, s[0:1]
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_addc_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
@@ -1274,8 +1274,8 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:  BB10_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s6, s11, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
+; GCN-IR-NEXT:    s_lshr_b32 s6, s11, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
 ; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[6:7]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
@@ -1326,15 +1326,15 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GCN-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v0
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v1
 ; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v0
 ; GCN-NEXT:    v_subb_u32_e32 v6, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
 ; GCN-NEXT:    v_rcp_f32_e32 v3, v3
+; GCN-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
@@ -1349,8 +1349,8 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v8, v5, v3
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GCN-NEXT:    v_mul_lo_u32 v10, v3, v7
-; GCN-NEXT:    v_mul_hi_u32 v9, v3, v7
 ; GCN-NEXT:    v_mul_hi_u32 v11, v3, v8
+; GCN-NEXT:    v_mul_hi_u32 v9, v3, v7
 ; GCN-NEXT:    v_mul_hi_u32 v14, v4, v7
 ; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
@@ -1361,8 +1361,8 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v9, v8, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v14, v12, vcc
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v7
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v13, v9, vcc
+; GCN-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v7
 ; GCN-NEXT:    v_addc_u32_e64 v7, vcc, v4, v8, s[4:5]
 ; GCN-NEXT:    v_mul_lo_u32 v9, v5, v7
 ; GCN-NEXT:    v_mul_hi_u32 v10, v5, v3
@@ -1375,8 +1375,8 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_hi_u32 v15, v3, v6
 ; GCN-NEXT:    v_mul_hi_u32 v10, v7, v5
 ; GCN-NEXT:    v_mul_lo_u32 v5, v7, v5
-; GCN-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
 ; GCN-NEXT:    v_mul_hi_u32 v9, v7, v6
+; GCN-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
 ; GCN-NEXT:    v_addc_u32_e32 v14, vcc, v13, v15, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v6, v7, v6
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
@@ -1411,8 +1411,8 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_add_i32_e64 v7, s[4:5], 2, v3
 ; GCN-NEXT:    v_addc_u32_e64 v8, s[4:5], 0, v13, s[4:5]
 ; GCN-NEXT:    v_add_i32_e64 v9, s[4:5], 1, v3
-; GCN-NEXT:    v_subb_u32_e32 v4, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_addc_u32_e64 v10, s[4:5], 0, v13, s[4:5]
+; GCN-NEXT:    v_subb_u32_e32 v4, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v6
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[4:5]
@@ -1436,8 +1436,8 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v2, v0
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
@@ -1463,8 +1463,8 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[10:11], v[4:5]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], 24, v4
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1473,9 +1473,9 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 58, v8
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT:    v_lshr_b64 v[10:11], 24, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 58, v8
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v9, vcc, 0, v9, vcc
 ; GCN-IR-NEXT:  BB11_3: ; %udiv-do-while
@@ -1488,15 +1488,15 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v15, v11, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
-; GCN-IR-NEXT:    v_and_b32_e32 v17, v12, v0
 ; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
 ; GCN-IR-NEXT:    v_and_b32_e32 v16, v12, v1
+; GCN-IR-NEXT:    v_and_b32_e32 v17, v12, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v13, v5
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v9, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v8, v12
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v17
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, v13
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
@@ -1530,15 +1530,15 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GCN-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v0
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v1
 ; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v0
 ; GCN-NEXT:    v_subb_u32_e32 v6, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
 ; GCN-NEXT:    v_rcp_f32_e32 v3, v3
+; GCN-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
@@ -1553,8 +1553,8 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v8, v5, v3
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GCN-NEXT:    v_mul_lo_u32 v10, v3, v7
-; GCN-NEXT:    v_mul_hi_u32 v9, v3, v7
 ; GCN-NEXT:    v_mul_hi_u32 v11, v3, v8
+; GCN-NEXT:    v_mul_hi_u32 v9, v3, v7
 ; GCN-NEXT:    v_mul_hi_u32 v14, v4, v7
 ; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
@@ -1565,8 +1565,8 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v9, v8, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v14, v12, vcc
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GCN-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v7
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v13, v9, vcc
+; GCN-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v7
 ; GCN-NEXT:    v_addc_u32_e64 v7, vcc, v4, v8, s[4:5]
 ; GCN-NEXT:    v_mul_lo_u32 v9, v5, v7
 ; GCN-NEXT:    v_mul_hi_u32 v10, v5, v3
@@ -1579,8 +1579,8 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_hi_u32 v15, v3, v6
 ; GCN-NEXT:    v_mul_hi_u32 v10, v7, v5
 ; GCN-NEXT:    v_mul_lo_u32 v5, v7, v5
-; GCN-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
 ; GCN-NEXT:    v_mul_hi_u32 v9, v7, v6
+; GCN-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
 ; GCN-NEXT:    v_addc_u32_e32 v14, vcc, v13, v15, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v6, v7, v6
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
@@ -1616,8 +1616,8 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_add_i32_e64 v7, s[4:5], 2, v3
 ; GCN-NEXT:    v_addc_u32_e64 v8, s[4:5], 0, v13, s[4:5]
 ; GCN-NEXT:    v_add_i32_e64 v9, s[4:5], 1, v3
-; GCN-NEXT:    v_subb_u32_e32 v4, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_addc_u32_e64 v10, s[4:5], 0, v13, s[4:5]
+; GCN-NEXT:    v_subb_u32_e32 v4, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v6
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[4:5]
@@ -1641,8 +1641,8 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v2, v0
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
@@ -1654,8 +1654,8 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v8, s8
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v8, v8, 0, s[4:5]
@@ -1670,8 +1670,8 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[10:11], v[4:5]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], s[8:9], v4
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1679,11 +1679,11 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_cbranch_execz BB12_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[4:5], v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v7, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:  BB12_3: ; %udiv-do-while
@@ -1696,15 +1696,15 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_subb_u32_e32 v8, vcc, v15, v11, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v8
-; GCN-IR-NEXT:    v_and_b32_e32 v17, v12, v0
 ; GCN-IR-NEXT:    v_and_b32_e32 v8, 1, v12
 ; GCN-IR-NEXT:    v_and_b32_e32 v16, v12, v1
+; GCN-IR-NEXT:    v_and_b32_e32 v17, v12, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v6
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v13, v5
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[12:13], v[6:7]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, v12
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v17
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, v13
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, v9
@@ -1747,8 +1747,8 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v2, v0
-; GCN-IR-NEXT:    v_sub_i32_e32 v7, vcc, v0, v2
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GCN-IR-NEXT:    v_sub_i32_e32 v7, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v8, vcc, v1, v2, vcc
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v0, v7
 ; GCN-IR-NEXT:    v_add_i32_e64 v0, s[4:5], 32, v0
@@ -1772,8 +1772,8 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[9:10], v[3:4]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v3, s[4:5], 63, v3
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[3:4], v[7:8], v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1796,18 +1796,18 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_subb_u32_e32 v5, vcc, 0, v10, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
 ; GCN-IR-NEXT:    v_add_i32_e32 v11, vcc, 1, v7
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v9, 31, v5
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v9, 31, v5
 ; GCN-IR-NEXT:    v_addc_u32_e32 v12, vcc, 0, v8, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[11:12], v[7:8]
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_and_b32_e32 v5, 1, v9
 ; GCN-IR-NEXT:    v_and_b32_e32 v9, 0x8000, v9
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[11:12], v[7:8]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, v11
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v12
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v9, s[4:5], v0, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v12
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
 ; GCN-IR-NEXT:    v_subb_u32_e64 v10, s[4:5], v10, v13, s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5

diff  --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index b9f1cdb9a70c8..88411f6a8114f 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -16,10 +16,10 @@ define amdgpu_kernel void @select_f16(
 ; SI-NEXT:    s_mov_b32 s19, s3
 ; SI-NEXT:    s_mov_b32 s20, s8
 ; SI-NEXT:    s_mov_b32 s21, s9
-; SI-NEXT:    s_mov_b32 s8, s10
-; SI-NEXT:    s_mov_b32 s9, s11
 ; SI-NEXT:    s_mov_b32 s22, s2
 ; SI-NEXT:    s_mov_b32 s23, s3
+; SI-NEXT:    s_mov_b32 s8, s10
+; SI-NEXT:    s_mov_b32 s9, s11
 ; SI-NEXT:    s_mov_b32 s10, s2
 ; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    s_mov_b32 s14, s2
@@ -57,10 +57,10 @@ define amdgpu_kernel void @select_f16(
 ; VI-NEXT:    s_mov_b32 s19, s3
 ; VI-NEXT:    s_mov_b32 s20, s8
 ; VI-NEXT:    s_mov_b32 s21, s9
-; VI-NEXT:    s_mov_b32 s8, s10
-; VI-NEXT:    s_mov_b32 s9, s11
 ; VI-NEXT:    s_mov_b32 s22, s2
 ; VI-NEXT:    s_mov_b32 s23, s3
+; VI-NEXT:    s_mov_b32 s8, s10
+; VI-NEXT:    s_mov_b32 s9, s11
 ; VI-NEXT:    s_mov_b32 s10, s2
 ; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_mov_b32 s14, s2
@@ -108,10 +108,10 @@ define amdgpu_kernel void @select_f16_imm_a(
 ; SI-NEXT:    s_mov_b32 s13, s3
 ; SI-NEXT:    s_mov_b32 s16, s4
 ; SI-NEXT:    s_mov_b32 s17, s5
-; SI-NEXT:    s_mov_b32 s4, s6
-; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_mov_b32 s19, s11
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
@@ -143,10 +143,10 @@ define amdgpu_kernel void @select_f16_imm_a(
 ; VI-NEXT:    s_mov_b32 s13, s3
 ; VI-NEXT:    s_mov_b32 s16, s4
 ; VI-NEXT:    s_mov_b32 s17, s5
-; VI-NEXT:    s_mov_b32 s4, s6
-; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s18, s10
 ; VI-NEXT:    s_mov_b32 s19, s11
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
@@ -188,10 +188,10 @@ define amdgpu_kernel void @select_f16_imm_b(
 ; SI-NEXT:    s_mov_b32 s13, s3
 ; SI-NEXT:    s_mov_b32 s16, s4
 ; SI-NEXT:    s_mov_b32 s17, s5
-; SI-NEXT:    s_mov_b32 s4, s6
-; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_mov_b32 s19, s11
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
@@ -223,10 +223,10 @@ define amdgpu_kernel void @select_f16_imm_b(
 ; VI-NEXT:    s_mov_b32 s13, s3
 ; VI-NEXT:    s_mov_b32 s16, s4
 ; VI-NEXT:    s_mov_b32 s17, s5
-; VI-NEXT:    s_mov_b32 s4, s6
-; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s18, s10
 ; VI-NEXT:    s_mov_b32 s19, s11
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
@@ -268,10 +268,10 @@ define amdgpu_kernel void @select_f16_imm_c(
 ; SI-NEXT:    s_mov_b32 s13, s3
 ; SI-NEXT:    s_mov_b32 s16, s4
 ; SI-NEXT:    s_mov_b32 s17, s5
-; SI-NEXT:    s_mov_b32 s4, s6
-; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_mov_b32 s19, s11
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
@@ -303,10 +303,10 @@ define amdgpu_kernel void @select_f16_imm_c(
 ; VI-NEXT:    s_mov_b32 s13, s3
 ; VI-NEXT:    s_mov_b32 s16, s4
 ; VI-NEXT:    s_mov_b32 s17, s5
-; VI-NEXT:    s_mov_b32 s4, s6
-; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s18, s10
 ; VI-NEXT:    s_mov_b32 s19, s11
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
@@ -349,10 +349,10 @@ define amdgpu_kernel void @select_f16_imm_d(
 ; SI-NEXT:    s_mov_b32 s13, s3
 ; SI-NEXT:    s_mov_b32 s16, s4
 ; SI-NEXT:    s_mov_b32 s17, s5
-; SI-NEXT:    s_mov_b32 s4, s6
-; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_mov_b32 s19, s11
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
@@ -384,10 +384,10 @@ define amdgpu_kernel void @select_f16_imm_d(
 ; VI-NEXT:    s_mov_b32 s13, s3
 ; VI-NEXT:    s_mov_b32 s16, s4
 ; VI-NEXT:    s_mov_b32 s17, s5
-; VI-NEXT:    s_mov_b32 s4, s6
-; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s18, s10
 ; VI-NEXT:    s_mov_b32 s19, s11
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
@@ -431,10 +431,10 @@ define amdgpu_kernel void @select_v2f16(
 ; SI-NEXT:    s_mov_b32 s19, s3
 ; SI-NEXT:    s_mov_b32 s20, s8
 ; SI-NEXT:    s_mov_b32 s21, s9
-; SI-NEXT:    s_mov_b32 s8, s10
-; SI-NEXT:    s_mov_b32 s9, s11
 ; SI-NEXT:    s_mov_b32 s22, s2
 ; SI-NEXT:    s_mov_b32 s23, s3
+; SI-NEXT:    s_mov_b32 s8, s10
+; SI-NEXT:    s_mov_b32 s9, s11
 ; SI-NEXT:    s_mov_b32 s10, s2
 ; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    s_mov_b32 s14, s2
@@ -451,9 +451,9 @@ define amdgpu_kernel void @select_v2f16(
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -464,8 +464,8 @@ define amdgpu_kernel void @select_v2f16(
 ; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
 ; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -485,10 +485,10 @@ define amdgpu_kernel void @select_v2f16(
 ; VI-NEXT:    s_mov_b32 s19, s3
 ; VI-NEXT:    s_mov_b32 s20, s8
 ; VI-NEXT:    s_mov_b32 s21, s9
-; VI-NEXT:    s_mov_b32 s8, s10
-; VI-NEXT:    s_mov_b32 s9, s11
 ; VI-NEXT:    s_mov_b32 s22, s2
 ; VI-NEXT:    s_mov_b32 s23, s3
+; VI-NEXT:    s_mov_b32 s8, s10
+; VI-NEXT:    s_mov_b32 s9, s11
 ; VI-NEXT:    s_mov_b32 s10, s2
 ; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_mov_b32 s14, s2
@@ -502,11 +502,12 @@ define amdgpu_kernel void @select_v2f16(
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
 ; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
@@ -543,10 +544,10 @@ define amdgpu_kernel void @select_v2f16_imm_a(
 ; SI-NEXT:    s_mov_b32 s13, s3
 ; SI-NEXT:    s_mov_b32 s16, s4
 ; SI-NEXT:    s_mov_b32 s17, s5
-; SI-NEXT:    s_mov_b32 s4, s6
-; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_mov_b32 s19, s11
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
@@ -570,8 +571,8 @@ define amdgpu_kernel void @select_v2f16_imm_a(
 ; SI-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v3
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
 ; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -590,10 +591,10 @@ define amdgpu_kernel void @select_v2f16_imm_a(
 ; VI-NEXT:    s_mov_b32 s13, s3
 ; VI-NEXT:    s_mov_b32 s16, s4
 ; VI-NEXT:    s_mov_b32 s17, s5
-; VI-NEXT:    s_mov_b32 s4, s6
-; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s18, s10
 ; VI-NEXT:    s_mov_b32 s19, s11
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
@@ -642,10 +643,10 @@ define amdgpu_kernel void @select_v2f16_imm_b(
 ; SI-NEXT:    s_mov_b32 s13, s3
 ; SI-NEXT:    s_mov_b32 s16, s4
 ; SI-NEXT:    s_mov_b32 s17, s5
-; SI-NEXT:    s_mov_b32 s4, s6
-; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_mov_b32 s19, s11
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
@@ -669,8 +670,8 @@ define amdgpu_kernel void @select_v2f16_imm_b(
 ; SI-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
 ; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
-; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -689,10 +690,10 @@ define amdgpu_kernel void @select_v2f16_imm_b(
 ; VI-NEXT:    s_mov_b32 s13, s3
 ; VI-NEXT:    s_mov_b32 s16, s4
 ; VI-NEXT:    s_mov_b32 s17, s5
-; VI-NEXT:    s_mov_b32 s4, s6
-; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s18, s10
 ; VI-NEXT:    s_mov_b32 s19, s11
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
@@ -741,10 +742,10 @@ define amdgpu_kernel void @select_v2f16_imm_c(
 ; SI-NEXT:    s_mov_b32 s13, s3
 ; SI-NEXT:    s_mov_b32 s16, s4
 ; SI-NEXT:    s_mov_b32 s17, s5
-; SI-NEXT:    s_mov_b32 s4, s6
-; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_mov_b32 s19, s11
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
@@ -755,9 +756,9 @@ define amdgpu_kernel void @select_v2f16_imm_c(
 ; SI-NEXT:    s_mov_b32 s9, s1
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -788,10 +789,10 @@ define amdgpu_kernel void @select_v2f16_imm_c(
 ; VI-NEXT:    s_mov_b32 s13, s3
 ; VI-NEXT:    s_mov_b32 s16, s4
 ; VI-NEXT:    s_mov_b32 s17, s5
-; VI-NEXT:    s_mov_b32 s4, s6
-; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s18, s10
 ; VI-NEXT:    s_mov_b32 s19, s11
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
@@ -804,8 +805,8 @@ define amdgpu_kernel void @select_v2f16_imm_c(
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
@@ -842,10 +843,10 @@ define amdgpu_kernel void @select_v2f16_imm_d(
 ; SI-NEXT:    s_mov_b32 s13, s3
 ; SI-NEXT:    s_mov_b32 s16, s4
 ; SI-NEXT:    s_mov_b32 s17, s5
-; SI-NEXT:    s_mov_b32 s4, s6
-; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_mov_b32 s19, s11
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
@@ -869,8 +870,8 @@ define amdgpu_kernel void @select_v2f16_imm_d(
 ; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -889,10 +890,10 @@ define amdgpu_kernel void @select_v2f16_imm_d(
 ; VI-NEXT:    s_mov_b32 s13, s3
 ; VI-NEXT:    s_mov_b32 s16, s4
 ; VI-NEXT:    s_mov_b32 s17, s5
-; VI-NEXT:    s_mov_b32 s4, s6
-; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s18, s10
 ; VI-NEXT:    s_mov_b32 s19, s11
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
 ; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
@@ -905,8 +906,8 @@ define amdgpu_kernel void @select_v2f16_imm_d(
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/select64.ll b/llvm/test/CodeGen/AMDGPU/select64.ll
index 2a3ae2883e720..c3412ac7f49e9 100644
--- a/llvm/test/CodeGen/AMDGPU/select64.ll
+++ b/llvm/test/CodeGen/AMDGPU/select64.ll
@@ -224,8 +224,8 @@ define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %c
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_cmp_gt_u32 s2, 5
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SI-NEXT:    s_cmp_gt_u32 s2, 5
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
@@ -261,9 +261,9 @@ define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %c
 ; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_cmp_gt_u32 s6, 5
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NEXT:    s_mov_b32 s4, 0
+; GFX90A-NEXT:    s_cmp_gt_u32 s6, 5
 ; GFX90A-NEXT:    s_mov_b32 s5, 63
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]

diff  --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index 66ad5f5b7759d..1af9d60f20dc5 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -14,8 +14,8 @@ define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_or_b32_e32 v8, v6, v8
 ; GCN-NEXT:    v_lshl_b64 v[5:6], v[0:1], v5
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
-; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
@@ -39,8 +39,8 @@ define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_or_b32_e32 v8, v6, v8
 ; GCN-NEXT:    v_lshr_b64 v[5:6], v[2:3], v5
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
-; GCN-NEXT:    v_lshr_b64 v[2:3], v[2:3], v4
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GCN-NEXT:    v_lshr_b64 v[2:3], v[2:3], v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v5, v0, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
@@ -194,10 +194,10 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    s_lshl_b64 s[2:3], s[4:5], s2
 ; GCN-NEXT:    s_or_b64 s[10:11], s[0:1], s[10:11]
 ; GCN-NEXT:    s_cmp_lt_u32 s8, 64
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s8, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -230,13 +230,13 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    s_sub_i32 s2, s8, 64
 ; GCN-NEXT:    s_lshr_b64 s[0:1], s[4:5], s8
 ; GCN-NEXT:    s_lshl_b64 s[10:11], s[6:7], s9
-; GCN-NEXT:    s_or_b64 s[10:11], s[0:1], s[10:11]
 ; GCN-NEXT:    s_lshr_b64 s[2:3], s[6:7], s2
+; GCN-NEXT:    s_or_b64 s[10:11], s[0:1], s[10:11]
 ; GCN-NEXT:    s_cmp_lt_u32 s8, 64
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s8, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -278,9 +278,9 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    s_lshr_b64 s[6:7], s[4:5], s8
 ; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; GCN-NEXT:    s_cmp_eq_u32 s8, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_cmp_eq_u32 s8, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -306,13 +306,13 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v8
 ; GCN-NEXT:    v_lshr_b64 v[16:17], v[0:1], v16
 ; GCN-NEXT:    v_lshl_b64 v[18:19], v[2:3], v8
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
 ; GCN-NEXT:    v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
 ; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
 ; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v8
-; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT:    v_or_b32_e32 v19, v19, v17
 ; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
+; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT:    v_lshl_b64 v[16:17], v[0:1], v9
 ; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
@@ -322,10 +322,10 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v11, v17, v19, s[4:5]
 ; GCN-NEXT:    v_lshr_b64 v[9:10], v[4:5], v9
 ; GCN-NEXT:    v_lshl_b64 v[16:17], v[6:7], v12
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
 ; GCN-NEXT:    v_or_b32_e32 v16, v16, v9
 ; GCN-NEXT:    v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
 ; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v12
 ; GCN-NEXT:    v_or_b32_e32 v11, v17, v10
 ; GCN-NEXT:    v_lshl_b64 v[9:10], v[4:5], v9
@@ -333,9 +333,9 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_or_b32_e32 v14, v12, v14
 ; GCN-NEXT:    s_and_b64 vcc, s[8:9], s[6:7]
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
+; GCN-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc
 ; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
 ; GCN-NEXT:    v_lshl_b64 v[4:5], v[4:5], v12
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v9, v6, s[6:7]
 ; GCN-NEXT:    v_cndmask_b32_e32 v9, v10, v11, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[6:7]
@@ -355,13 +355,13 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v8
 ; GCN-NEXT:    v_lshl_b64 v[16:17], v[2:3], v16
 ; GCN-NEXT:    v_lshr_b64 v[18:19], v[0:1], v8
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
 ; GCN-NEXT:    v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
 ; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
 ; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v8
-; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT:    v_or_b32_e32 v19, v19, v17
 ; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
+; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT:    v_lshr_b64 v[16:17], v[2:3], v9
 ; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
@@ -371,10 +371,10 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v11, v17, v19, s[4:5]
 ; GCN-NEXT:    v_lshl_b64 v[9:10], v[6:7], v9
 ; GCN-NEXT:    v_lshr_b64 v[16:17], v[4:5], v12
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
 ; GCN-NEXT:    v_or_b32_e32 v16, v16, v9
 ; GCN-NEXT:    v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
 ; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v12
 ; GCN-NEXT:    v_or_b32_e32 v11, v17, v10
 ; GCN-NEXT:    v_lshr_b64 v[9:10], v[6:7], v9
@@ -382,9 +382,9 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_or_b32_e32 v14, v12, v14
 ; GCN-NEXT:    s_and_b64 vcc, s[8:9], s[6:7]
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
+; GCN-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc
 ; GCN-NEXT:    v_lshr_b64 v[2:3], v[2:3], v8
 ; GCN-NEXT:    v_lshr_b64 v[6:7], v[6:7], v12
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v9, v4, s[6:7]
 ; GCN-NEXT:    v_cndmask_b32_e32 v9, v10, v11, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
@@ -404,13 +404,13 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v8
 ; GCN-NEXT:    v_lshl_b64 v[16:17], v[2:3], v16
 ; GCN-NEXT:    v_lshr_b64 v[18:19], v[0:1], v8
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
 ; GCN-NEXT:    v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
 ; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
 ; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v8
-; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT:    v_or_b32_e32 v19, v19, v17
 ; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
+; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT:    v_ashr_i64 v[16:17], v[2:3], v9
 ; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
@@ -420,10 +420,10 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v11, v17, v19, s[4:5]
 ; GCN-NEXT:    v_lshl_b64 v[9:10], v[6:7], v9
 ; GCN-NEXT:    v_lshr_b64 v[16:17], v[4:5], v12
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
 ; GCN-NEXT:    v_or_b32_e32 v16, v16, v9
 ; GCN-NEXT:    v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
 ; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v12
 ; GCN-NEXT:    v_or_b32_e32 v11, v17, v10
 ; GCN-NEXT:    v_ashr_i64 v[9:10], v[6:7], v9
@@ -457,16 +457,16 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_sub_i32 s6, 64, s16
 ; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[18:19], 0
+; GCN-NEXT:    s_sub_i32 s6, 64, s16
 ; GCN-NEXT:    s_sub_i32 s4, s16, 64
 ; GCN-NEXT:    s_lshr_b64 s[6:7], s[8:9], s6
 ; GCN-NEXT:    s_lshl_b64 s[24:25], s[10:11], s16
-; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
-; GCN-NEXT:    s_or_b64 s[0:1], s[16:17], s[18:19]
 ; GCN-NEXT:    s_lshl_b64 s[4:5], s[8:9], s4
 ; GCN-NEXT:    s_or_b64 s[6:7], s[24:25], s[6:7]
+; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
+; GCN-NEXT:    s_or_b64 s[0:1], s[16:17], s[18:19]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
@@ -477,10 +477,10 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, s6
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v1, s10
-; GCN-NEXT:    s_sub_i32 s6, 64, s20
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
 ; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[20:21], 64
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[22:23], 0
+; GCN-NEXT:    s_sub_i32 s6, 64, s20
 ; GCN-NEXT:    s_sub_i32 s4, s20, 64
 ; GCN-NEXT:    s_lshr_b64 s[6:7], s[12:13], s6
 ; GCN-NEXT:    s_lshl_b64 s[10:11], s[14:15], s20
@@ -527,16 +527,16 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_sub_i32 s6, 64, s16
 ; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[18:19], 0
+; GCN-NEXT:    s_sub_i32 s6, 64, s16
 ; GCN-NEXT:    s_sub_i32 s4, s16, 64
 ; GCN-NEXT:    s_lshl_b64 s[6:7], s[10:11], s6
 ; GCN-NEXT:    s_lshr_b64 s[24:25], s[8:9], s16
+; GCN-NEXT:    s_lshr_b64 s[4:5], s[10:11], s4
 ; GCN-NEXT:    s_or_b64 s[6:7], s[24:25], s[6:7]
 ; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
 ; GCN-NEXT:    s_or_b64 s[0:1], s[16:17], s[18:19]
-; GCN-NEXT:    s_lshr_b64 s[4:5], s[10:11], s4
 ; GCN-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
@@ -547,10 +547,10 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v2, s8
-; GCN-NEXT:    s_sub_i32 s6, 64, s20
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[20:21], 64
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[22:23], 0
+; GCN-NEXT:    s_sub_i32 s6, 64, s20
 ; GCN-NEXT:    s_sub_i32 s4, s20, 64
 ; GCN-NEXT:    s_lshl_b64 s[6:7], s[14:15], s6
 ; GCN-NEXT:    s_lshr_b64 s[8:9], s[12:13], s20
@@ -597,16 +597,16 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_sub_i32 s6, 64, s16
 ; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[18:19], 0
+; GCN-NEXT:    s_sub_i32 s6, 64, s16
 ; GCN-NEXT:    s_sub_i32 s4, s16, 64
 ; GCN-NEXT:    s_lshl_b64 s[6:7], s[10:11], s6
 ; GCN-NEXT:    s_lshr_b64 s[24:25], s[8:9], s16
+; GCN-NEXT:    s_ashr_i64 s[4:5], s[10:11], s4
 ; GCN-NEXT:    s_or_b64 s[6:7], s[24:25], s[6:7]
 ; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
 ; GCN-NEXT:    s_or_b64 s[0:1], s[16:17], s[18:19]
-; GCN-NEXT:    s_ashr_i64 s[4:5], s[10:11], s4
 ; GCN-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
@@ -617,10 +617,10 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v2, s8
-; GCN-NEXT:    s_sub_i32 s6, 64, s20
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[20:21], 64
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[22:23], 0
+; GCN-NEXT:    s_sub_i32 s6, 64, s20
 ; GCN-NEXT:    s_sub_i32 s4, s20, 64
 ; GCN-NEXT:    s_lshl_b64 s[6:7], s[14:15], s6
 ; GCN-NEXT:    s_lshr_b64 s[8:9], s[12:13], s20
@@ -639,13 +639,13 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v3, s12
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v2, v3, s[2:3]
-; GCN-NEXT:    s_ashr_i64 s[2:3], s[10:11], s16
 ; GCN-NEXT:    s_ashr_i32 s4, s11, 31
+; GCN-NEXT:    s_ashr_i64 s[2:3], s[10:11], s16
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-NEXT:    v_mov_b32_e32 v6, s2
-; GCN-NEXT:    s_ashr_i64 s[2:3], s[14:15], s20
 ; GCN-NEXT:    s_ashr_i32 s4, s15, 31
+; GCN-NEXT:    s_ashr_i64 s[2:3], s[14:15], s20
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v2, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v6, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index b62a21811426e..7272250c35356 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -551,10 +551,10 @@ define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> add
 ; SI-NEXT:    s_mov_b32 s8, s6
 ; SI-NEXT:    s_mov_b32 s9, s7
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_mov_b32 s14, 0
 ; SI-NEXT:    s_mov_b32 s15, s3
+; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; SI-NEXT:    buffer_load_dword v2, off, s[8:11], 0
 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4
 ; SI-NEXT:    s_mov_b32 s6, 0xffff
@@ -578,18 +578,18 @@ define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> add
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s5, s4, 16
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e64 v2, v0, s4
 ; VI-NEXT:    v_lshlrev_b16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -658,11 +658,11 @@ define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> add
 ; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v8, s0, v4
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_and_b32_e32 v9, s0, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_lshl_b32_e32 v5, v7, v5
 ; SI-NEXT:    v_lshl_b32_e32 v3, v3, v9

diff  --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 1c7c1db259231..f196848df5bf9 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -28,8 +28,8 @@ define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s1, s2, 16
-; VI-NEXT:    s_lshr_b32 s8, s0, 16
 ; VI-NEXT:    s_and_b32 s2, s2, s3
+; VI-NEXT:    s_lshr_b32 s8, s0, 16
 ; VI-NEXT:    s_and_b32 s0, s0, s3
 ; VI-NEXT:    s_lshl_b32 s0, s2, s0
 ; VI-NEXT:    s_lshl_b32 s1, s1, s8
@@ -347,8 +347,8 @@ define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e64 v2, v3, 8
@@ -522,11 +522,11 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
 ; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_and_b32_e32 v8, s0, v4
 ; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; CI-NEXT:    v_and_b32_e32 v9, s0, v5
-; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; CI-NEXT:    v_lshl_b32_e32 v5, v7, v5
 ; CI-NEXT:    v_lshl_b32_e32 v3, v3, v9
@@ -584,8 +584,8 @@ define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i1
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    s_mov_b32 s2, 0xff000000
@@ -596,9 +596,9 @@ define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i1
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
 ; VI-NEXT:    v_lshlrev_b16_e32 v5, 8, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; VI-NEXT:    v_and_b32_e32 v0, s2, v0
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    v_and_b32_e32 v4, s2, v4
+; VI-NEXT:    v_and_b32_e32 v0, s2, v0
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v5, v0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]

diff  --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
index 0782f172a2ce6..30cecc085776d 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -282,9 +282,8 @@ define amdgpu_kernel void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i3
 
 ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_lds:
 ; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
-; GCN: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:32
-
 ; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
+; GCN: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:32
 ; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} offset:64
 define void @shl_add_ptr_combine_2use_lds(i32 %idx) #0 {
   %idx.add = add nuw i32 %idx, 4
@@ -333,9 +332,8 @@ define void @shl_add_ptr_combine_2use_both_max_lds_offset(i32 %idx) #0 {
 
 ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_private:
 ; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 2, v0
-; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen offset:16
-
 ; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 3, v0
+; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen offset:16
 ; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], 0 offen offset:32
 define void @shl_add_ptr_combine_2use_private(i16 zeroext %idx.arg) #0 {
   %idx = zext i16 %idx.arg to i32
@@ -388,10 +386,9 @@ define void @shl_add_ptr_combine_2use_both_max_private_offset(i16 zeroext %idx.a
 ; GCN-LABEL: {{^}}shl_or_ptr_combine_2use_lds:
 ; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
 ; GCN: v_or_b32_e32 [[SCALE1:v[0-9]+]], 32, [[SCALE0]]
+; GCN: v_lshlrev_b32_e32 [[SCALE2:v[0-9]+]], 4, v0
 ; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}}
-
-; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
-; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} offset:64
+; GCN: ds_write_b32 [[SCALE2]], v{{[0-9]+}} offset:64
 define void @shl_or_ptr_combine_2use_lds(i32 %idx) #0 {
   %idx.add = or i32 %idx, 4
   %shl0 = shl i32 %idx.add, 3

diff  --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 4ffa65d0a5bf4..767c79efdae02 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -836,8 +836,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out,
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    flat_load_ushort v4, v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_subrev_u16_e32 v2, 64, v3
 ; VI-NEXT:    v_subrev_u16_e32 v3, 64, v4
@@ -923,8 +923,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -994,8 +994,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(<2 x i16> addrspace(1)* %out,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u16_e32 v2, -7, v3
@@ -1066,8 +1066,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %ou
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1137,8 +1137,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(<2 x i16> addrspace(1)* %out,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
@@ -1413,8 +1413,8 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1549,8 +1549,8 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %o
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
@@ -1621,8 +1621,8 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(<2 x i16> addrspace(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u16_e32 v2, -16, v3
@@ -1757,8 +1757,8 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(<2 x i16> addrspace(1)* %o
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
@@ -1823,14 +1823,14 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)*
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    s_movk_i32 s2, 0xc400
 ; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u16_e32 v2, s2, v3
@@ -1896,14 +1896,14 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    s_movk_i32 s2, 0x4400
 ; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u16_e32 v2, s2, v3
@@ -1969,14 +1969,14 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)*
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    s_movk_i32 s2, 0x4000
 ; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u16_e32 v2, s2, v3
@@ -2042,14 +2042,14 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    s_movk_i32 s2, 0xc000
 ; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u16_e32 v2, s2, v3
@@ -2117,9 +2117,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(<2 x i16> addrspace(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, 32
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, 32
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD

diff  --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
index 85fb85c851eb6..6c0269793f324 100644
--- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
@@ -487,10 +487,10 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a)
 ; SI-NEXT:    s_ashr_i32 s5, s6, 16
 ; SI-NEXT:    s_sext_i32_i16 s6, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    s_sext_i32_i16 s7, s7
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s5
-; SI-NEXT:    s_sext_i32_i16 s7, s7
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
@@ -513,10 +513,10 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a)
 ; VI-NEXT:    s_mov_b32 s0, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    s_ashr_i32 s4, s7, 16
+; VI-NEXT:    s_sext_i32_i16 s7, s7
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    s_sext_i32_i16 s7, s7
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s7

diff  --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
index 7ea3f9249b103..e2f5503fcf8d0 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -410,10 +410,10 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)*
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v16, 31, v16
 ; GFX8-NEXT:    v_add_u32_e32 v11, vcc, -1, v11
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GFX8-NEXT:    v_add_u32_e32 v13, vcc, -1, v13
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, -1, v15
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, -1, v17
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 32, v12
 ; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 32, v14
 ; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 32, v16
@@ -421,26 +421,26 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)*
 ; GFX8-NEXT:    v_min_u32_e32 v11, v13, v12
 ; GFX8-NEXT:    v_min_u32_e32 v12, v15, v14
 ; GFX8-NEXT:    v_min_u32_e32 v13, v17, v16
-; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v11, v[5:6]
-; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v12, v[3:4]
 ; GFX8-NEXT:    v_lshlrev_b64 v[7:8], v0, v[7:8]
 ; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, 32, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v11, v[5:6]
+; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v12, v[3:4]
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v13, v[1:2]
 ; GFX8-NEXT:    v_min_u32_e32 v7, 1, v7
 ; GFX8-NEXT:    v_min_u32_e32 v5, 1, v5
 ; GFX8-NEXT:    v_min_u32_e32 v3, 1, v3
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT:    v_or_b32_e32 v5, v6, v5
 ; GFX8-NEXT:    v_or_b32_e32 v7, v8, v7
+; GFX8-NEXT:    v_or_b32_e32 v5, v6, v5
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v7
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v5
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v3
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v5, v0
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v13
 ; GFX8-NEXT:    v_sub_u32_e32 v11, vcc, 32, v11
 ; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, 32, v12
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v13
 ; GFX8-NEXT:    v_ldexp_f32 v1, v1, v14
 ; GFX8-NEXT:    v_ldexp_f32 v0, v4, v11
 ; GFX8-NEXT:    v_ldexp_f32 v3, v3, v12
@@ -641,10 +641,10 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)*
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v16, 31, v16
 ; GFX8-NEXT:    v_add_u32_e32 v11, vcc, -1, v11
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GFX8-NEXT:    v_add_u32_e32 v13, vcc, -1, v13
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, -1, v15
 ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, -1, v17
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 32, v12
 ; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 32, v14
 ; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 32, v16
@@ -652,19 +652,19 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)*
 ; GFX8-NEXT:    v_min_u32_e32 v11, v13, v12
 ; GFX8-NEXT:    v_min_u32_e32 v12, v15, v14
 ; GFX8-NEXT:    v_min_u32_e32 v13, v17, v16
-; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v11, v[5:6]
-; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v12, v[3:4]
 ; GFX8-NEXT:    v_lshlrev_b64 v[7:8], v0, v[7:8]
 ; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, 32, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v11, v[5:6]
+; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v12, v[3:4]
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v13, v[1:2]
 ; GFX8-NEXT:    v_min_u32_e32 v7, 1, v7
 ; GFX8-NEXT:    v_min_u32_e32 v5, 1, v5
 ; GFX8-NEXT:    v_min_u32_e32 v3, 1, v3
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v7, v8, v7
 ; GFX8-NEXT:    v_or_b32_e32 v5, v6, v5
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v7
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v5
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v3
@@ -672,18 +672,18 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)*
 ; GFX8-NEXT:    v_sub_u32_e32 v11, vcc, 32, v11
 ; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, 32, v12
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v13
+; GFX8-NEXT:    v_ldexp_f32 v1, v1, v14
 ; GFX8-NEXT:    v_ldexp_f32 v4, v4, v11
 ; GFX8-NEXT:    v_ldexp_f32 v3, v3, v12
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX8-NEXT:    v_ldexp_f32 v1, v1, v14
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v5, v0
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v5, v0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v9
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v10, vcc
-; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
 ; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 81e5c38a544b1..f189bccc7430d 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -100,22 +100,39 @@ define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
 
 ; FIXME: Ideally only one early-exit would be emitted
 define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
-; WAVE64-LABEL: test_kill_depth_var_x2_same:
-; WAVE64:       ; %bb.0:
-; WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
-; WAVE64-NEXT:    s_mov_b64 s[0:1], exec
-; WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
-; WAVE64-NEXT:    s_cbranch_scc0 BB4_2
-; WAVE64-NEXT:  ; %bb.1:
-; WAVE64-NEXT:    s_andn2_b64 exec, exec, vcc
-; WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
-; WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
-; WAVE64-NEXT:    s_cbranch_scc0 BB4_2
-; WAVE64-NEXT:    s_endpgm
-; WAVE64-NEXT:  BB4_2:
-; WAVE64-NEXT:    s_mov_b64 exec, 0
-; WAVE64-NEXT:    exp null off, off, off, off done vm
-; WAVE64-NEXT:    s_endpgm
+; SI-LABEL: test_kill_depth_var_x2_same:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b64 s[0:1], exec
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
+; SI-NEXT:    s_cbranch_scc0 BB4_2
+; SI-NEXT:  ; %bb.1:
+; SI-NEXT:    s_andn2_b64 exec, exec, vcc
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
+; SI-NEXT:    s_cbranch_scc0 BB4_2
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB4_2:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX10-WAVE64-LABEL: test_kill_depth_var_x2_same:
+; GFX10-WAVE64:       ; %bb.0:
+; GFX10-WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; GFX10-WAVE64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX10-WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
+; GFX10-WAVE64-NEXT:    s_cbranch_scc0 BB4_2
+; GFX10-WAVE64-NEXT:  ; %bb.1:
+; GFX10-WAVE64-NEXT:    s_andn2_b64 exec, exec, vcc
+; GFX10-WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; GFX10-WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
+; GFX10-WAVE64-NEXT:    s_cbranch_scc0 BB4_2
+; GFX10-WAVE64-NEXT:    s_endpgm
+; GFX10-WAVE64-NEXT:  BB4_2:
+; GFX10-WAVE64-NEXT:    s_mov_b64 exec, 0
+; GFX10-WAVE64-NEXT:    exp null off, off, off, off done vm
+; GFX10-WAVE64-NEXT:    s_endpgm
 ;
 ; GFX10-WAVE32-LABEL: test_kill_depth_var_x2_same:
 ; GFX10-WAVE32:       ; %bb.0:
@@ -141,22 +158,39 @@ define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
 
 ; FIXME: Ideally only one early-exit would be emitted
 define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
-; WAVE64-LABEL: test_kill_depth_var_x2:
-; WAVE64:       ; %bb.0:
-; WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
-; WAVE64-NEXT:    s_mov_b64 s[0:1], exec
-; WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
-; WAVE64-NEXT:    s_cbranch_scc0 BB5_2
-; WAVE64-NEXT:  ; %bb.1:
-; WAVE64-NEXT:    s_andn2_b64 exec, exec, vcc
-; WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
-; WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
-; WAVE64-NEXT:    s_cbranch_scc0 BB5_2
-; WAVE64-NEXT:    s_endpgm
-; WAVE64-NEXT:  BB5_2:
-; WAVE64-NEXT:    s_mov_b64 exec, 0
-; WAVE64-NEXT:    exp null off, off, off, off done vm
-; WAVE64-NEXT:    s_endpgm
+; SI-LABEL: test_kill_depth_var_x2:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b64 s[0:1], exec
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
+; SI-NEXT:    s_cbranch_scc0 BB5_2
+; SI-NEXT:  ; %bb.1:
+; SI-NEXT:    s_andn2_b64 exec, exec, vcc
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
+; SI-NEXT:    s_cbranch_scc0 BB5_2
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB5_2:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX10-WAVE64-LABEL: test_kill_depth_var_x2:
+; GFX10-WAVE64:       ; %bb.0:
+; GFX10-WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; GFX10-WAVE64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX10-WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
+; GFX10-WAVE64-NEXT:    s_cbranch_scc0 BB5_2
+; GFX10-WAVE64-NEXT:  ; %bb.1:
+; GFX10-WAVE64-NEXT:    s_andn2_b64 exec, exec, vcc
+; GFX10-WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
+; GFX10-WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
+; GFX10-WAVE64-NEXT:    s_cbranch_scc0 BB5_2
+; GFX10-WAVE64-NEXT:    s_endpgm
+; GFX10-WAVE64-NEXT:  BB5_2:
+; GFX10-WAVE64-NEXT:    s_mov_b64 exec, 0
+; GFX10-WAVE64-NEXT:    exp null off, off, off, off done vm
+; GFX10-WAVE64-NEXT:    s_endpgm
 ;
 ; GFX10-WAVE32-LABEL: test_kill_depth_var_x2:
 ; GFX10-WAVE32:       ; %bb.0:
@@ -182,25 +216,45 @@ define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
 }
 
 define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {
-; WAVE64-LABEL: test_kill_depth_var_x2_instructions:
-; WAVE64:       ; %bb.0:
-; WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
-; WAVE64-NEXT:    s_mov_b64 s[0:1], exec
-; WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
-; WAVE64-NEXT:    s_cbranch_scc0 BB6_2
-; WAVE64-NEXT:  ; %bb.1:
-; WAVE64-NEXT:    s_andn2_b64 exec, exec, vcc
-; WAVE64-NEXT:    ;;#ASMSTART
-; WAVE64-NEXT:    v_mov_b32_e64 v7, -1
-; WAVE64-NEXT:    ;;#ASMEND
-; WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v7
-; WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
-; WAVE64-NEXT:    s_cbranch_scc0 BB6_2
-; WAVE64-NEXT:    s_endpgm
-; WAVE64-NEXT:  BB6_2:
-; WAVE64-NEXT:    s_mov_b64 exec, 0
-; WAVE64-NEXT:    exp null off, off, off, off done vm
-; WAVE64-NEXT:    s_endpgm
+; SI-LABEL: test_kill_depth_var_x2_instructions:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b64 s[0:1], exec
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
+; SI-NEXT:    s_cbranch_scc0 BB6_2
+; SI-NEXT:  ; %bb.1:
+; SI-NEXT:    s_andn2_b64 exec, exec, vcc
+; SI-NEXT:    ;;#ASMSTART
+; SI-NEXT:    v_mov_b32_e64 v7, -1
+; SI-NEXT:    ;;#ASMEND
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v7
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
+; SI-NEXT:    s_cbranch_scc0 BB6_2
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB6_2:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX10-WAVE64-LABEL: test_kill_depth_var_x2_instructions:
+; GFX10-WAVE64:       ; %bb.0:
+; GFX10-WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; GFX10-WAVE64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX10-WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
+; GFX10-WAVE64-NEXT:    s_cbranch_scc0 BB6_2
+; GFX10-WAVE64-NEXT:  ; %bb.1:
+; GFX10-WAVE64-NEXT:    s_andn2_b64 exec, exec, vcc
+; GFX10-WAVE64-NEXT:    ;;#ASMSTART
+; GFX10-WAVE64-NEXT:    v_mov_b32_e64 v7, -1
+; GFX10-WAVE64-NEXT:    ;;#ASMEND
+; GFX10-WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v7
+; GFX10-WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
+; GFX10-WAVE64-NEXT:    s_cbranch_scc0 BB6_2
+; GFX10-WAVE64-NEXT:    s_endpgm
+; GFX10-WAVE64-NEXT:  BB6_2:
+; GFX10-WAVE64-NEXT:    s_mov_b64 exec, 0
+; GFX10-WAVE64-NEXT:    exp null off, off, off, off done vm
+; GFX10-WAVE64-NEXT:    s_endpgm
 ;
 ; GFX10-WAVE32-LABEL: test_kill_depth_var_x2_instructions:
 ; GFX10-WAVE32:       ; %bb.0:
@@ -231,40 +285,75 @@ define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {
 
 ; FIXME: why does the skip depend on the asm length in the same block?
 define amdgpu_ps float @test_kill_control_flow(i32 inreg %arg) #0 {
-; WAVE64-LABEL: test_kill_control_flow:
-; WAVE64:       ; %bb.0: ; %entry
-; WAVE64-NEXT:    s_cmp_lg_u32 s0, 0
-; WAVE64-NEXT:    s_cbranch_scc0 BB7_2
-; WAVE64-NEXT:  ; %bb.1: ; %exit
-; WAVE64-NEXT:    v_mov_b32_e32 v0, 1.0
-; WAVE64-NEXT:    s_branch BB7_5
-; WAVE64-NEXT:  BB7_2: ; %bb
-; WAVE64-NEXT:    ;;#ASMSTART
-; WAVE64-NEXT:    v_mov_b32_e64 v7, -1
-; WAVE64-NEXT:    v_nop_e64
-; WAVE64-NEXT:    v_nop_e64
-; WAVE64-NEXT:    v_nop_e64
-; WAVE64-NEXT:    v_nop_e64
-; WAVE64-NEXT:    v_nop_e64
-; WAVE64-NEXT:    v_nop_e64
-; WAVE64-NEXT:    v_nop_e64
-; WAVE64-NEXT:    v_nop_e64
-; WAVE64-NEXT:    v_nop_e64
-; WAVE64-NEXT:    v_nop_e64
-; WAVE64-NEXT:    ;;#ASMEND
-; WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v7
-; WAVE64-NEXT:    s_mov_b64 s[2:3], exec
-; WAVE64-NEXT:    s_andn2_b64 s[2:3], s[2:3], vcc
-; WAVE64-NEXT:    s_cbranch_scc0 BB7_4
-; WAVE64-NEXT:  ; %bb.3: ; %bb
-; WAVE64-NEXT:    s_andn2_b64 exec, exec, vcc
-; WAVE64-NEXT:    v_mov_b32_e32 v0, 1.0
-; WAVE64-NEXT:    s_branch BB7_5
-; WAVE64-NEXT:  BB7_4:
-; WAVE64-NEXT:    s_mov_b64 exec, 0
-; WAVE64-NEXT:    exp null off, off, off, off done vm
-; WAVE64-NEXT:    s_endpgm
-; WAVE64-NEXT:  BB7_5:
+; SI-LABEL: test_kill_control_flow:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_cmp_lg_u32 s0, 0
+; SI-NEXT:    s_cbranch_scc0 BB7_2
+; SI-NEXT:  ; %bb.1: ; %exit
+; SI-NEXT:    v_mov_b32_e32 v0, 1.0
+; SI-NEXT:    s_branch BB7_5
+; SI-NEXT:  BB7_2: ; %bb
+; SI-NEXT:    s_mov_b64 s[2:3], exec
+; SI-NEXT:    ;;#ASMSTART
+; SI-NEXT:    v_mov_b32_e64 v7, -1
+; SI-NEXT:    v_nop_e64
+; SI-NEXT:    v_nop_e64
+; SI-NEXT:    v_nop_e64
+; SI-NEXT:    v_nop_e64
+; SI-NEXT:    v_nop_e64
+; SI-NEXT:    v_nop_e64
+; SI-NEXT:    v_nop_e64
+; SI-NEXT:    v_nop_e64
+; SI-NEXT:    v_nop_e64
+; SI-NEXT:    v_nop_e64
+; SI-NEXT:    ;;#ASMEND
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v7
+; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], vcc
+; SI-NEXT:    s_cbranch_scc0 BB7_4
+; SI-NEXT:  ; %bb.3: ; %bb
+; SI-NEXT:    s_andn2_b64 exec, exec, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, 1.0
+; SI-NEXT:    s_branch BB7_5
+; SI-NEXT:  BB7_4:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB7_5:
+;
+; GFX10-WAVE64-LABEL: test_kill_control_flow:
+; GFX10-WAVE64:       ; %bb.0: ; %entry
+; GFX10-WAVE64-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10-WAVE64-NEXT:    s_cbranch_scc0 BB7_2
+; GFX10-WAVE64-NEXT:  ; %bb.1: ; %exit
+; GFX10-WAVE64-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX10-WAVE64-NEXT:    s_branch BB7_5
+; GFX10-WAVE64-NEXT:  BB7_2: ; %bb
+; GFX10-WAVE64-NEXT:    ;;#ASMSTART
+; GFX10-WAVE64-NEXT:    v_mov_b32_e64 v7, -1
+; GFX10-WAVE64-NEXT:    v_nop_e64
+; GFX10-WAVE64-NEXT:    v_nop_e64
+; GFX10-WAVE64-NEXT:    v_nop_e64
+; GFX10-WAVE64-NEXT:    v_nop_e64
+; GFX10-WAVE64-NEXT:    v_nop_e64
+; GFX10-WAVE64-NEXT:    v_nop_e64
+; GFX10-WAVE64-NEXT:    v_nop_e64
+; GFX10-WAVE64-NEXT:    v_nop_e64
+; GFX10-WAVE64-NEXT:    v_nop_e64
+; GFX10-WAVE64-NEXT:    v_nop_e64
+; GFX10-WAVE64-NEXT:    ;;#ASMEND
+; GFX10-WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v7
+; GFX10-WAVE64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10-WAVE64-NEXT:    s_andn2_b64 s[2:3], s[2:3], vcc
+; GFX10-WAVE64-NEXT:    s_cbranch_scc0 BB7_4
+; GFX10-WAVE64-NEXT:  ; %bb.3: ; %bb
+; GFX10-WAVE64-NEXT:    s_andn2_b64 exec, exec, vcc
+; GFX10-WAVE64-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX10-WAVE64-NEXT:    s_branch BB7_5
+; GFX10-WAVE64-NEXT:  BB7_4:
+; GFX10-WAVE64-NEXT:    s_mov_b64 exec, 0
+; GFX10-WAVE64-NEXT:    exp null off, off, off, off done vm
+; GFX10-WAVE64-NEXT:    s_endpgm
+; GFX10-WAVE64-NEXT:  BB7_5:
 ;
 ; GFX10-WAVE32-LABEL: test_kill_control_flow:
 ; GFX10-WAVE32:       ; %bb.0: ; %entry
@@ -332,6 +421,7 @@ define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg %arg) #0 {
 ; SI-NEXT:    v_mov_b32_e32 v9, 0
 ; SI-NEXT:    s_cbranch_scc1 BB8_3
 ; SI-NEXT:  ; %bb.1: ; %bb
+; SI-NEXT:    s_mov_b64 s[2:3], exec
 ; SI-NEXT:    ;;#ASMSTART
 ; SI-NEXT:    v_mov_b32_e64 v7, -1
 ; SI-NEXT:    v_nop_e64
@@ -347,7 +437,6 @@ define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg %arg) #0 {
 ; SI-NEXT:    v_nop_e64
 ; SI-NEXT:    ;;#ASMEND
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v7
-; SI-NEXT:    s_mov_b64 s[2:3], exec
 ; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], vcc
 ; SI-NEXT:    ;;#ASMSTART
 ; SI-NEXT:    v_mov_b32_e64 v8, -1

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 1bb7d8f1a4dd5..5d85d5d81189a 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -49,15 +49,15 @@ entry:
 ; GFX6: s_add_i32 s32, s32, 0x[[OFFSET:[0-9a-f]+]]
 ; GFX6: s_add_i32 s32, s32, 0x[[OFFSET:[0-9a-f]+]]
 ; GFX6: s_add_i32 s32, s32, 0x[[OFFSET:[0-9a-f]+]]
-; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6: s_waitcnt expcnt(0)
 ; GFX6-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9:]+}}], s32
 ; GFX6-NEXT: s_add_i32 s32, s32, 0x[[OFFSET:[0-9a-f]+]]
 ; GFX6: NumSgprs: 48
 ; GFX6: ScratchSize: 8608
 
 ; FLATSCR:           s_movk_i32 [[SOFF1:s[0-9]+]], 0x
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:      scratch_store_dwordx4 off, v[{{[0-9:]+}}], [[SOFF1]] ; 16-byte Folded Spill
+; GFX9-FLATSCR:      s_waitcnt vmcnt(0)
+; FLATSCR:           scratch_store_dwordx4 off, v[{{[0-9:]+}}], [[SOFF1]] ; 16-byte Folded Spill
 ; FLATSCR:           s_movk_i32 [[SOFF2:s[0-9]+]], 0x
 ; FLATSCR:           scratch_load_dwordx4 v[{{[0-9:]+}}], off, [[SOFF2]] ; 16-byte Folded Reload
 define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64 x i32> addrspace(1)* %in) #0 {

diff  --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index 0ae4d0321e8df..04d86771fc660 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -162,8 +162,8 @@ define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> ad
 ; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_bfe_i32 v2, v0, 0, 16
-; SI-NEXT:    v_and_b32_e32 v3, s6, v1
 ; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; SI-NEXT:    v_and_b32_e32 v3, s6, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_ashrrev_i32_e32 v0, v1, v0
 ; SI-NEXT:    v_ashrrev_i32_e32 v1, v3, v2
@@ -250,16 +250,16 @@ define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> ad
 ; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_bfe_i32 v4, v0, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; SI-NEXT:    v_bfe_i32 v5, v1, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v6, s6, v2
-; SI-NEXT:    v_and_b32_e32 v7, s6, v3
-; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
+; SI-NEXT:    v_and_b32_e32 v7, s6, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_ashrrev_i32_e32 v1, v3, v1
-; SI-NEXT:    v_ashrrev_i32_e32 v0, v2, v0
 ; SI-NEXT:    v_ashrrev_i32_e32 v3, v7, v5
+; SI-NEXT:    v_ashrrev_i32_e32 v0, v2, v0
 ; SI-NEXT:    v_ashrrev_i32_e32 v2, v6, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v3, s6, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index e36b7893c618b..fac510e8dbda8 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -24,10 +24,10 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s2, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v7, s3, v0
 ; GCN-NEXT:    v_mul_lo_u32 v6, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
@@ -45,8 +45,8 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
 ; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
 ; GCN-NEXT:    v_mul_lo_u32 v6, s2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
@@ -55,8 +55,8 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
+; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
 ; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
 ; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
@@ -99,12 +99,12 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
 ; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
 ; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
-; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
 ; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
-; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
@@ -173,8 +173,8 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0
 ; GCN-IR-NEXT:  BB0_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s2, s11, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
+; GCN-IR-NEXT:    s_lshr_b32 s2, s11, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
 ; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[2:3]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
@@ -239,9 +239,9 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
 ; GCN-NEXT:    v_subb_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-NEXT:    v_mov_b32_e32 v14, 0
 ; GCN-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GCN-NEXT:    v_rcp_f32_e32 v4, v4
+; GCN-NEXT:    v_mov_b32_e32 v14, 0
 ; GCN-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GCN-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
@@ -256,8 +256,8 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v9, v6, v4
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GCN-NEXT:    v_mul_lo_u32 v11, v4, v8
-; GCN-NEXT:    v_mul_hi_u32 v10, v4, v8
 ; GCN-NEXT:    v_mul_hi_u32 v12, v4, v9
+; GCN-NEXT:    v_mul_hi_u32 v10, v4, v8
 ; GCN-NEXT:    v_mul_hi_u32 v15, v5, v8
 ; GCN-NEXT:    v_mul_lo_u32 v8, v5, v8
 ; GCN-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
@@ -268,8 +268,8 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v10, v9, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v15, v13, vcc
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v8
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v14, v10, vcc
+; GCN-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v8
 ; GCN-NEXT:    v_addc_u32_e64 v8, vcc, v5, v9, s[4:5]
 ; GCN-NEXT:    v_mul_lo_u32 v10, v6, v8
 ; GCN-NEXT:    v_mul_hi_u32 v11, v6, v4
@@ -282,8 +282,8 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_hi_u32 v16, v4, v7
 ; GCN-NEXT:    v_mul_hi_u32 v11, v8, v6
 ; GCN-NEXT:    v_mul_lo_u32 v6, v8, v6
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
 ; GCN-NEXT:    v_mul_hi_u32 v10, v8, v7
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
 ; GCN-NEXT:    v_addc_u32_e32 v15, vcc, v14, v16, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v7, v8, v7
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
@@ -326,13 +326,13 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_sub_i32_e64 v7, s[4:5], v0, v2
 ; GCN-NEXT:    v_subbrev_u32_e64 v8, s[6:7], 0, v4, s[4:5]
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v8, v3
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v2
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[6:7]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], v8, v3
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v3, s[4:5]
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[6:7]
 ; GCN-NEXT:    v_sub_i32_e64 v10, s[4:5], v7, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
@@ -340,12 +340,12 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v9
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v10, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v6
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v6
@@ -361,10 +361,10 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GCN-IR-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GCN-IR-NEXT:    v_sub_i32_e32 v5, vcc, v2, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GCN-IR-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GCN-IR-NEXT:    v_sub_i32_e32 v5, vcc, v2, v6
 ; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v3, v6, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[5:6]
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
@@ -396,8 +396,8 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, 0, v8, vcc
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[14:15], v[7:8]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v7, s[4:5], 63, v7
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[0:1], v7
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -407,10 +407,10 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v18, vcc, -1, v5
 ; GCN-IR-NEXT:    v_addc_u32_e32 v19, vcc, -1, v6, vcc
 ; GCN-IR-NEXT:    v_not_b32_e32 v3, v3
+; GCN-IR-NEXT:    v_lshr_b64 v[14:15], v[0:1], v14
 ; GCN-IR-NEXT:    v_not_b32_e32 v9, v11
-; GCN-IR-NEXT:    v_add_i32_e32 v11, vcc, v3, v12
 ; GCN-IR-NEXT:    v_mov_b32_e32 v16, 0
-; GCN-IR-NEXT:    v_lshr_b64 v[14:15], v[0:1], v14
+; GCN-IR-NEXT:    v_add_i32_e32 v11, vcc, v3, v12
 ; GCN-IR-NEXT:    v_mov_b32_e32 v17, 0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v12, vcc, v9, v13, vcc
 ; GCN-IR-NEXT:  BB1_3: ; %udiv-do-while
@@ -423,15 +423,15 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_subb_u32_e32 v9, vcc, v19, v15, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v7, v16, v7
 ; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, 1, v11
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v13, 31, v9
 ; GCN-IR-NEXT:    v_or_b32_e32 v8, v17, v8
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v13, 31, v9
 ; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, 0, v12, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[16:17], v[11:12]
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v16
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT:    v_and_b32_e32 v9, 1, v13
 ; GCN-IR-NEXT:    v_and_b32_e32 v20, v13, v6
 ; GCN-IR-NEXT:    v_and_b32_e32 v13, v13, v5
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[16:17], v[11:12]
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v16
 ; GCN-IR-NEXT:    v_sub_i32_e64 v14, s[4:5], v3, v13
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, v17
 ; GCN-IR-NEXT:    v_mov_b32_e32 v17, v10
@@ -900,10 +900,10 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
 ; GCN-NEXT:    v_mul_lo_u32 v6, s5, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s4, v0
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
@@ -921,8 +921,8 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v5, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v3
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v3
 ; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v2, v4, s[0:1]
 ; GCN-NEXT:    v_mul_lo_u32 v5, s4, v3
 ; GCN-NEXT:    v_mul_hi_u32 v6, s4, v0
@@ -932,8 +932,8 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_mul_lo_u32 v6, s4, v0
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GCN-NEXT:    v_mul_hi_u32 v11, v0, v6
+; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
 ; GCN-NEXT:    v_mul_hi_u32 v9, v3, v6
 ; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
 ; GCN-NEXT:    v_mul_hi_u32 v8, v3, v5
@@ -950,8 +950,8 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    s_add_u32 s0, s2, s10
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; GCN-NEXT:    s_addc_u32 s1, s3, s10
-; GCN-NEXT:    s_xor_b64 s[14:15], s[0:1], s[10:11]
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GCN-NEXT:    s_xor_b64 s[14:15], s[0:1], s[10:11]
 ; GCN-NEXT:    v_mul_lo_u32 v3, s14, v2
 ; GCN-NEXT:    v_mul_hi_u32 v4, s14, v0
 ; GCN-NEXT:    v_mul_hi_u32 v5, s14, v2
@@ -980,12 +980,12 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
 ; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
 ; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
-; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
 ; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
-; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
@@ -1071,8 +1071,8 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:  BB8_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s6, s13, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
+; GCN-IR-NEXT:    s_lshr_b32 s6, s13, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
 ; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[6:7]
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
@@ -1111,8 +1111,8 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
@@ -1180,8 +1180,8 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_sext_i32_i16 s7, s0
 ; GCN-IR-NEXT:    s_ashr_i32 s0, s3, 31
 ; GCN-IR-NEXT:    s_ashr_i32 s12, s7, 31
-; GCN-IR-NEXT:    s_mov_b32 s1, s0
 ; GCN-IR-NEXT:    s_ashr_i64 s[10:11], s[6:7], 24
+; GCN-IR-NEXT:    s_mov_b32 s1, s0
 ; GCN-IR-NEXT:    s_mov_b32 s13, s12
 ; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[8:9], s[0:1]
 ; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[10:11], s[12:13]
@@ -1233,8 +1233,8 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_mov_b32 s9, 0
 ; GCN-IR-NEXT:  BB9_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s8, s13, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
+; GCN-IR-NEXT:    s_lshr_b32 s8, s13, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
 ; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[8:9]
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
@@ -1273,8 +1273,8 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
@@ -1300,8 +1300,8 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_ashr_i32 s0, s7, 31
 ; GCN-NEXT:    s_add_u32 s2, s6, s0
-; GCN-NEXT:    s_addc_u32 s3, s7, s0
 ; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_addc_u32 s3, s7, s0
 ; GCN-NEXT:    s_xor_b64 s[8:9], s[2:3], s[0:1]
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
@@ -1316,10 +1316,10 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s2, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v7, s3, v0
 ; GCN-NEXT:    v_mul_lo_u32 v6, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
@@ -1329,16 +1329,16 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_hi_u32 v9, v0, v4
 ; GCN-NEXT:    v_mul_lo_u32 v8, v3, v6
 ; GCN-NEXT:    v_mul_hi_u32 v6, v3, v6
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GCN-NEXT:    v_mul_hi_u32 v10, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v2, v9, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
 ; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
 ; GCN-NEXT:    v_mul_lo_u32 v6, s2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
@@ -1347,8 +1347,8 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
+; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
 ; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
 ; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
@@ -1380,15 +1380,15 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s8, v0
 ; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
 ; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v5
-; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
 ; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v4
-; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s8, v4
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v5
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s8, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
@@ -1448,8 +1448,8 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0
 ; GCN-IR-NEXT:  BB10_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s2, s9, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT:    s_lshr_b32 s2, s9, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[2:3]
 ; GCN-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
@@ -1509,9 +1509,9 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v1
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
 ; GCN-NEXT:    v_subb_u32_e32 v5, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
+; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
@@ -1526,8 +1526,8 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v7, v4, v2
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GCN-NEXT:    v_mul_lo_u32 v9, v2, v6
-; GCN-NEXT:    v_mul_hi_u32 v8, v2, v6
 ; GCN-NEXT:    v_mul_hi_u32 v10, v2, v7
+; GCN-NEXT:    v_mul_hi_u32 v8, v2, v6
 ; GCN-NEXT:    v_mul_hi_u32 v13, v3, v6
 ; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -1538,8 +1538,8 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v8, v7, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v13, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v6
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v12, v8, vcc
+; GCN-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v6
 ; GCN-NEXT:    v_addc_u32_e64 v6, vcc, v3, v7, s[4:5]
 ; GCN-NEXT:    v_mul_lo_u32 v8, v4, v6
 ; GCN-NEXT:    v_mul_hi_u32 v9, v4, v2
@@ -1552,8 +1552,8 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_hi_u32 v14, v2, v5
 ; GCN-NEXT:    v_mul_hi_u32 v9, v6, v4
 ; GCN-NEXT:    v_mul_lo_u32 v4, v6, v4
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
 ; GCN-NEXT:    v_mul_hi_u32 v8, v6, v5
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
 ; GCN-NEXT:    v_addc_u32_e32 v13, vcc, v12, v14, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v5, v6, v5
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
@@ -1585,9 +1585,9 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], v6, v1
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v1, s[4:5]
-; GCN-NEXT:    v_subb_u32_e32 v3, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[6:7]
 ; GCN-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v0
+; GCN-NEXT:    v_subb_u32_e32 v3, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v7
@@ -1634,8 +1634,8 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v4, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v3
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[8:9], v[3:4]
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], 24, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1644,9 +1644,9 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 58, v6
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT:    v_lshr_b64 v[8:9], 24, v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 58, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v7, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:  BB11_3: ; %udiv-do-while
@@ -1659,15 +1659,15 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v9, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v10, v0
 ; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
 ; GCN-IR-NEXT:    v_and_b32_e32 v14, v10, v1
+; GCN-IR-NEXT:    v_and_b32_e32 v15, v10, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v6
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
 ; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, v10
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v15
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, v11
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
@@ -1711,9 +1711,9 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v1
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
 ; GCN-NEXT:    v_subb_u32_e32 v5, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
+; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
@@ -1728,8 +1728,8 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v7, v4, v2
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GCN-NEXT:    v_mul_lo_u32 v9, v2, v6
-; GCN-NEXT:    v_mul_hi_u32 v8, v2, v6
 ; GCN-NEXT:    v_mul_hi_u32 v10, v2, v7
+; GCN-NEXT:    v_mul_hi_u32 v8, v2, v6
 ; GCN-NEXT:    v_mul_hi_u32 v13, v3, v6
 ; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -1740,8 +1740,8 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v8, v7, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v13, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v6
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v12, v8, vcc
+; GCN-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v6
 ; GCN-NEXT:    v_addc_u32_e64 v6, vcc, v3, v7, s[4:5]
 ; GCN-NEXT:    v_mul_lo_u32 v8, v4, v6
 ; GCN-NEXT:    v_mul_hi_u32 v9, v4, v2
@@ -1754,8 +1754,8 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_hi_u32 v14, v2, v5
 ; GCN-NEXT:    v_mul_hi_u32 v9, v6, v4
 ; GCN-NEXT:    v_mul_lo_u32 v4, v6, v4
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
 ; GCN-NEXT:    v_mul_hi_u32 v8, v6, v5
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
 ; GCN-NEXT:    v_addc_u32_e32 v13, vcc, v12, v14, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v5, v6, v5
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
@@ -1788,9 +1788,9 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], v6, v1
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v1, s[4:5]
-; GCN-NEXT:    v_subb_u32_e32 v3, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[6:7]
 ; GCN-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v0
+; GCN-NEXT:    v_subb_u32_e32 v3, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v7
@@ -1824,8 +1824,8 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[2:3]
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, s8
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[2:3]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s[4:5]
@@ -1839,8 +1839,8 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[8:9], v[2:3]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1848,11 +1848,11 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_cbranch_execz BB12_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 47, v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 47, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v5, vcc, 0, v5, vcc
 ; GCN-IR-NEXT:  BB12_3: ; %udiv-do-while
@@ -1865,15 +1865,15 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v13, v9, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v6
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v10, v0
 ; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v10
 ; GCN-IR-NEXT:    v_and_b32_e32 v14, v10, v1
+; GCN-IR-NEXT:    v_and_b32_e32 v15, v10, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
 ; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, v10
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v15
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, v11
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, v7
@@ -1922,8 +1922,8 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v0
 ; GCN-IR-NEXT:    v_add_i32_e64 v3, s[4:5], 32, v3
@@ -1947,17 +1947,17 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v5, vcc
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[9:10], v[4:5]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB13_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT:    v_lshr_b64 v[10:11], v[0:1], v9
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 0xffffffcf, v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, -1, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-IR-NEXT:    s_movk_i32 s12, 0x7fff
@@ -1971,18 +1971,18 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, 0, v11, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
-; GCN-IR-NEXT:    v_and_b32_e32 v15, 0x8000, v12
 ; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
+; GCN-IR-NEXT:    v_and_b32_e32 v15, 0x8000, v12
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v13, v5
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v9, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v8, v12
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v13
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
 ; GCN-IR-NEXT:    v_mov_b32_e32 v14, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v15
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v13
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
 ; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6

diff  --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll
index 73116bb4a985d..43ecb0d9188ba 100644
--- a/llvm/test/CodeGen/AMDGPU/srl.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl.ll
@@ -428,8 +428,8 @@ define amdgpu_kernel void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    s_mov_b64 s[0:1], s[2:3]

diff  --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index 3e8a725ca6c88..4a7a6b77c042e 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -153,8 +153,8 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_sub_u16_e32 v4, v3, v2
-; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v2
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v4, v3
+; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v2
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v4
 ; GFX8-NEXT:    s_movk_i32 s6, 0x8000
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
@@ -200,17 +200,17 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
 ; GFX6-NEXT:    s_movk_i32 s5, 0x8000
 ; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT:    v_min_i32_e32 v2, s4, v2
 ; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
 ; GFX6-NEXT:    s_mov_b32 s6, 0xffff
-; GFX6-NEXT:    v_max_i32_e32 v3, s5, v2
+; GFX6-NEXT:    v_min_i32_e32 v2, s4, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_max_i32_e32 v3, s5, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v2, s6, v3
 ; GFX6-NEXT:    v_alignbit_b32 v1, v3, v1, 16
@@ -222,8 +222,8 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_sub_u16_e32 v6, v5, v4
-; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v4
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
+; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v4
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
 ; GFX8-NEXT:    s_movk_i32 s6, 0x8000
 ; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
@@ -288,9 +288,9 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, s6, v0
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v3, v7
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
 ; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_min_i32_e32 v2, s4, v2
 ; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
@@ -306,8 +306,8 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_sub_u16_e32 v6, v5, v4
-; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v4
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
+; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v4
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
 ; GFX8-NEXT:    s_movk_i32 s6, 0x8000
 ; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
@@ -319,14 +319,14 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_sub_u16_e32 v5, v4, v2
-; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v2
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v5, v4
+; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], 0, v2
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v5
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
@@ -374,8 +374,8 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v2, s[4:5], v1, v3
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
@@ -394,8 +394,8 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v1, v3
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
@@ -433,15 +433,15 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v1, v4
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v1
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v3
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v2, v5
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v2
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
 ; GFX6-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
@@ -460,15 +460,15 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v1, v4
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v1
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v3
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v2, v5
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v2
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v2
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
@@ -508,22 +508,22 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v1, v5
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v1
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v2, v6
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v2
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v6
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v3, v7
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v3
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v7
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
@@ -542,22 +542,22 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v1, v5
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v1
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v2, v6
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v2
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v2
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v4
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v7
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v3
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v7
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v3
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
 ; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
@@ -599,51 +599,51 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v1, v9
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v1
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v9
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v2, v10
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v2
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v10
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v3, v11
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v3
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v11
 ; GFX6-NEXT:    v_bfrev_b32_e32 v16, 1
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v11
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v3, v16, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v4, v12
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v4
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v12
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v4, v16, v4
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v13
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v5
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v13
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v5
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v5, v16, v5
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v6, v14
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v6
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v14
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v6, v16, v6
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v7, v15
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v7
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v15
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v7
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 31, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v7, v16, v7
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
@@ -662,51 +662,51 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v1, v9
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v1
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v9
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v8
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v2, v10
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v2
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v10
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v2
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v8
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v3, v11
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v3
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v11
 ; GFX8-NEXT:    v_bfrev_b32_e32 v16, 1
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v11
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v3
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v8
 ; GFX8-NEXT:    v_xor_b32_e32 v3, v16, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v4, v12
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v4
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v12
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v4
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v8
 ; GFX8-NEXT:    v_xor_b32_e32 v4, v16, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v5, v13
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v5
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v13
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v5
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v8
 ; GFX8-NEXT:    v_xor_b32_e32 v5, v16, v5
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v6, v14
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v6
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v14
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v6
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v8
 ; GFX8-NEXT:    v_xor_b32_e32 v6, v16, v6
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v7, v15
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v7
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v15
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v7
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v8
 ; GFX8-NEXT:    v_xor_b32_e32 v7, v16, v7
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
@@ -763,100 +763,100 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v2, v18
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v2
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v18
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v16
 ; GFX6-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v3, v19
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v3
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v19
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v3
 ; GFX6-NEXT:    v_bfrev_b32_e32 v17, 1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v16
 ; GFX6-NEXT:    v_xor_b32_e32 v3, v17, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v4, v20
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v4
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v16
 ; GFX6-NEXT:    v_xor_b32_e32 v4, v17, v4
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v5, v21
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v5
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v21
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v5
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v16
 ; GFX6-NEXT:    v_xor_b32_e32 v5, v17, v5
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v6, v22
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v6
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v22
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v16
 ; GFX6-NEXT:    v_xor_b32_e32 v6, v17, v6
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v7, v23
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v7
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v23
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v7
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 31, v16
 ; GFX6-NEXT:    v_xor_b32_e32 v7, v17, v7
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v8, v24
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v8
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v24
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v8
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v16
 ; GFX6-NEXT:    v_xor_b32_e32 v8, v17, v8
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v9, v25
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v9
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v25
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v9
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v9, 31, v16
 ; GFX6-NEXT:    v_xor_b32_e32 v9, v17, v9
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v10, v26
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v10
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v26
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v10
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v10, 31, v16
 ; GFX6-NEXT:    v_xor_b32_e32 v10, v17, v10
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v11, v27
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v11
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v27
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v11
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v11, 31, v16
 ; GFX6-NEXT:    v_xor_b32_e32 v11, v17, v11
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v12, v28
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v12
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v28
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v12
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v12, 31, v16
 ; GFX6-NEXT:    v_xor_b32_e32 v12, v17, v12
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v13, v29
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v13
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v29
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v13
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v13, 31, v16
 ; GFX6-NEXT:    v_xor_b32_e32 v13, v17, v13
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v14, v30
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v14
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v30
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v14
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v14, 31, v16
 ; GFX6-NEXT:    v_xor_b32_e32 v14, v17, v14
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v15, v31
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v15
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v31
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v15
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v15, 31, v16
 ; GFX6-NEXT:    v_xor_b32_e32 v15, v17, v15
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
@@ -882,100 +882,100 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v2, v18
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v2
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v18
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v2
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v16
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v3, v19
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v3
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v19
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v3
 ; GFX8-NEXT:    v_bfrev_b32_e32 v17, 1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v16
 ; GFX8-NEXT:    v_xor_b32_e32 v3, v17, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v4, v20
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v4
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v4
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v16
 ; GFX8-NEXT:    v_xor_b32_e32 v4, v17, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v5, v21
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v5
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v21
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v5
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v16
 ; GFX8-NEXT:    v_xor_b32_e32 v5, v17, v5
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v6, v22
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v6
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v22
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v6
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v16
 ; GFX8-NEXT:    v_xor_b32_e32 v6, v17, v6
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v7, v23
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v7
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v23
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v7
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v16
 ; GFX8-NEXT:    v_xor_b32_e32 v7, v17, v7
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v8, v24
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v8
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v24
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v8
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v16
 ; GFX8-NEXT:    v_xor_b32_e32 v8, v17, v8
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v9, v25
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v9
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v25
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v9
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 31, v16
 ; GFX8-NEXT:    v_xor_b32_e32 v9, v17, v9
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v10, v26
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v10
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v26
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v10
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v10, 31, v16
 ; GFX8-NEXT:    v_xor_b32_e32 v10, v17, v10
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v11, v27
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v11
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v27
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v11
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v11, 31, v16
 ; GFX8-NEXT:    v_xor_b32_e32 v11, v17, v11
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v12, v28
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v12
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v28
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v12
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v12, 31, v16
 ; GFX8-NEXT:    v_xor_b32_e32 v12, v17, v12
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v13, v29
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v13
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v29
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v13
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v13, 31, v16
 ; GFX8-NEXT:    v_xor_b32_e32 v13, v17, v13
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v14, v30
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v14
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v30
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v14
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v14, 31, v16
 ; GFX8-NEXT:    v_xor_b32_e32 v14, v17, v14
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v15, v31
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v15
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v31
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v15
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v15, 31, v16
 ; GFX8-NEXT:    v_xor_b32_e32 v15, v17, v15
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
@@ -1077,8 +1077,8 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[2:3]
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
 ; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6

diff  --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index c27e03d3db73c..4f7e048701330 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -193,12 +193,12 @@ define i32 @needs_align1024_stack_args_used_inside_loop(%struct.Data addrspace(5
 
 ; GCN-LABEL: needs_align1024_stack_args_used_inside_loop:
 ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
-; GCN-NEXT: s_add_i32 s33, s32, 0xffc0
 ; GCN-NEXT: s_mov_b32 [[BP_COPY:s[0-9]+]], s34
+; GCN-NEXT: s_add_i32 s33, s32, 0xffc0
 ; GCN-NEXT: s_mov_b32 s34, s32
 ; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000
-; GCN-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NEXT: v_lshrrev_b32_e64 [[VGPR_REG:v[0-9]+]], 6, s34
+; GCN-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN: s_add_i32 s32, s32, 0x30000
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:1024
 ; GCN: buffer_load_dword v{{[0-9]+}}, [[VGPR_REG]], s[0:3], 0 offen

diff  --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
index cf2f5577df4bc..6c66b1ecc9256 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
@@ -223,9 +223,9 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-NEXT:    s_lshr_b32 s3, s6, 24
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-NEXT:    s_lshr_b32 s0, s7, 8
 ; GFX10-NEXT:    s_lshr_b32 s2, s6, 8
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-NEXT:    s_lshr_b32 s6, s5, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX10-NEXT:    s_lshr_b32 s1, s7, 24
@@ -234,8 +234,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v9, s6
 ; GFX10-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v6, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:    v_mov_b32_e32 v6, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s2
 ; GFX10-NEXT:    ds_write_b8 v0, v1 offset:12
 ; GFX10-NEXT:    ds_write_b8_d16_hi v0, v1 offset:14
@@ -248,8 +248,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    ds_write_b8 v0, v5 offset:13
 ; GFX10-NEXT:    ds_write_b8 v0, v6 offset:15
 ; GFX10-NEXT:    ds_write_b8 v0, v7 offset:9
-; GFX10-NEXT:    s_lshr_b32 s1, s4, 24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    s_lshr_b32 s1, s4, 24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX10-NEXT:    ds_write_b8 v0, v8 offset:11
@@ -374,8 +374,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out,
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
 ; GFX9-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll
index d54d41824c7cc..14c5ee77b44e0 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll
@@ -38,8 +38,8 @@ define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
-; GFX6-NEXT:    ds_write_b32 v2, v1 offset:8
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    ds_write_b32 v2, v1 offset:8
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX6-NEXT:    s_endpgm
@@ -189,12 +189,12 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_lshr_b32 s0, s6, 8
 ; GFX10-NEXT:    s_lshr_b32 s1, s6, 24
 ; GFX10-NEXT:    s_lshr_b32 s2, s5, 8
 ; GFX10-NEXT:    s_lshr_b32 s3, s5, 24
 ; GFX10-NEXT:    s_lshr_b32 s5, s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_lshr_b32 s4, s4, 24
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s1
@@ -387,8 +387,8 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    ds_write_b32 v2, v1 offset:8
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
+; GFX7-NEXT:    ds_write_b32 v2, v1 offset:8
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX7-NEXT:    s_endpgm
@@ -401,8 +401,8 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out,
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
-; GFX6-NEXT:    ds_write_b32 v2, v1 offset:8
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    ds_write_b32 v2, v1 offset:8
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX6-NEXT:    s_endpgm
@@ -458,8 +458,8 @@ define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out,
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
-; GFX6-NEXT:    ds_write_b32 v2, v1 offset:8
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    ds_write_b32 v2, v1 offset:8
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX6-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index c4d02ae3eec05..aae863834a879 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -92,8 +92,8 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s3, s2, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_and_b32 s3, s2, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    ds_write_b16 v1, v2 offset:4
@@ -244,8 +244,8 @@ define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_and_b32 s2, s2, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s3
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s2
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    ds_write_b8 v2, v3 offset:8
 ; GFX10-NEXT:    ds_write_b64 v2, v[0:1]

diff  --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
index ec3f62238f3f3..884b56e7a5304 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
@@ -164,8 +164,8 @@ define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_add_f16_e32 v1, v1, v3
 ; GFX9-NEXT:    v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_add_f16_e32 v1, v1, v3
 ; GFX9-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
@@ -175,8 +175,8 @@ define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_add_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
@@ -187,8 +187,8 @@ define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_add_f16_e32 v4, v0, v2
-; GFX10-NEXT:    v_add_f16_e32 v6, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
+; GFX10-NEXT:    v_add_f16_e32 v6, v1, v3
 ; GFX10-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_and_b32_e32 v2, v5, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
index 0aa92534c4374..773838e376cae 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
@@ -31,9 +31,9 @@ define <2 x half> @v_constained_fma_v2f16_fpexcept_strict(<2 x half> %x, <2 x ha
 ; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_fma_f16 v3, v5, v4, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_fma_f16 v0, v0, v1, v2
@@ -61,9 +61,9 @@ define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x ha
 ; GFX8-LABEL: v_constained_fma_v3f16_fpexcept_strict:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
 ; GFX8-NEXT:    v_fma_f16 v6, v8, v7, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX8-NEXT:    v_fma_f16 v0, v0, v2, v4
@@ -87,19 +87,19 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
 ; GFX9-LABEL: v_constained_fma_v4f16_fpexcept_strict:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
 ; GFX9-NEXT:    v_fma_f16 v6, v8, v7, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX9-NEXT:    v_fma_f16 v1, v1, v3, v5
 ; GFX9-NEXT:    v_fma_f16 v0, v0, v2, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT:    v_fma_f16 v1, v1, v3, v5
-; GFX9-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_fma_f16 v7, v9, v8, v7
 ; GFX9-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v7, 16, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -107,18 +107,18 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
 ; GFX8-LABEL: v_constained_fma_v4f16_fpexcept_strict:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
 ; GFX8-NEXT:    v_fma_f16 v6, v8, v7, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
 ; GFX8-NEXT:    v_fma_f16 v7, v9, v8, v7
 ; GFX8-NEXT:    v_fma_f16 v0, v0, v2, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_fma_f16 v1, v1, v3, v5
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -128,9 +128,9 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
 ; GFX10-NEXT:    v_fmac_f16_e32 v4, v0, v2
@@ -215,9 +215,9 @@ define <2 x half> @v_constained_fma_v2f16_fpexcept_strict_fneg_fneg(<2 x half> %
 ; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_fma_f16 v3, -v5, -v4, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_fma_f16 v0, -v0, -v1, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
index 4019e39df83f8..34c5e908ac310 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
@@ -164,8 +164,8 @@ define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v1, v1, v3
 ; GFX9-NEXT:    v_mul_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_mul_f16_e32 v1, v1, v3
 ; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
@@ -175,8 +175,8 @@ define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mul_f16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_mul_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mul_f16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_mul_f16_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
@@ -187,8 +187,8 @@ define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_mul_f16_e32 v4, v0, v2
-; GFX10-NEXT:    v_mul_f16_e32 v6, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
+; GFX10-NEXT:    v_mul_f16_e32 v6, v1, v3
 ; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_and_b32_e32 v2, v5, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
index 73e2b55408655..4cf75f68a7a1d 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
@@ -184,8 +184,8 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_sub_f16_e32 v1, v1, v3
 ; GFX9-NEXT:    v_sub_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_sub_f16_e32 v1, v1, v3
 ; GFX9-NEXT:    v_sub_f16_e32 v0, v0, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
@@ -195,8 +195,8 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_sub_f16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_sub_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_sub_f16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_sub_f16_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
@@ -207,8 +207,8 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_sub_f16_e32 v4, v0, v2
-; GFX10-NEXT:    v_sub_f16_e32 v6, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0xffff
+; GFX10-NEXT:    v_sub_f16_e32 v6, v1, v3
 ; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_and_b32_e32 v2, v5, v4
@@ -239,9 +239,9 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
 ; GFX9-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshr_b32 s0, s3, 16
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    s_lshr_b32 s1, s2, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_sub_f16_e32 v0, s1, v0
 ; GFX9-NEXT:    v_sub_f16_e32 v1, s2, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll
index 4cb532a1928b8..af6fe1eddadd1 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc.ll
@@ -41,7 +41,6 @@ define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, [8 x i32],
 ; GCN: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2
 ; GCN: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
 ; GCN: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
-; GCN: s_addc_u32
 ; SI: buffer_store_dword v[[LO_VREG]],
 ; VI: flat_store_dword v[{{[0-9:]+}}], v[[LO_VREG]]
 ; GCN: v_mov_b32_e32

diff  --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
index 3a50f89dbabaf..94710c3c10edd 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -88,9 +88,9 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_min_u32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_min_u32_e32 v0, s4, v0
@@ -123,17 +123,17 @@ define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_and_b32_e32 v5, s4, v5
 ; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; GFX6-NEXT:    v_min_u32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT:    v_min_u32_e32 v3, s4, v2
 ; GFX6-NEXT:    v_min_u32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_min_u32_e32 v3, s4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, 0xffff0000, v3
 ; GFX6-NEXT:    v_alignbit_b32 v1, v3, v1, 16
@@ -165,9 +165,9 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_and_b32_e32 v5, s4, v5
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
 ; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT:    v_min_u32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_and_b32_e32 v7, s4, v7

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 44c5a097f29de..cc829b8e7eb36 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -22,10 +22,10 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, s4, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s4, v0
 ; GCN-NEXT:    v_mul_lo_u32 v7, s5, v0
 ; GCN-NEXT:    v_mul_lo_u32 v6, s4, v0
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
@@ -43,8 +43,8 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
 ; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
 ; GCN-NEXT:    v_mul_lo_u32 v6, s4, v4
 ; GCN-NEXT:    v_mul_hi_u32 v7, s4, v0
@@ -54,8 +54,8 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_mul_lo_u32 v7, s4, v0
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
+; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
 ; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
 ; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
@@ -134,8 +134,8 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[0:1], 0
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s0
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[8:9], s[10:11]
 ; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
+; GCN-IR-NEXT:    s_or_b64 s[14:15], s[8:9], s[10:11]
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s1
 ; GCN-IR-NEXT:    s_min_u32 s10, s12, s8
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
@@ -168,14 +168,14 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_addc_u32 s17, s1, -1
 ; GCN-IR-NEXT:    s_not_b64 s[2:3], s[10:11]
 ; GCN-IR-NEXT:    s_add_u32 s6, s2, s12
-; GCN-IR-NEXT:    s_addc_u32 s7, s3, s11
 ; GCN-IR-NEXT:    s_mov_b32 s13, s11
+; GCN-IR-NEXT:    s_addc_u32 s7, s3, s11
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0
 ; GCN-IR-NEXT:  BB0_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s2, s9, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[14:15], 1
+; GCN-IR-NEXT:    s_lshr_b32 s2, s9, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[2:3]
 ; GCN-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
@@ -224,25 +224,25 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
 ; GCN-NEXT:    v_subb_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-NEXT:    v_mov_b32_e32 v14, 0
 ; GCN-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GCN-NEXT:    v_rcp_f32_e32 v4, v4
+; GCN-NEXT:    v_mov_b32_e32 v14, 0
 ; GCN-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GCN-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
 ; GCN-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GCN-NEXT:    v_mul_hi_u32 v8, v6, v4
+; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GCN-NEXT:    v_mul_lo_u32 v9, v6, v5
+; GCN-NEXT:    v_mul_hi_u32 v8, v6, v4
 ; GCN-NEXT:    v_mul_lo_u32 v10, v7, v4
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; GCN-NEXT:    v_mul_lo_u32 v9, v6, v4
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GCN-NEXT:    v_mul_lo_u32 v11, v4, v8
-; GCN-NEXT:    v_mul_hi_u32 v10, v4, v8
 ; GCN-NEXT:    v_mul_hi_u32 v12, v4, v9
+; GCN-NEXT:    v_mul_hi_u32 v10, v4, v8
 ; GCN-NEXT:    v_mul_hi_u32 v15, v5, v8
 ; GCN-NEXT:    v_mul_lo_u32 v8, v5, v8
 ; GCN-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
@@ -253,8 +253,8 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v10, v9, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v15, v13, vcc
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v8
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v14, v10, vcc
+; GCN-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v8
 ; GCN-NEXT:    v_addc_u32_e64 v8, vcc, v5, v9, s[4:5]
 ; GCN-NEXT:    v_mul_lo_u32 v10, v6, v8
 ; GCN-NEXT:    v_mul_hi_u32 v11, v6, v4
@@ -267,8 +267,8 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_hi_u32 v16, v4, v7
 ; GCN-NEXT:    v_mul_hi_u32 v11, v8, v6
 ; GCN-NEXT:    v_mul_lo_u32 v6, v8, v6
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
 ; GCN-NEXT:    v_mul_hi_u32 v10, v8, v7
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
 ; GCN-NEXT:    v_addc_u32_e32 v15, vcc, v14, v16, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v7, v8, v7
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
@@ -310,9 +310,9 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v3
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, v9, v8, s[4:5]
 ; GCN-NEXT:    v_add_i32_e64 v8, s[4:5], 2, v4
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
 ; GCN-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, v5, s[4:5]
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GCN-NEXT:    v_add_i32_e64 v10, s[4:5], 1, v4
@@ -362,8 +362,8 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v6
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[12:13], v[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -375,8 +375,8 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v3, vcc
 ; GCN-IR-NEXT:    v_not_b32_e32 v0, v8
 ; GCN-IR-NEXT:    v_not_b32_e32 v1, v9
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, v1, v11, vcc
 ; GCN-IR-NEXT:  BB1_3: ; %udiv-do-while
@@ -389,15 +389,15 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v15, v11, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v8, v4
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v6
-; GCN-IR-NEXT:    v_and_b32_e32 v12, v8, v2
 ; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v8
 ; GCN-IR-NEXT:    v_and_b32_e32 v13, v8, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v12, v8, v2
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v0
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v9, v5
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, v8
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v12, s[4:5], v10, v12
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v9
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, v7
@@ -729,8 +729,8 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_mul_lo_u32 v4, s2, v1
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
 ; GCN-NEXT:    v_mul_hi_u32 v7, v1, v4
+; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
 ; GCN-NEXT:    v_mul_hi_u32 v10, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
@@ -741,8 +741,8 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v5, v4, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v10, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e64 v1, s[0:1], v1, v3
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v5, vcc
+; GCN-NEXT:    v_add_i32_e64 v1, s[0:1], v1, v3
 ; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v2, v4, s[0:1]
 ; GCN-NEXT:    v_mul_lo_u32 v5, s2, v3
 ; GCN-NEXT:    v_mul_hi_u32 v6, s2, v1
@@ -751,8 +751,8 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_mul_lo_u32 v6, s2, v1
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; GCN-NEXT:    v_mul_lo_u32 v11, v1, v5
-; GCN-NEXT:    v_mul_hi_u32 v13, v1, v5
 ; GCN-NEXT:    v_mul_hi_u32 v12, v1, v6
+; GCN-NEXT:    v_mul_hi_u32 v13, v1, v5
 ; GCN-NEXT:    v_mul_hi_u32 v10, v3, v6
 ; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
 ; GCN-NEXT:    v_mul_hi_u32 v7, v3, v5
@@ -768,8 +768,8 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v2, v5, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-NEXT:    v_mov_b32_e32 v3, s8
-; GCN-NEXT:    v_alignbit_b32 v3, s6, v3, 24
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GCN-NEXT:    v_alignbit_b32 v3, s6, v3, 24
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v2
 ; GCN-NEXT:    v_mul_hi_u32 v1, v3, v1
 ; GCN-NEXT:    v_mul_hi_u32 v2, v3, v2
@@ -794,14 +794,14 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_sub_i32_e32 v7, vcc, v3, v0
 ; GCN-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v0
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v7, -1, v7, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[0:1]
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v4, s[0:1]
@@ -865,14 +865,14 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_addc_u32 s17, s3, -1
 ; GCN-IR-NEXT:    s_not_b64 s[0:1], s[10:11]
 ; GCN-IR-NEXT:    s_add_u32 s6, s0, s12
-; GCN-IR-NEXT:    s_addc_u32 s7, s1, s11
 ; GCN-IR-NEXT:    s_mov_b32 s13, s11
+; GCN-IR-NEXT:    s_addc_u32 s7, s1, s11
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_mov_b32 s1, 0
 ; GCN-IR-NEXT:  BB7_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s0, s9, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[14:15], 1
+; GCN-IR-NEXT:    s_lshr_b32 s0, s9, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[0:1]
 ; GCN-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
@@ -937,10 +937,10 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s2, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v7, s3, v0
 ; GCN-NEXT:    v_mul_lo_u32 v6, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
@@ -950,16 +950,16 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_hi_u32 v9, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v8, v3, v6
 ; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GCN-NEXT:    v_mul_hi_u32 v10, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v2, v9, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v8, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
 ; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
 ; GCN-NEXT:    v_mul_lo_u32 v6, s2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
@@ -968,8 +968,8 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
+; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
 ; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
 ; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
@@ -1008,8 +1008,8 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
 ; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
 ; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v2, s[0:1]
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_addc_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
@@ -1065,8 +1065,8 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_mov_b32 s5, 0
 ; GCN-IR-NEXT:  BB8_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s4, s9, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT:    s_lshr_b32 s4, s9, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
@@ -1118,18 +1118,18 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v1
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
 ; GCN-NEXT:    v_subb_u32_e32 v5, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
+; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
 ; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    v_mul_hi_u32 v7, v4, v2
+; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v6, v4, v3
+; GCN-NEXT:    v_mul_hi_u32 v7, v4, v2
 ; GCN-NEXT:    v_mul_lo_u32 v8, v5, v2
 ; GCN-NEXT:    v_mul_lo_u32 v9, v4, v2
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
@@ -1147,8 +1147,8 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v13, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v6
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v12, v8, vcc
+; GCN-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v6
 ; GCN-NEXT:    v_addc_u32_e64 v6, vcc, v3, v7, s[4:5]
 ; GCN-NEXT:    v_mul_lo_u32 v8, v4, v6
 ; GCN-NEXT:    v_mul_hi_u32 v9, v4, v2
@@ -1161,8 +1161,8 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_hi_u32 v14, v2, v5
 ; GCN-NEXT:    v_mul_hi_u32 v9, v6, v4
 ; GCN-NEXT:    v_mul_lo_u32 v4, v6, v4
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
 ; GCN-NEXT:    v_mul_hi_u32 v8, v6, v5
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
 ; GCN-NEXT:    v_addc_u32_e32 v13, vcc, v12, v14, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v5, v6, v5
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
@@ -1193,8 +1193,8 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_add_i32_e64 v6, s[4:5], 2, v2
 ; GCN-NEXT:    v_addc_u32_e64 v7, s[4:5], 0, v12, s[4:5]
 ; GCN-NEXT:    v_add_i32_e64 v8, s[4:5], 1, v2
-; GCN-NEXT:    v_subb_u32_e32 v3, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, v12, s[4:5]
+; GCN-NEXT:    v_subb_u32_e32 v3, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v5
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, v8, v6, s[4:5]
@@ -1221,8 +1221,8 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[6:7]
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[6:7]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s[4:5]
@@ -1236,8 +1236,8 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v6
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[8:9], v[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1245,11 +1245,11 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_cbranch_execz BB9_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 47, v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 47, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v5, vcc, 0, v5, vcc
 ; GCN-IR-NEXT:  BB9_3: ; %udiv-do-while
@@ -1262,15 +1262,15 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v13, v9, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v6
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v10, v0
 ; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v10
 ; GCN-IR-NEXT:    v_and_b32_e32 v14, v10, v1
+; GCN-IR-NEXT:    v_and_b32_e32 v15, v10, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
 ; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, v10
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v15
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, v11
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, v7
@@ -1327,8 +1327,8 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v4
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[7:8], v[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1351,18 +1351,18 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v8, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v2, v9, v2
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
-; GCN-IR-NEXT:    v_and_b32_e32 v11, 0x8000, v9
 ; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v9
+; GCN-IR-NEXT:    v_and_b32_e32 v11, 0x8000, v9
 ; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, 1, v0
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v10, v3
 ; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, v9
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, v10
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, v5
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v7, s[4:5], v7, v11
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v5
 ; GCN-IR-NEXT:    v_subb_u32_e64 v8, s[4:5], v8, v6, s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, v4
@@ -1416,25 +1416,25 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, s8
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, s2
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
 ; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
+; GCN-NEXT:    v_mul_hi_u32 v4, v0, s2
 ; GCN-NEXT:    v_mul_lo_u32 v5, v2, s2
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, s2
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s8
 ; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
-; GCN-NEXT:    s_mov_b32 s5, s9
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v9, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v11, v2, v4
+; GCN-NEXT:    s_mov_b32 s5, s9
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v10, v2, v6
@@ -1471,8 +1471,8 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT:    v_sub_i32_e32 v8, vcc, s10, v8
 ; GCN-NEXT:    v_mov_b32_e32 v5, s11
+; GCN-NEXT:    v_sub_i32_e32 v8, vcc, s10, v8
 ; GCN-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
 ; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, 24, v8
 ; GCN-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
@@ -1530,8 +1530,8 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_mov_b32 s5, 0
 ; GCN-IR-NEXT:  BB11_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s4, s7, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
+; GCN-IR-NEXT:    s_lshr_b32 s4, s7, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
@@ -1605,10 +1605,10 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v6, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v11, v9, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v10, v6, vcc
-; GCN-NEXT:    v_mul_hi_u32 v6, v2, s6
+; GCN-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v4
 ; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[4:5]
+; GCN-NEXT:    v_mul_hi_u32 v6, v2, s6
 ; GCN-NEXT:    v_mul_lo_u32 v7, v4, s6
 ; GCN-NEXT:    v_mul_lo_u32 v8, v2, s6
 ; GCN-NEXT:    v_subrev_i32_e32 v6, vcc, v2, v6
@@ -1697,8 +1697,8 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v4
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[7:8], v[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1720,14 +1720,14 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v8, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v2, v9, v2
 ; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, 1, v0
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v7, 31, v4
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v10, v3
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v7, 31, v4
 ; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1]
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v7
 ; GCN-IR-NEXT:    v_and_b32_e32 v7, 24, v7
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, v9
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v7, s[4:5], v6, v7
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v10
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, v5

diff  --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index b01e04c5e6103..65de9c05fe675 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -100,8 +100,8 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s6, v3
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
@@ -165,8 +165,8 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i3
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
@@ -209,8 +209,8 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i3
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, s2, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, s2, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s2, v0
@@ -219,14 +219,14 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i3
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, s6
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, s7
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v0
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s7, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -345,8 +345,8 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX6-NEXT:    v_mul_f32_e32 v2, s13, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
+; GFX6-NEXT:    v_mul_f32_e32 v2, s13, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
@@ -361,8 +361,8 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
 ; GFX6-NEXT:    s_sub_i32 s4, 0, s11
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
@@ -370,19 +370,19 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s10
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s11
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s11
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
@@ -423,8 +423,8 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v4, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX8-NEXT:    v_mul_f32_e32 v2, s12, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, s8
+; GFX8-NEXT:    v_mul_f32_e32 v2, s12, v3
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
@@ -438,8 +438,8 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
 ; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s9, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v4, v5
 ; GFX8-NEXT:    s_sub_i32 s2, 0, s11
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
@@ -447,19 +447,19 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s9, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v5, s2, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v2, s10
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v5
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s6, v2
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s10, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v3, s7, v3
+; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s10, v2
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GFX8-NEXT:    v_mul_lo_u32 v3, v3, s11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s10, v2
-; GFX8-NEXT:    v_mul_lo_u32 v3, v3, s11
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s7, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index 9017a5e0db46b..0091f8e35e7e9 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -241,14 +241,14 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)*
 ; GFX8-NEXT:    s_flbit_i32_b32 s6, s3
 ; GFX8-NEXT:    s_flbit_i32_b32 s7, s1
 ; GFX8-NEXT:    s_min_u32 s6, s6, 32
-; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
 ; GFX8-NEXT:    s_min_u32 s7, s7, 32
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
 ; GFX8-NEXT:    s_min_u32 s2, s2, 1
-; GFX8-NEXT:    s_min_u32 s0, s0, 1
 ; GFX8-NEXT:    s_or_b32 s2, s3, s2
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_min_u32 s0, s0, 1
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX8-NEXT:    s_or_b32 s0, s1, s0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s0
 ; GFX8-NEXT:    s_sub_i32 s0, 32, s6
 ; GFX8-NEXT:    v_ldexp_f32 v1, v0, s0
@@ -341,26 +341,26 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)*
 ; GFX8-NEXT:    v_min_u32_e32 v11, 32, v11
 ; GFX8-NEXT:    v_min_u32_e32 v12, 32, v12
 ; GFX8-NEXT:    v_min_u32_e32 v13, 32, v13
-; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v11, v[5:6]
-; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v12, v[3:4]
 ; GFX8-NEXT:    v_lshlrev_b64 v[7:8], v0, v[7:8]
 ; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, 32, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v11, v[5:6]
+; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v12, v[3:4]
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v13, v[1:2]
 ; GFX8-NEXT:    v_min_u32_e32 v7, 1, v7
 ; GFX8-NEXT:    v_min_u32_e32 v5, 1, v5
 ; GFX8-NEXT:    v_min_u32_e32 v3, 1, v3
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT:    v_or_b32_e32 v5, v6, v5
 ; GFX8-NEXT:    v_or_b32_e32 v7, v8, v7
+; GFX8-NEXT:    v_or_b32_e32 v5, v6, v5
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v7
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v5
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, v0
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v13
 ; GFX8-NEXT:    v_sub_u32_e32 v11, vcc, 32, v11
 ; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, 32, v12
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v13
 ; GFX8-NEXT:    v_ldexp_f32 v1, v1, v14
 ; GFX8-NEXT:    v_ldexp_f32 v0, v4, v11
 ; GFX8-NEXT:    v_ldexp_f32 v3, v3, v12
@@ -418,8 +418,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)*
 ; GFX8-NEXT:    s_min_u32 s9, s3, 32
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[6:7], s8
 ; GFX8-NEXT:    s_min_u32 s2, s2, 1
-; GFX8-NEXT:    s_or_b32 s2, s3, s2
 ; GFX8-NEXT:    s_lshl_b64 s[4:5], s[4:5], s9
+; GFX8-NEXT:    s_or_b32 s2, s3, s2
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX8-NEXT:    s_min_u32 s2, s4, 1
 ; GFX8-NEXT:    s_or_b32 s2, s5, s2
@@ -524,19 +524,19 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)*
 ; GFX8-NEXT:    v_min_u32_e32 v11, 32, v11
 ; GFX8-NEXT:    v_min_u32_e32 v12, 32, v12
 ; GFX8-NEXT:    v_min_u32_e32 v13, 32, v13
-; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v11, v[5:6]
-; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v12, v[3:4]
 ; GFX8-NEXT:    v_lshlrev_b64 v[7:8], v0, v[7:8]
 ; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, 32, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v11, v[5:6]
+; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v12, v[3:4]
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v13, v[1:2]
 ; GFX8-NEXT:    v_min_u32_e32 v7, 1, v7
 ; GFX8-NEXT:    v_min_u32_e32 v5, 1, v5
 ; GFX8-NEXT:    v_min_u32_e32 v3, 1, v3
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v7, v8, v7
 ; GFX8-NEXT:    v_or_b32_e32 v5, v6, v5
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v7
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v5
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v3
@@ -544,18 +544,18 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)*
 ; GFX8-NEXT:    v_sub_u32_e32 v11, vcc, 32, v11
 ; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, 32, v12
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v13
+; GFX8-NEXT:    v_ldexp_f32 v1, v1, v14
 ; GFX8-NEXT:    v_ldexp_f32 v4, v4, v11
 ; GFX8-NEXT:    v_ldexp_f32 v3, v3, v12
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX8-NEXT:    v_ldexp_f32 v1, v1, v14
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v5, v0
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v5, v0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v9
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v10, vcc
-; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
 ; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 1822c76618f3d..a0a4b73262a79 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -24,10 +24,10 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s2, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v7, s3, v0
 ; GCN-NEXT:    v_mul_lo_u32 v6, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
@@ -45,8 +45,8 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
 ; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
 ; GCN-NEXT:    v_mul_lo_u32 v6, s2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
@@ -55,8 +55,8 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
+; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
 ; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
 ; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
@@ -99,12 +99,12 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
 ; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
 ; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
-; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
 ; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
-; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
@@ -173,8 +173,8 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0
 ; GCN-IR-NEXT:  BB0_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s2, s11, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
+; GCN-IR-NEXT:    s_lshr_b32 s2, s11, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
 ; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[2:3]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
@@ -234,25 +234,25 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
 ; GCN-NEXT:    v_subb_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-NEXT:    v_mov_b32_e32 v14, 0
 ; GCN-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GCN-NEXT:    v_rcp_f32_e32 v4, v4
+; GCN-NEXT:    v_mov_b32_e32 v14, 0
 ; GCN-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GCN-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
 ; GCN-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GCN-NEXT:    v_mul_hi_u32 v8, v6, v4
+; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GCN-NEXT:    v_mul_lo_u32 v9, v6, v5
+; GCN-NEXT:    v_mul_hi_u32 v8, v6, v4
 ; GCN-NEXT:    v_mul_lo_u32 v10, v7, v4
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; GCN-NEXT:    v_mul_lo_u32 v9, v6, v4
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GCN-NEXT:    v_mul_lo_u32 v11, v4, v8
-; GCN-NEXT:    v_mul_hi_u32 v10, v4, v8
 ; GCN-NEXT:    v_mul_hi_u32 v12, v4, v9
+; GCN-NEXT:    v_mul_hi_u32 v10, v4, v8
 ; GCN-NEXT:    v_mul_hi_u32 v15, v5, v8
 ; GCN-NEXT:    v_mul_lo_u32 v8, v5, v8
 ; GCN-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
@@ -263,8 +263,8 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v10, v9, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v15, v13, vcc
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v8
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v14, v10, vcc
+; GCN-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v8
 ; GCN-NEXT:    v_addc_u32_e64 v8, vcc, v5, v9, s[4:5]
 ; GCN-NEXT:    v_mul_lo_u32 v10, v6, v8
 ; GCN-NEXT:    v_mul_hi_u32 v11, v6, v4
@@ -277,8 +277,8 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_hi_u32 v16, v4, v7
 ; GCN-NEXT:    v_mul_hi_u32 v11, v8, v6
 ; GCN-NEXT:    v_mul_lo_u32 v6, v8, v6
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
 ; GCN-NEXT:    v_mul_hi_u32 v10, v8, v7
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
 ; GCN-NEXT:    v_addc_u32_e32 v15, vcc, v14, v16, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v7, v8, v7
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
@@ -316,13 +316,13 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_sub_i32_e64 v6, s[4:5], v0, v2
 ; GCN-NEXT:    v_subbrev_u32_e64 v7, s[6:7], 0, v4, s[4:5]
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v2
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], v7, v3
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v3, s[4:5]
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[6:7]
 ; GCN-NEXT:    v_sub_i32_e64 v9, s[4:5], v6, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
@@ -330,10 +330,10 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v8
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -371,8 +371,8 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v6, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v5
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[12:13], v[5:6]
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -382,9 +382,9 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, -1, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, -1, v3, vcc
 ; GCN-IR-NEXT:    v_not_b32_e32 v6, v8
-; GCN-IR-NEXT:    v_mov_b32_e32 v14, 0
 ; GCN-IR-NEXT:    v_lshr_b64 v[12:13], v[0:1], v12
 ; GCN-IR-NEXT:    v_not_b32_e32 v7, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v14, 0
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, v6, v10
 ; GCN-IR-NEXT:    v_mov_b32_e32 v15, 0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, v7, v11, vcc
@@ -398,15 +398,15 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v17, v11, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v14, v4
 ; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v8
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v15, v5
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
 ; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, 0, v9, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[14:15], v[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v14
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
 ; GCN-IR-NEXT:    v_and_b32_e32 v13, v12, v3
 ; GCN-IR-NEXT:    v_and_b32_e32 v12, v12, v2
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[14:15], v[8:9]
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v14
 ; GCN-IR-NEXT:    v_sub_i32_e64 v12, s[4:5], v10, v12
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, v15
 ; GCN-IR-NEXT:    v_mov_b32_e32 v15, v7
@@ -758,10 +758,10 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s2, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v7, s3, v0
 ; GCN-NEXT:    v_mul_lo_u32 v6, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
@@ -771,16 +771,16 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_hi_u32 v9, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v8, v3, v6
 ; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GCN-NEXT:    v_mul_hi_u32 v10, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v2, v9, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v8, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
 ; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
 ; GCN-NEXT:    v_mul_lo_u32 v6, s2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
@@ -789,8 +789,8 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
+; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
 ; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
 ; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
@@ -822,15 +822,15 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s6, v0
 ; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
 ; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s7, v5
-; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
 ; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s6, v4
-; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s6, v4
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s7, v5
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s6, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
@@ -885,8 +885,8 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_mov_b32 s5, 0
 ; GCN-IR-NEXT:  BB6_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s4, s9, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT:    s_lshr_b32 s4, s9, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
@@ -967,25 +967,25 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s4
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, s2
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
 ; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
+; GCN-NEXT:    v_mul_hi_u32 v4, v0, s2
 ; GCN-NEXT:    v_mul_lo_u32 v5, v2, s2
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, s2
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s8, s4
 ; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
-; GCN-NEXT:    s_mov_b32 s9, s5
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v9, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v11, v2, v4
+; GCN-NEXT:    s_mov_b32 s9, s5
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v10, v2, v6
@@ -1014,12 +1014,12 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
-; GCN-NEXT:    v_mul_hi_u32 v2, v0, 24
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, 24
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v0
 ; GCN-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
@@ -1079,8 +1079,8 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_mov_b32 s5, 0
 ; GCN-IR-NEXT:  BB7_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s4, s9, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT:    s_lshr_b32 s4, s9, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
@@ -1111,8 +1111,8 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[10:11]
 ; GCN-IR-NEXT:  BB7_6: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, v0, 24
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GCN-IR-NEXT:    v_mul_hi_u32 v2, v0, 24
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
@@ -1143,18 +1143,18 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v1
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
 ; GCN-NEXT:    v_subb_u32_e32 v5, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
+; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
 ; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    v_mul_hi_u32 v7, v4, v2
+; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v6, v4, v3
+; GCN-NEXT:    v_mul_hi_u32 v7, v4, v2
 ; GCN-NEXT:    v_mul_lo_u32 v8, v5, v2
 ; GCN-NEXT:    v_mul_lo_u32 v9, v4, v2
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
@@ -1172,8 +1172,8 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v13, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v6
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v12, v8, vcc
+; GCN-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v6
 ; GCN-NEXT:    v_addc_u32_e64 v6, vcc, v3, v7, s[4:5]
 ; GCN-NEXT:    v_mul_lo_u32 v8, v4, v6
 ; GCN-NEXT:    v_mul_hi_u32 v9, v4, v2
@@ -1186,8 +1186,8 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_hi_u32 v14, v2, v5
 ; GCN-NEXT:    v_mul_hi_u32 v9, v6, v4
 ; GCN-NEXT:    v_mul_lo_u32 v4, v6, v4
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
 ; GCN-NEXT:    v_mul_hi_u32 v8, v6, v5
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
 ; GCN-NEXT:    v_addc_u32_e32 v13, vcc, v12, v14, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v5, v6, v5
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
@@ -1215,9 +1215,9 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], v6, v1
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v1, s[4:5]
-; GCN-NEXT:    v_subb_u32_e32 v3, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[6:7]
 ; GCN-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v0
+; GCN-NEXT:    v_subb_u32_e32 v3, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v7
@@ -1245,8 +1245,8 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[2:3]
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, s8
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[2:3]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s[4:5]
@@ -1260,8 +1260,8 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[8:9], v[2:3]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1269,11 +1269,11 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_cbranch_execz BB8_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 47, v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 47, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v5, vcc, 0, v5, vcc
 ; GCN-IR-NEXT:  BB8_3: ; %udiv-do-while
@@ -1286,15 +1286,15 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v13, v9, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v6
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v10, v0
 ; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v10
 ; GCN-IR-NEXT:    v_and_b32_e32 v14, v10, v1
+; GCN-IR-NEXT:    v_and_b32_e32 v15, v10, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
 ; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, v10
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v15
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, v11
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, v7
@@ -1357,17 +1357,17 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v3, vcc
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[7:8], v[2:3]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB9_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT:    v_lshr_b64 v[8:9], v[0:1], v7
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 0xffffffcf, v6
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT:    v_addc_u32_e64 v7, s[4:5], 0, -1, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    s_movk_i32 s12, 0x7fff
@@ -1381,18 +1381,18 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v9, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT:    v_and_b32_e32 v13, 0x8000, v10
 ; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v13, 0x8000, v10
 ; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v6
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
 ; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, v10
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, v11
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v13
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, v11
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
 ; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v12, s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4

diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
index c1062c82ba5a6..0c23ee9cf64a5 100644
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -110,9 +110,9 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_and_b32_e32 v4, s4, v3
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_max_u32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_max_u32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
@@ -152,9 +152,9 @@ define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_and_b32_e32 v6, s4, v4
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_max_u32_e32 v1, v1, v6
 ; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_max_u32_e32 v1, v1, v6
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_and_b32_e32 v5, s4, v5
@@ -201,9 +201,9 @@ define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_and_b32_e32 v9, s4, v5
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_max_u32_e32 v1, v1, v9
 ; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
 ; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_max_u32_e32 v1, v1, v9
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
 ; GFX6-NEXT:    v_and_b32_e32 v8, s4, v7

diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
index c1c03baaf5122..393e80f6be132 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -16,8 +16,8 @@ target datalayout = "A5"
 ; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
 ; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1
 ; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0
-; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
 ; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
+; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
 ; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
 ; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc
 ; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
@@ -322,11 +322,11 @@ entry:
 ; GCN-ALLOCA-COUNT-4: buffer_store_dword
 ; GCN-ALLOCA:         buffer_load_dword
 
-; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
 ; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1
 ; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0
-; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
+; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
 ; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
+; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
 ; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
 ; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc
 ; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0

diff --git a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
index 7d6f80c7daa94..da3d5f428a4a9 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
@@ -36,12 +36,12 @@ define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0xd
-; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b64 s[4:5], s[10:11]
-; GCN-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    buffer_load_dwordx4 v[1:4], v[4:5], s[4:7], 0 addr64
 ; GCN-NEXT:    s_load_dword s14, s[0:1], 0xf
 ; GCN-NEXT:    s_cmp_eq_u32 s13, 3
@@ -60,10 +60,10 @@ define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[2:3]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_eq_u32 s14, 2
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
@@ -113,12 +113,12 @@ define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(float addrspace(1)*
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b64 s[0:1], s[6:7]
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v5, v2
 ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 addr64 glc

diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
index 2f3007d06ac04..39380db968410 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
@@ -10,8 +10,8 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) #0 {
 ; GCN-NEXT:    ; =>This Loop Header: Depth=1
 ; GCN-NEXT:    ; Child Loop BB0_2 Depth 2
 ; GCN-NEXT:    v_add_co_u32 v6, vcc_lo, v0, 8
-; GCN-NEXT:    s_mov_b32 s5, exec_lo
 ; GCN-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
+; GCN-NEXT:    s_mov_b32 s5, exec_lo
 ; GCN-NEXT:    s_clause 0x1
 ; GCN-NEXT:    flat_load_dwordx2 v[4:5], v[6:7]
 ; GCN-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
@@ -26,8 +26,8 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) #0 {
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[4:5]
 ; GCN-NEXT:    s_and_b32 s4, vcc_lo, s4
 ; GCN-NEXT:    s_and_saveexec_b32 s4, s4
-; GCN-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
 ; GCN-NEXT:    buffer_store_dword v0, v0, s[8:11], 0 offen
+; GCN-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
 ; GCN-NEXT:    s_waitcnt_depctr 0xffe3
 ; GCN-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
 ; GCN-NEXT:    s_cbranch_execnz BB0_2

diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
index 09572a2b85bb6..3087fdf9d5474 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
@@ -166,8 +166,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %
 ; SI-NEXT:    s_mov_b32 s39, 0x31c16000
 ; SI-NEXT:    s_add_u32 s36, s36, s1
 ; SI-NEXT:    s_addc_u32 s37, s37, 0
-; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    s_mov_b32 s32, 0
+; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; SI-NEXT:    s_xor_b32 s33, exec_lo, s0
 ; SI-NEXT:    s_cbranch_execz BB3_4
@@ -239,13 +239,13 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float(
 ; SI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; SI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; SI-NEXT:    s_mov_b32 s38, -1
-; SI-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 6, v0
 ; SI-NEXT:    v_mov_b32_e32 v40, v1
+; SI-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 6, v0
 ; SI-NEXT:    s_mov_b32 s39, 0x31c16000
 ; SI-NEXT:    s_add_u32 s36, s36, s1
 ; SI-NEXT:    s_addc_u32 s37, s37, 0
-; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:    s_mov_b32 s32, 0
+; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; SI-NEXT:    s_xor_b32 s33, exec_lo, s0
 ; SI-NEXT:    s_cbranch_execz BB4_4

diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index e17c98ab7c45f..4052166c5a8e6 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -148,8 +148,8 @@ define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
-; VI-NEXT:    v_mov_b32_e32 v2, 2
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_mov_b32_e32 v2, 2
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x0

diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index e6ee4a519caa8..7061a2370938e 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -518,8 +518,8 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
 ; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v0
 ; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v1, vcc
 ; GFX9-O3-NEXT:    v_mul_lo_u32 v4, v3, v0
-; GFX9-O3-NEXT:    v_mul_hi_u32 v5, v2, v0
 ; GFX9-O3-NEXT:    v_mul_lo_u32 v1, v2, v1
+; GFX9-O3-NEXT:    v_mul_hi_u32 v5, v2, v0
 ; GFX9-O3-NEXT:    v_mul_lo_u32 v0, v2, v0
 ; GFX9-O3-NEXT:    v_add3_u32 v1, v5, v1, v4
 ; GFX9-O3-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
@@ -823,9 +823,9 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
-; GFX9-O3-NEXT:    v_mov_b32_e32 v11, v5
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v4
+; GFX9-O3-NEXT:    v_mov_b32_e32 v11, v5
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v12, v6
 ; GFX9-O3-NEXT:    buffer_store_dwordx4 v[7:10], v0, s[4:7], 0 offen
 ; GFX9-O3-NEXT:    buffer_store_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16

diff --git a/llvm/test/CodeGen/AMDGPU/xor3.ll b/llvm/test/CodeGen/AMDGPU/xor3.ll
index 5cd41c0c9b193..b43fb96de2c16 100644
--- a/llvm/test/CodeGen/AMDGPU/xor3.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor3.ll
@@ -138,9 +138,9 @@ define amdgpu_ps <2 x float> @xor3_multiuse_inner(i32 %a, i32 %b, i32 %c) {
 define amdgpu_ps float @xor3_uniform_vgpr(float inreg %a, float inreg %b, float inreg %c) {
 ; GFX9-LABEL: xor3_uniform_vgpr:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x40400000
 ; GFX9-NEXT:    v_add_f32_e64 v0, s2, 1.0
 ; GFX9-NEXT:    v_add_f32_e64 v1, s3, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x40400000
 ; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v2


        


More information about the llvm-commits mailing list