[llvm] [AArch64] Improve scheduling latency into Bundles (PR #86310)

David Green via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 1 05:46:51 PDT 2024


https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/86310

From ef0c219f16345c99f293042e986b60bebdb27e1e Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 1 Apr 2024 13:46:35 +0100
Subject: [PATCH] [AArch64] Improve scheduling latency into Bundles

By default, scheduling dependencies into a BUNDLE are given a latency of 0,
as they operate on the implicit register that represents the bundle. This
patch changes that for AArch64 so that the latency is instead taken from the
relevant instruction inside the bundle. This essentially assumes that the
bundled instructions execute in a single cycle, which is probably OK for
AArch64 given that bundles are mostly used for MOVPRFX sequences. The more
accurate latencies can produce slightly better schedules, especially for
in-order cores.
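
For illustration, here is a condensed sketch of the new hook, covering only
the use-side case (the full implementation added below also looks through
Def-side bundles, and the SelectionDAG scheduler passes a null SchedModel so
the hook bails out there):

  void AArch64Subtarget::adjustSchedDependency(
      SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
      const TargetSchedModel *SchedModel) const {
    // Sketch covers the use-side only; the full patch symmetrically looks
    // through Def-side bundles as well.
    if (!SchedModel || Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
        !Def->isInstr() || !Use->isInstr() ||
        Use->getInstr()->getOpcode() != TargetOpcode::BUNDLE)
      return;

    // Find the first instruction inside the bundle that reads the register
    // carried by this dependence.
    const MachineInstr *UseMI = Use->getInstr();
    Register Reg = UseMI->getOperand(UseOpIdx).getReg();
    for (const auto &Op : const_mi_bundle_ops(*UseMI)) {
      if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) {
        UseMI = Op.getParent();
        UseOpIdx = Op.getOperandNo();
        break;
      }
    }

    // Replace the default latency of 0 with the latency computed from the
    // bundled instruction itself.
    Dep.setLatency(SchedModel->computeOperandLatency(Def->getInstr(), DefOpIdx,
                                                     UseMI, UseOpIdx));
  }

The misched-bundle.mir changes below show the effect: the loads feeding the
MOVPRFX/FMLA bundle now carry latency 3 instead of 0, and the bundle's result
carries latency 4 into the store, so the scheduler orders the region
accordingly.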
---
 .../llvm/CodeGen/TargetSubtargetInfo.h        |   4 +-
 llvm/lib/CodeGen/MachinePipeliner.cpp         |   3 +-
 llvm/lib/CodeGen/ScheduleDAGInstrs.cpp        |   8 +-
 .../SelectionDAG/ScheduleDAGSDNodes.cpp       |   2 +-
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp  |  39 ++++++
 llvm/lib/Target/AArch64/AArch64Subtarget.h    |   3 +
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |   5 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |   3 +-
 llvm/lib/Target/Hexagon/HexagonSubtarget.cpp  |   6 +-
 llvm/lib/Target/Hexagon/HexagonSubtarget.h    |   3 +-
 ...x-deinterleaving-add-mull-scalable-fast.ll |   6 +-
 .../CodeGen/AArch64/llvm-ir-to-intrinsic.ll   |   2 +-
 llvm/test/CodeGen/AArch64/misched-bundle.mir  |  36 +++---
 .../AArch64/sve-fixed-length-build-vector.ll  |   2 +-
 .../AArch64/sve-fixed-length-fp-arith.ll      |  24 ++--
 .../AArch64/sve-fixed-length-fp-fma.ll        |   6 +-
 .../AArch64/sve-fixed-length-fp-minmax.ll     |  24 ++--
 .../AArch64/sve-fixed-length-int-arith.ll     |   8 +-
 .../AArch64/sve-fixed-length-int-minmax.ll    |  32 ++---
 .../AArch64/sve-fixed-length-int-mulh.ll      |  16 +--
 .../AArch64/sve-fixed-length-int-rem.ll       |  96 +++++++-------
 .../AArch64/sve-fixed-length-int-shifts.ll    |  24 ++--
 llvm/test/CodeGen/AArch64/sve-fpext-load.ll   |   2 +-
 llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll   |  70 +++++-----
 llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll   | 122 +++++++++---------
 .../AArch64/sve-gather-scatter-addr-opts.ll   |  10 +-
 .../AArch64/sve-masked-gather-legalize.ll     |   4 +-
 llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll |  24 ++--
 llvm/test/CodeGen/AArch64/sve-split-fcvt.ll   |  34 ++---
 ...aming-mode-fixed-length-fp-extend-trunc.ll |   6 +-
 ...sve-streaming-mode-fixed-length-int-div.ll |   8 +-
 ...sve-streaming-mode-fixed-length-int-rem.ll |  32 ++---
 ...e-streaming-mode-fixed-length-int-to-fp.ll |  20 +--
 llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll |  16 +--
 .../test/CodeGen/AArch64/sve-vecreduce-dot.ll |   2 +-
 llvm/test/CodeGen/AArch64/sve2-xar.ll         |   2 +-
 36 files changed, 377 insertions(+), 327 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
index 7f8ed5c5019890..5023e29ce145ac 100644
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -235,7 +235,9 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
   // and UseOpIdx are the indices of the operands in Def and Use, respectively.
   // Otherwise, either may be -1.
   virtual void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
-                                     int UseOpIdx, SDep &Dep) const {}
+                                     int UseOpIdx, SDep &Dep,
+                                     const TargetSchedModel *SchedModel) const {
+  }
 
   // For use with PostRAScheduling: get the anti-dependence breaking that should
   // be performed before post-RA scheduling.
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index b9c6765be445a0..7d34e123b2bfbd 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -919,7 +919,8 @@ void SwingSchedulerDAG::updatePhiDependences() {
           if (!MI->isPHI()) {
             SDep Dep(SU, SDep::Data, Reg);
             Dep.setLatency(0);
-            ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep);
+            ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
+                                     &SchedModel);
             I.addPred(Dep);
           } else {
             HasPhiUse = Reg;
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index 51ede7992af53d..c848ce4af86ccc 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -282,7 +282,7 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
       } else {
         Dep.setLatency(0);
       }
-      ST.adjustSchedDependency(SU, OperIdx, UseSU, UseOpIdx, Dep);
+      ST.adjustSchedDependency(SU, OperIdx, UseSU, UseOpIdx, Dep, &SchedModel);
       UseSU->addPred(Dep);
     }
   }
@@ -323,7 +323,8 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
           Dep.setLatency(
               SchedModel.computeOutputLatency(MI, OperIdx, DefInstr));
         }
-        ST.adjustSchedDependency(SU, OperIdx, DefSU, I->OpIdx, Dep);
+        ST.adjustSchedDependency(SU, OperIdx, DefSU, I->OpIdx, Dep,
+                                 &SchedModel);
         DefSU->addPred(Dep);
       }
     }
@@ -453,7 +454,8 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
         SDep Dep(SU, SDep::Data, Reg);
         Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use,
                                                         I->OperandIndex));
-        ST.adjustSchedDependency(SU, OperIdx, UseSU, I->OperandIndex, Dep);
+        ST.adjustSchedDependency(SU, OperIdx, UseSU, I->OperandIndex, Dep,
+                                 &SchedModel);
         UseSU->addPred(Dep);
       }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index c9e2745f00c958..1066b1423ae4e4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -512,7 +512,7 @@ void ScheduleDAGSDNodes::AddSchedEdges() {
         Dep.setLatency(OpLatency);
         if (!isChain && !UnitLatencies) {
           computeOperandLatency(OpN, N, i, Dep);
-          ST.adjustSchedDependency(OpSU, DefIdx, &SU, i, Dep);
+          ST.adjustSchedDependency(OpSU, DefIdx, &SU, i, Dep, nullptr);
         }
 
         if (!SU.addPred(Dep) && !Dep.isCtrl() && OpSU->NumRegDefsLeft > 1) {
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index bb268b2ba926cb..9eb1ecbb031580 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -472,6 +472,45 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
   Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
 }
 
+void AArch64Subtarget::adjustSchedDependency(
+    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
+    const TargetSchedModel *SchedModel) const {
+  if (!SchedModel || Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
+      !Def->isInstr() || !Use->isInstr() ||
+      (Def->getInstr()->getOpcode() != TargetOpcode::BUNDLE &&
+       Use->getInstr()->getOpcode() != TargetOpcode::BUNDLE))
+    return;
+
+  // If the Def is a BUNDLE, find the last instruction in the bundle that defs
+  // the register.
+  const MachineInstr *DefMI = Def->getInstr();
+  if (DefMI->getOpcode() == TargetOpcode::BUNDLE) {
+    Register Reg = DefMI->getOperand(DefOpIdx).getReg();
+    for (const auto &Op : const_mi_bundle_ops(*DefMI)) {
+      if (Op.isReg() && Op.isDef() && Op.getReg() == Reg) {
+        DefMI = Op.getParent();
+        DefOpIdx = Op.getOperandNo();
+      }
+    }
+  }
+
+  // If the Use is a BUNDLE, find the first instruction that uses the Reg.
+  const MachineInstr *UseMI = Use->getInstr();
+  if (UseMI->getOpcode() == TargetOpcode::BUNDLE) {
+    Register Reg = UseMI->getOperand(UseOpIdx).getReg();
+    for (const auto &Op : const_mi_bundle_ops(*UseMI)) {
+      if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) {
+        UseMI = Op.getParent();
+        UseOpIdx = Op.getOperandNo();
+        break;
+      }
+    }
+  }
+
+  Dep.setLatency(
+      SchedModel->computeOperandLatency(DefMI, DefOpIdx, UseMI, UseOpIdx));
+}
+
 bool AArch64Subtarget::enableEarlyIfConversion() const {
   return EnableEarlyIfConvert;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 95bef7a76bcab7..c5ebd4c6cc615a 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -354,6 +354,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
 
   void overrideSchedPolicy(MachineSchedPolicy &Policy,
                            unsigned NumRegionInstrs) const override;
+  void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
+                             SDep &Dep,
+                             const TargetSchedModel *SchedModel) const override;
 
   bool enableEarlyIfConversion() const override;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index fa77b94fc22def..4e35a6e405a5fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -849,8 +849,9 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
   return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
 }
 
-void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
-                                         int UseOpIdx, SDep &Dep) const {
+void GCNSubtarget::adjustSchedDependency(
+    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
+    const TargetSchedModel *SchedModel) const {
   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
       !Def->isInstr() || !Use->isInstr())
     return;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4da10beabe3162..7763acd6602c4b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1495,7 +1495,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   }
 
   void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
-                             SDep &Dep) const override;
+                             SDep &Dep,
+                             const TargetSchedModel *SchedModel) const override;
 
   // \returns true if it's beneficial on this subtarget for the scheduler to
   // cluster stores as well as loads.
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
index 2d320e6b0cad75..da8ab5c4b21bbf 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -437,9 +437,9 @@ bool HexagonSubtarget::useAA() const {
 
 /// Perform target specific adjustments to the latency of a schedule
 /// dependency.
-void HexagonSubtarget::adjustSchedDependency(SUnit *Src, int SrcOpIdx,
-                                             SUnit *Dst, int DstOpIdx,
-                                             SDep &Dep) const {
+void HexagonSubtarget::adjustSchedDependency(
+    SUnit *Src, int SrcOpIdx, SUnit *Dst, int DstOpIdx, SDep &Dep,
+    const TargetSchedModel *SchedModel) const {
   if (!Src->isInstr() || !Dst->isInstr())
     return;
 
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
index e56007ee21e75f..da3f8f56ccc4a7 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -308,7 +308,8 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
   /// Perform target specific adjustments to the latency of a schedule
   /// dependency.
   void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
-                             SDep &Dep) const override;
+                             SDep &Dep,
+                             const TargetSchedModel *SchedModel) const override;
 
   unsigned getVectorLength() const {
     assert(useHVXOps());
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
index aac43f7b960fed..1751f99b3e92c8 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
@@ -190,8 +190,8 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    uzp1 z24.d, z2.d, z3.d
 ; CHECK-NEXT:    uzp2 z25.d, z0.d, z1.d
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    uzp2 z2.d, z2.d, z3.d
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
 ; CHECK-NEXT:    uzp1 z26.d, z6.d, z7.d
 ; CHECK-NEXT:    fmul z1.d, z24.d, z25.d
@@ -199,12 +199,12 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale
 ; CHECK-NEXT:    uzp2 z25.d, z4.d, z5.d
 ; CHECK-NEXT:    uzp1 z4.d, z4.d, z5.d
 ; CHECK-NEXT:    uzp2 z5.d, z6.d, z7.d
-; CHECK-NEXT:    fmla z1.d, p0/m, z2.d, z0.d
 ; CHECK-NEXT:    fmla z3.d, p0/m, z26.d, z25.d
+; CHECK-NEXT:    fmla z1.d, p0/m, z2.d, z0.d
 ; CHECK-NEXT:    movprfx z2, z3
 ; CHECK-NEXT:    fmla z2.d, p0/m, z5.d, z4.d
-; CHECK-NEXT:    fnmls z2.d, p0/m, z24.d, z0.d
 ; CHECK-NEXT:    fmla z1.d, p0/m, z26.d, z4.d
+; CHECK-NEXT:    fnmls z2.d, p0/m, z24.d, z0.d
 ; CHECK-NEXT:    fmls z1.d, p0/m, z5.d, z25.d
 ; CHECK-NEXT:    zip1 z0.d, z2.d, z1.d
 ; CHECK-NEXT:    zip2 z1.d, z2.d, z1.d
diff --git a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
index f8cba4ce4b63bf..ab15bf564ec425 100644
--- a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
+++ b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
@@ -1145,8 +1145,8 @@ define <vscale x 4 x i64> @fshl_rot_illegal_i64(<vscale x 4 x i64> %a, <vscale x
 ; CHECK-NEXT:    subr z3.d, z3.d, #0 // =0x0
 ; CHECK-NEXT:    and z4.d, z4.d, #0x3f
 ; CHECK-NEXT:    and z2.d, z2.d, #0x3f
-; CHECK-NEXT:    and z3.d, z3.d, #0x3f
 ; CHECK-NEXT:    and z5.d, z5.d, #0x3f
+; CHECK-NEXT:    and z3.d, z3.d, #0x3f
 ; CHECK-NEXT:    lslr z4.d, p0/m, z4.d, z0.d
 ; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z2.d
 ; CHECK-NEXT:    movprfx z2, z1
diff --git a/llvm/test/CodeGen/AArch64/misched-bundle.mir b/llvm/test/CodeGen/AArch64/misched-bundle.mir
index a947c04a4229e6..9adcd2904a250a 100644
--- a/llvm/test/CodeGen/AArch64/misched-bundle.mir
+++ b/llvm/test/CodeGen/AArch64/misched-bundle.mir
@@ -46,9 +46,9 @@
 # CHECK-NEXT:   # rdefs left       : 0
 # CHECK-NEXT:   Latency            : 3
 # CHECK-NEXT:   Depth              : 0
-# CHECK-NEXT:   Height             : 0
+# CHECK-NEXT:   Height             : 7
 # CHECK-NEXT:   Successors:
-# CHECK-NEXT:     SU(7): Data Latency=0 Reg=$z3
+# CHECK-NEXT:     SU(7): Data Latency=3 Reg=$z3
 # CHECK-NEXT:     SU(9): Ord  Latency=0 Memory
 # CHECK-NEXT:     SU(8): Ord  Latency=0 Memory
 # CHECK-NEXT:   Single Issue       : false;
@@ -58,9 +58,9 @@
 # CHECK-NEXT:   # rdefs left       : 0
 # CHECK-NEXT:   Latency            : 3
 # CHECK-NEXT:   Depth              : 0
-# CHECK-NEXT:   Height             : 0
+# CHECK-NEXT:   Height             : 7
 # CHECK-NEXT:   Successors:
-# CHECK-NEXT:     SU(7): Data Latency=0 Reg=$z4
+# CHECK-NEXT:     SU(7): Data Latency=3 Reg=$z4
 # CHECK-NEXT:     SU(9): Ord  Latency=0 Memory
 # CHECK-NEXT:     SU(8): Ord  Latency=0 Memory
 # CHECK-NEXT:   Single Issue       : false;
@@ -70,9 +70,9 @@
 # CHECK-NEXT:   # rdefs left       : 0
 # CHECK-NEXT:   Latency            : 3
 # CHECK-NEXT:   Depth              : 0
-# CHECK-NEXT:   Height             : 0
+# CHECK-NEXT:   Height             : 7
 # CHECK-NEXT:   Successors:
-# CHECK-NEXT:     SU(7): Data Latency=0 Reg=$z5
+# CHECK-NEXT:     SU(7): Data Latency=3 Reg=$z5
 # CHECK-NEXT:     SU(9): Ord  Latency=0 Memory
 # CHECK-NEXT:     SU(8): Ord  Latency=0 Memory
 # CHECK-NEXT:   Single Issue       : false;
@@ -98,15 +98,15 @@
 # CHECK-NEXT:   # rdefs left       : 0
 # CHECK-NEXT:   Latency            : 1
 # CHECK-NEXT:   Depth              : 3
-# CHECK-NEXT:   Height             : 0
+# CHECK-NEXT:   Height             : 4
 # CHECK-NEXT:   Predecessors:
 # CHECK-NEXT:     SU(6): Anti Latency=0
-# CHECK-NEXT:     SU(5): Data Latency=0 Reg=$z5
-# CHECK-NEXT:     SU(4): Data Latency=0 Reg=$z4
-# CHECK-NEXT:     SU(3): Data Latency=0 Reg=$z3
+# CHECK-NEXT:     SU(5): Data Latency=3 Reg=$z5
+# CHECK-NEXT:     SU(4): Data Latency=3 Reg=$z4
+# CHECK-NEXT:     SU(3): Data Latency=3 Reg=$z3
 # CHECK-NEXT:     SU(1): Out  Latency=1
 # CHECK-NEXT:   Successors:
-# CHECK-NEXT:     SU(9): Data Latency=0 Reg=$z1
+# CHECK-NEXT:     SU(9): Data Latency=4 Reg=$z1
 # CHECK-NEXT:   Single Issue       : false;
 # CHECK-NEXT: SU(8):   ST1H killed renamable $z0, renamable $p0, renamable $x0, renamable $x10 :: (store unknown-size, align 1)
 # CHECK-NEXT:   # preds left       : 7
@@ -135,7 +135,7 @@
 # CHECK-NEXT:   Height             : 0
 # CHECK-NEXT:   Predecessors:
 # CHECK-NEXT:     SU(8): Ord  Latency=0 Memory
-# CHECK-NEXT:     SU(7): Data Latency=0 Reg=$z1
+# CHECK-NEXT:     SU(7): Data Latency=4 Reg=$z1
 # CHECK-NEXT:     SU(5): Ord  Latency=0 Memory
 # CHECK-NEXT:     SU(4): Ord  Latency=0 Memory
 # CHECK-NEXT:     SU(3): Ord  Latency=0 Memory
@@ -159,24 +159,24 @@ body:             |
   bb.0.entry:
     liveins: $p0, $x0, $x1, $x2, $x10, $x11, $x12, $x13
 
+
     ; CHECK-LABEL: name: test
     ; CHECK: liveins: $p0, $x0, $x1, $x2, $x10, $x11, $x12, $x13
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: renamable $z0 = LD1H renamable $p0, renamable $x1, renamable $x10 :: (load unknown-size, align 1)
     ; CHECK-NEXT: renamable $z1 = LD1H renamable $p0, renamable $x2, renamable $x10 :: (load unknown-size, align 1)
     ; CHECK-NEXT: renamable $z2 = LD1H renamable $p0, renamable $x0, renamable $x10 :: (load unknown-size, align 1)
-    ; CHECK-NEXT: $z0 = FMAD_ZPmZZ_H renamable $p0, killed $z0, renamable $z1, killed renamable $z2
     ; CHECK-NEXT: renamable $z3 = LD1H renamable $p0, renamable $x11, renamable $x10 :: (load unknown-size, align 1)
     ; CHECK-NEXT: renamable $z4 = LD1H renamable $p0, renamable $x12, renamable $x10 :: (load unknown-size, align 1)
     ; CHECK-NEXT: renamable $z5 = LD1H renamable $p0, renamable $x13, renamable $x10 :: (load unknown-size, align 1)
-    ; CHECK-NEXT: ST1H killed renamable $z0, renamable $p0, renamable $x0, renamable $x10 :: (store unknown-size, align 1)
-    ; CHECK-NEXT: BUNDLE implicit-def $z1, implicit-def $q1, implicit-def $d1, implicit-def $s1, implicit-def $h1, implicit-def $b1, implicit $z5, implicit $p0, implicit $z4, implicit $z3 {
+    ; CHECK-NEXT: $z0 = FMAD_ZPmZZ_H renamable $p0, killed $z0, killed renamable $z1, killed renamable $z2
+    ; CHECK-NEXT: BUNDLE implicit-def $z1, implicit-def $q1, implicit-def $d1, implicit-def $s1, implicit-def $h1, implicit-def $b1, implicit $z5, implicit $p0, implicit killed $z4, implicit killed $z3 {
     ; CHECK-NEXT:   $z1 = MOVPRFX_ZZ $z5
-    ; CHECK-NEXT:   $z1 = FMLA_ZPmZZ_H renamable $p0, internal $z1, renamable $z4, renamable $z3
+    ; CHECK-NEXT:   $z1 = FMLA_ZPmZZ_H renamable $p0, internal killed $z1, killed renamable $z4, killed renamable $z3
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: ST1H renamable $z1, renamable $p0, renamable $x13, renamable $x10 :: (store unknown-size, align 1)
+    ; CHECK-NEXT: ST1H killed renamable $z0, renamable $p0, renamable $x0, renamable $x10 :: (store unknown-size, align 1)
+    ; CHECK-NEXT: ST1H killed renamable $z1, renamable $p0, renamable $x13, renamable $x10 :: (store unknown-size, align 1)
     ; CHECK-NEXT: RET_ReallyLR
-
     renamable $z0 = LD1H renamable $p0, renamable $x1, renamable $x10 :: (load unknown-size)
     renamable $z1 = LD1H renamable $p0, renamable $x2, renamable $x10 :: (load unknown-size)
     renamable $z2 = LD1H renamable $p0, renamable $x0, renamable $x10 :: (load unknown-size)
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll
index 0e3307d1617298..ad482118ec0bbe 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll
@@ -43,7 +43,7 @@ define void @build_vector_minus2_dec32_v4i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: build_vector_minus2_dec32_v4i64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x8, #-32
+; VBITS_GE_256-NEXT:    mov x8, #-32 // =0xffffffffffffffe0
 ; VBITS_GE_256-NEXT:    index z0.d, #-2, x8
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
index 64c4eea0ee5f36..6fbae7edfec0a3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
@@ -57,8 +57,8 @@ define void @fadd_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    fadd z1.h, p0/m, z1.h, z3.h
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fadd_v32f16:
@@ -156,8 +156,8 @@ define void @fadd_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    fadd z1.s, p0/m, z1.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fadd_v16f32:
@@ -255,8 +255,8 @@ define void @fadd_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    fadd z1.d, p0/m, z1.d, z3.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fadd_v8f64:
@@ -662,8 +662,8 @@ define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; VBITS_GE_256-NEXT:    movprfx z1, z5
 ; VBITS_GE_256-NEXT:    fmla z1.h, p0/m, z3.h, z4.h
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fma_v32f16:
@@ -773,8 +773,8 @@ define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z5
 ; VBITS_GE_256-NEXT:    fmla z1.s, p0/m, z3.s, z4.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fma_v16f32:
@@ -883,8 +883,8 @@ define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z5
 ; VBITS_GE_256-NEXT:    fmla z1.d, p0/m, z3.d, z4.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fma_v8f64:
@@ -992,8 +992,8 @@ define void @fmul_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmul_v32f16:
@@ -1091,8 +1091,8 @@ define void @fmul_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmul_v16f32:
@@ -1190,8 +1190,8 @@ define void @fmul_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    fmul z1.d, p0/m, z1.d, z3.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmul_v8f64:
@@ -1829,8 +1829,8 @@ define void @fsub_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    fsub z1.h, p0/m, z1.h, z3.h
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fsub_v32f16:
@@ -1928,8 +1928,8 @@ define void @fsub_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    fsub z1.s, p0/m, z1.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fsub_v16f32:
@@ -2027,8 +2027,8 @@ define void @fsub_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    fsub z1.d, p0/m, z1.d, z3.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fsub_v8f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
index d3713b989301b2..e1ec5ee5f61371 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
@@ -66,8 +66,8 @@ define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; VBITS_GE_256-NEXT:    movprfx z1, z5
 ; VBITS_GE_256-NEXT:    fmla z1.h, p0/m, z3.h, z4.h
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fma_v32f16:
@@ -183,8 +183,8 @@ define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z5
 ; VBITS_GE_256-NEXT:    fmla z1.s, p0/m, z3.s, z4.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fma_v16f32:
@@ -299,8 +299,8 @@ define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z5
 ; VBITS_GE_256-NEXT:    fmla z1.d, p0/m, z3.d, z4.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fma_v8f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
index d20f083066a832..de60deeafaf328 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
@@ -57,8 +57,8 @@ define void @fmaxnm_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
 ; VBITS_EQ_256-NEXT:    movprfx z1, z2
 ; VBITS_EQ_256-NEXT:    fmaxnm z1.h, p0/m, z1.h, z3.h
-; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmaxnm_v32f16:
@@ -156,8 +156,8 @@ define void @fmaxnm_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
 ; VBITS_EQ_256-NEXT:    movprfx z1, z2
 ; VBITS_EQ_256-NEXT:    fmaxnm z1.s, p0/m, z1.s, z3.s
-; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmaxnm_v16f32:
@@ -255,8 +255,8 @@ define void @fmaxnm_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
 ; VBITS_EQ_256-NEXT:    movprfx z1, z2
 ; VBITS_EQ_256-NEXT:    fmaxnm z1.d, p0/m, z1.d, z3.d
-; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmaxnm_v8f64:
@@ -358,8 +358,8 @@ define void @fminnm_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
 ; VBITS_EQ_256-NEXT:    movprfx z1, z2
 ; VBITS_EQ_256-NEXT:    fminnm z1.h, p0/m, z1.h, z3.h
-; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fminnm_v32f16:
@@ -457,8 +457,8 @@ define void @fminnm_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
 ; VBITS_EQ_256-NEXT:    movprfx z1, z2
 ; VBITS_EQ_256-NEXT:    fminnm z1.s, p0/m, z1.s, z3.s
-; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fminnm_v16f32:
@@ -556,8 +556,8 @@ define void @fminnm_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
 ; VBITS_EQ_256-NEXT:    movprfx z1, z2
 ; VBITS_EQ_256-NEXT:    fminnm z1.d, p0/m, z1.d, z3.d
-; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fminnm_v8f64:
@@ -659,8 +659,8 @@ define void @fmax_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
 ; VBITS_EQ_256-NEXT:    movprfx z1, z2
 ; VBITS_EQ_256-NEXT:    fmax z1.h, p0/m, z1.h, z3.h
-; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmax_v32f16:
@@ -758,8 +758,8 @@ define void @fmax_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
 ; VBITS_EQ_256-NEXT:    movprfx z1, z2
 ; VBITS_EQ_256-NEXT:    fmax z1.s, p0/m, z1.s, z3.s
-; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmax_v16f32:
@@ -857,8 +857,8 @@ define void @fmax_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-NEXT:    fmax z0.d, p0/m, z0.d, z1.d
 ; VBITS_EQ_256-NEXT:    movprfx z1, z2
 ; VBITS_EQ_256-NEXT:    fmax z1.d, p0/m, z1.d, z3.d
-; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmax_v8f64:
@@ -960,8 +960,8 @@ define void @fmin_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
 ; VBITS_EQ_256-NEXT:    movprfx z1, z2
 ; VBITS_EQ_256-NEXT:    fmin z1.h, p0/m, z1.h, z3.h
-; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmin_v32f16:
@@ -1059,8 +1059,8 @@ define void @fmin_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
 ; VBITS_EQ_256-NEXT:    movprfx z1, z2
 ; VBITS_EQ_256-NEXT:    fmin z1.s, p0/m, z1.s, z3.s
-; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmin_v16f32:
@@ -1158,8 +1158,8 @@ define void @fmin_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-NEXT:    fmin z0.d, p0/m, z0.d, z1.d
 ; VBITS_EQ_256-NEXT:    movprfx z1, z2
 ; VBITS_EQ_256-NEXT:    fmin z1.d, p0/m, z1.d, z3.d
-; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmin_v8f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
index 8f2aeade48a0f8..b45a62f4e7581a 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
@@ -458,8 +458,8 @@ define void @mul_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mul z0.b, p0/m, z0.b, z1.b
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    mul z1.b, p0/m, z1.b, z3.b
-; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: mul_v64i8:
@@ -557,8 +557,8 @@ define void @mul_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mul z0.h, p0/m, z0.h, z1.h
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    mul z1.h, p0/m, z1.h, z3.h
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: mul_v32i16:
@@ -656,8 +656,8 @@ define void @mul_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mul z0.s, p0/m, z0.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    mul z1.s, p0/m, z1.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: mul_v16i32:
@@ -761,8 +761,8 @@ define void @mul_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    mul z1.d, p0/m, z1.d, z3.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: mul_v8i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll
index 296e4818b1cd7f..4926684ddc2de7 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll
@@ -57,8 +57,8 @@ define void @smax_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    smax z0.b, p0/m, z0.b, z1.b
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    smax z1.b, p0/m, z1.b, z3.b
-; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: smax_v64i8:
@@ -156,8 +156,8 @@ define void @smax_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    smax z0.h, p0/m, z0.h, z1.h
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    smax z1.h, p0/m, z1.h, z3.h
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: smax_v32i16:
@@ -255,8 +255,8 @@ define void @smax_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    smax z0.s, p0/m, z0.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    smax z1.s, p0/m, z1.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: smax_v16i32:
@@ -362,8 +362,8 @@ define void @smax_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    smax z0.d, p0/m, z0.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    smax z1.d, p0/m, z1.d, z3.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: smax_v8i64:
@@ -465,8 +465,8 @@ define void @smin_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    smin z0.b, p0/m, z0.b, z1.b
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    smin z1.b, p0/m, z1.b, z3.b
-; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: smin_v64i8:
@@ -564,8 +564,8 @@ define void @smin_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    smin z0.h, p0/m, z0.h, z1.h
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    smin z1.h, p0/m, z1.h, z3.h
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: smin_v32i16:
@@ -663,8 +663,8 @@ define void @smin_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    smin z0.s, p0/m, z0.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    smin z1.s, p0/m, z1.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: smin_v16i32:
@@ -770,8 +770,8 @@ define void @smin_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    smin z0.d, p0/m, z0.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    smin z1.d, p0/m, z1.d, z3.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: smin_v8i64:
@@ -873,8 +873,8 @@ define void @umax_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    umax z0.b, p0/m, z0.b, z1.b
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    umax z1.b, p0/m, z1.b, z3.b
-; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: umax_v64i8:
@@ -972,8 +972,8 @@ define void @umax_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    umax z0.h, p0/m, z0.h, z1.h
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    umax z1.h, p0/m, z1.h, z3.h
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: umax_v32i16:
@@ -1071,8 +1071,8 @@ define void @umax_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    umax z0.s, p0/m, z0.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    umax z1.s, p0/m, z1.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: umax_v16i32:
@@ -1178,8 +1178,8 @@ define void @umax_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    umax z0.d, p0/m, z0.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    umax z1.d, p0/m, z1.d, z3.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: umax_v8i64:
@@ -1281,8 +1281,8 @@ define void @umin_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    umin z0.b, p0/m, z0.b, z1.b
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    umin z1.b, p0/m, z1.b, z3.b
-; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: umin_v64i8:
@@ -1380,8 +1380,8 @@ define void @umin_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    umin z0.h, p0/m, z0.h, z1.h
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    umin z1.h, p0/m, z1.h, z3.h
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: umin_v32i16:
@@ -1479,8 +1479,8 @@ define void @umin_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    umin z0.s, p0/m, z0.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    umin z1.s, p0/m, z1.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: umin_v16i32:
@@ -1586,8 +1586,8 @@ define void @umin_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    umin z0.d, p0/m, z0.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    umin z1.d, p0/m, z1.d, z3.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: umin_v8i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
index 995da2be0cc170..331aab17a4cea7 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
@@ -82,8 +82,8 @@ define void @smulh_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    smulh z1.b, p0/m, z1.b, z3.b
-; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: smulh_v64i8:
@@ -215,8 +215,8 @@ define void @smulh_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    smulh z1.h, p0/m, z1.h, z3.h
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: smulh_v32i16:
@@ -346,8 +346,8 @@ define void @smulh_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    smulh z1.s, p0/m, z1.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: smulh_v16i32:
@@ -479,8 +479,8 @@ define void @smulh_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    smulh z1.d, p0/m, z1.d, z3.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: smulh_v8i64:
@@ -615,8 +615,8 @@ define void @umulh_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    umulh z1.b, p0/m, z1.b, z3.b
-; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: umulh_v64i8:
@@ -749,8 +749,8 @@ define void @umulh_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    umulh z1.h, p0/m, z1.h, z3.h
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: umulh_v32i16:
@@ -882,8 +882,8 @@ define void @umulh_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    umulh z1.s, p0/m, z1.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: umulh_v16i32:
@@ -1013,8 +1013,8 @@ define void @umulh_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    umulh z1.d, p0/m, z1.d, z3.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: umulh_v8i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
index 45960617f1d935..38444d83c1d7e4 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
@@ -533,9 +533,9 @@ define void @srem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: srem_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    mls v0.2s, v2.2s, v1.2s
@@ -549,9 +549,9 @@ define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #
 define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: srem_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    mls v0.4s, v2.4s, v1.4s
@@ -582,25 +582,25 @@ define void @srem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @srem_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: srem_v16i32:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
 ; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
-; VBITS_GE_128-NEXT:    movprfx z4, z1
-; VBITS_GE_128-NEXT:    sdiv z4.s, p0/m, z4.s, z0.s
 ; VBITS_GE_128-NEXT:    ldp q16, q5, [x0, #32]
-; VBITS_GE_128-NEXT:    movprfx z19, z2
-; VBITS_GE_128-NEXT:    sdiv z19.s, p0/m, z19.s, z3.s
 ; VBITS_GE_128-NEXT:    ldp q17, q6, [x1, #32]
+; VBITS_GE_128-NEXT:    movprfx z4, z1
+; VBITS_GE_128-NEXT:    sdiv z4.s, p0/m, z4.s, z0.s
 ; VBITS_GE_128-NEXT:    movprfx z7, z5
 ; VBITS_GE_128-NEXT:    sdiv z7.s, p0/m, z7.s, z6.s
 ; VBITS_GE_128-NEXT:    movprfx z18, z16
 ; VBITS_GE_128-NEXT:    sdiv z18.s, p0/m, z18.s, z17.s
-; VBITS_GE_128-NEXT:    mls v1.4s, v4.4s, v0.4s
-; VBITS_GE_128-NEXT:    mls v2.4s, v19.4s, v3.4s
+; VBITS_GE_128-NEXT:    movprfx z19, z2
+; VBITS_GE_128-NEXT:    sdiv z19.s, p0/m, z19.s, z3.s
 ; VBITS_GE_128-NEXT:    mls v16.4s, v18.4s, v17.4s
 ; VBITS_GE_128-NEXT:    mls v5.4s, v7.4s, v6.4s
-; VBITS_GE_128-NEXT:    stp q1, q2, [x0]
+; VBITS_GE_128-NEXT:    mls v1.4s, v4.4s, v0.4s
+; VBITS_GE_128-NEXT:    mls v2.4s, v19.4s, v3.4s
 ; VBITS_GE_128-NEXT:    stp q16, q5, [x0, #32]
+; VBITS_GE_128-NEXT:    stp q1, q2, [x0]
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: srem_v16i32:
@@ -609,17 +609,17 @@ define void @srem_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    movprfx z2, z0
-; VBITS_GE_256-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
 ; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    movprfx z2, z0
+; VBITS_GE_256-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z5, z3
 ; VBITS_GE_256-NEXT:    sdiv z5.s, p0/m, z5.s, z4.s
 ; VBITS_GE_256-NEXT:    mls z0.s, p0/m, z2.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z3
 ; VBITS_GE_256-NEXT:    mls z1.s, p0/m, z5.s, z4.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: srem_v16i32:
@@ -681,8 +681,8 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #
 ; CHECK-LABEL: srem_v1i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
@@ -698,8 +698,8 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #
 ; CHECK-LABEL: srem_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
@@ -733,16 +733,16 @@ define void @srem_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
 ; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
 ; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
-; VBITS_GE_128-NEXT:    movprfx z4, z1
-; VBITS_GE_128-NEXT:    sdiv z4.d, p0/m, z4.d, z0.d
 ; VBITS_GE_128-NEXT:    ldp q16, q5, [x0, #32]
-; VBITS_GE_128-NEXT:    movprfx z19, z2
-; VBITS_GE_128-NEXT:    sdiv z19.d, p0/m, z19.d, z3.d
 ; VBITS_GE_128-NEXT:    ldp q17, q6, [x1, #32]
+; VBITS_GE_128-NEXT:    movprfx z4, z1
+; VBITS_GE_128-NEXT:    sdiv z4.d, p0/m, z4.d, z0.d
 ; VBITS_GE_128-NEXT:    movprfx z7, z5
 ; VBITS_GE_128-NEXT:    sdiv z7.d, p0/m, z7.d, z6.d
 ; VBITS_GE_128-NEXT:    movprfx z18, z16
 ; VBITS_GE_128-NEXT:    sdiv z18.d, p0/m, z18.d, z17.d
+; VBITS_GE_128-NEXT:    movprfx z19, z2
+; VBITS_GE_128-NEXT:    sdiv z19.d, p0/m, z19.d, z3.d
 ; VBITS_GE_128-NEXT:    mls z16.d, p0/m, z18.d, z17.d
 ; VBITS_GE_128-NEXT:    mls z5.d, p0/m, z7.d, z6.d
 ; VBITS_GE_128-NEXT:    msb z0.d, p0/m, z4.d, z1.d
@@ -758,17 +758,17 @@ define void @srem_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    movprfx z2, z0
-; VBITS_GE_256-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
 ; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    movprfx z2, z0
+; VBITS_GE_256-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z5, z3
 ; VBITS_GE_256-NEXT:    sdiv z5.d, p0/m, z5.d, z4.d
 ; VBITS_GE_256-NEXT:    mls z0.d, p0/m, z2.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z3
 ; VBITS_GE_256-NEXT:    mls z1.d, p0/m, z5.d, z4.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: srem_v8i64:
@@ -1351,9 +1351,9 @@ define void @urem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: urem_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    mls v0.2s, v2.2s, v1.2s
@@ -1367,9 +1367,9 @@ define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #
 define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: urem_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    mls v0.4s, v2.4s, v1.4s
@@ -1400,25 +1400,25 @@ define void @urem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @urem_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: urem_v16i32:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
 ; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
-; VBITS_GE_128-NEXT:    movprfx z4, z1
-; VBITS_GE_128-NEXT:    udiv z4.s, p0/m, z4.s, z0.s
 ; VBITS_GE_128-NEXT:    ldp q16, q5, [x0, #32]
-; VBITS_GE_128-NEXT:    movprfx z19, z2
-; VBITS_GE_128-NEXT:    udiv z19.s, p0/m, z19.s, z3.s
 ; VBITS_GE_128-NEXT:    ldp q17, q6, [x1, #32]
+; VBITS_GE_128-NEXT:    movprfx z4, z1
+; VBITS_GE_128-NEXT:    udiv z4.s, p0/m, z4.s, z0.s
 ; VBITS_GE_128-NEXT:    movprfx z7, z5
 ; VBITS_GE_128-NEXT:    udiv z7.s, p0/m, z7.s, z6.s
 ; VBITS_GE_128-NEXT:    movprfx z18, z16
 ; VBITS_GE_128-NEXT:    udiv z18.s, p0/m, z18.s, z17.s
-; VBITS_GE_128-NEXT:    mls v1.4s, v4.4s, v0.4s
-; VBITS_GE_128-NEXT:    mls v2.4s, v19.4s, v3.4s
+; VBITS_GE_128-NEXT:    movprfx z19, z2
+; VBITS_GE_128-NEXT:    udiv z19.s, p0/m, z19.s, z3.s
 ; VBITS_GE_128-NEXT:    mls v16.4s, v18.4s, v17.4s
 ; VBITS_GE_128-NEXT:    mls v5.4s, v7.4s, v6.4s
-; VBITS_GE_128-NEXT:    stp q1, q2, [x0]
+; VBITS_GE_128-NEXT:    mls v1.4s, v4.4s, v0.4s
+; VBITS_GE_128-NEXT:    mls v2.4s, v19.4s, v3.4s
 ; VBITS_GE_128-NEXT:    stp q16, q5, [x0, #32]
+; VBITS_GE_128-NEXT:    stp q1, q2, [x0]
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: urem_v16i32:
@@ -1427,17 +1427,17 @@ define void @urem_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    movprfx z2, z0
-; VBITS_GE_256-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
 ; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    movprfx z2, z0
+; VBITS_GE_256-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z5, z3
 ; VBITS_GE_256-NEXT:    udiv z5.s, p0/m, z5.s, z4.s
 ; VBITS_GE_256-NEXT:    mls z0.s, p0/m, z2.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z3
 ; VBITS_GE_256-NEXT:    mls z1.s, p0/m, z5.s, z4.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: urem_v16i32:
@@ -1499,8 +1499,8 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #
 ; CHECK-LABEL: urem_v1i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
@@ -1516,8 +1516,8 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #
 ; CHECK-LABEL: urem_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
@@ -1551,16 +1551,16 @@ define void @urem_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
 ; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
 ; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
-; VBITS_GE_128-NEXT:    movprfx z4, z1
-; VBITS_GE_128-NEXT:    udiv z4.d, p0/m, z4.d, z0.d
 ; VBITS_GE_128-NEXT:    ldp q16, q5, [x0, #32]
-; VBITS_GE_128-NEXT:    movprfx z19, z2
-; VBITS_GE_128-NEXT:    udiv z19.d, p0/m, z19.d, z3.d
 ; VBITS_GE_128-NEXT:    ldp q17, q6, [x1, #32]
+; VBITS_GE_128-NEXT:    movprfx z4, z1
+; VBITS_GE_128-NEXT:    udiv z4.d, p0/m, z4.d, z0.d
 ; VBITS_GE_128-NEXT:    movprfx z7, z5
 ; VBITS_GE_128-NEXT:    udiv z7.d, p0/m, z7.d, z6.d
 ; VBITS_GE_128-NEXT:    movprfx z18, z16
 ; VBITS_GE_128-NEXT:    udiv z18.d, p0/m, z18.d, z17.d
+; VBITS_GE_128-NEXT:    movprfx z19, z2
+; VBITS_GE_128-NEXT:    udiv z19.d, p0/m, z19.d, z3.d
 ; VBITS_GE_128-NEXT:    mls z16.d, p0/m, z18.d, z17.d
 ; VBITS_GE_128-NEXT:    mls z5.d, p0/m, z7.d, z6.d
 ; VBITS_GE_128-NEXT:    msb z0.d, p0/m, z4.d, z1.d
@@ -1576,17 +1576,17 @@ define void @urem_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    movprfx z2, z0
-; VBITS_GE_256-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
 ; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    movprfx z2, z0
+; VBITS_GE_256-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z5, z3
 ; VBITS_GE_256-NEXT:    udiv z5.d, p0/m, z5.d, z4.d
 ; VBITS_GE_256-NEXT:    mls z0.d, p0/m, z2.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z3
 ; VBITS_GE_256-NEXT:    mls z1.d, p0/m, z5.d, z4.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: urem_v8i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
index 7184cc9705973d..0fa8c8f50e29cd 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
@@ -59,8 +59,8 @@ define void @ashr_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    asr z0.b, p0/m, z0.b, z1.b
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    asr z1.b, p0/m, z1.b, z3.b
-; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: ashr_v64i8:
@@ -160,8 +160,8 @@ define void @ashr_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    asr z0.h, p0/m, z0.h, z1.h
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    asr z1.h, p0/m, z1.h, z3.h
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: ashr_v32i16:
@@ -261,8 +261,8 @@ define void @ashr_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    asr z0.s, p0/m, z0.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    asr z1.s, p0/m, z1.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: ashr_v16i32:
@@ -362,8 +362,8 @@ define void @ashr_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    asr z0.d, p0/m, z0.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    asr z1.d, p0/m, z1.d, z3.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: ashr_v8i64:
@@ -467,8 +467,8 @@ define void @lshr_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    lsr z1.b, p0/m, z1.b, z3.b
-; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: lshr_v64i8:
@@ -568,8 +568,8 @@ define void @lshr_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    lsr z1.h, p0/m, z1.h, z3.h
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: lshr_v32i16:
@@ -669,8 +669,8 @@ define void @lshr_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    lsr z1.s, p0/m, z1.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: lshr_v16i32:
@@ -770,8 +770,8 @@ define void @lshr_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    lsr z1.d, p0/m, z1.d, z3.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: lshr_v8i64:
@@ -873,8 +873,8 @@ define void @shl_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    lsl z1.b, p0/m, z1.b, z3.b
-; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: shl_v64i8:
@@ -972,8 +972,8 @@ define void @shl_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    lsl z1.h, p0/m, z1.h, z3.h
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: shl_v32i16:
@@ -1071,8 +1071,8 @@ define void @shl_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    lsl z1.s, p0/m, z1.s, z3.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: shl_v16i32:
@@ -1170,8 +1170,8 @@ define void @shl_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
 ; VBITS_GE_256-NEXT:    movprfx z1, z2
 ; VBITS_GE_256-NEXT:    lsl z1.d, p0/m, z1.d, z3.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: shl_v8i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fpext-load.ll b/llvm/test/CodeGen/AArch64/sve-fpext-load.ll
index 78decf5ed1e3c7..aec2150124c3fc 100644
--- a/llvm/test/CodeGen/AArch64/sve-fpext-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fpext-load.ll
@@ -46,13 +46,13 @@ define <vscale x 8 x double> @ext8_f16_f64(ptr %ptr, i64 %index) {
 ; CHECK-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEXT:    uunpklo z3.d, z0.s
 ; CHECK-NEXT:    uunpkhi z4.d, z0.s
+; CHECK-NEXT:    fcvt z1.d, p0/m, z1.h
 ; CHECK-NEXT:    movprfx z0, z2
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z2.h
 ; CHECK-NEXT:    movprfx z2, z3
 ; CHECK-NEXT:    fcvt z2.d, p0/m, z3.h
 ; CHECK-NEXT:    movprfx z3, z4
 ; CHECK-NEXT:    fcvt z3.d, p0/m, z4.h
-; CHECK-NEXT:    fcvt z1.d, p0/m, z1.h
 ; CHECK-NEXT:    ret
   %load = load <vscale x 8 x half>, ptr %ptr, align 4
   %load.ext = fpext <vscale x 8 x half> %load to <vscale x 8 x double>
diff --git a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
index 43113df07c079f..813f1601e809e6 100644
--- a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
@@ -74,12 +74,12 @@ define <vscale x 8 x i32> @test_signed_v8f32_v8i32(<vscale x 8 x float> %f) {
 ; CHECK-NEXT:    mov w8, #1325400063 // =0x4effffff
 ; CHECK-NEXT:    mov z6.s, #0x7fffffff
 ; CHECK-NEXT:    mov z4.s, w8
-; CHECK-NEXT:    movprfx z5, z1
-; CHECK-NEXT:    fcvtzs z5.s, p0/m, z1.s
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z2.s
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z1.s, z2.s
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    fcvtzs z2.s, p0/m, z0.s
+; CHECK-NEXT:    movprfx z5, z1
+; CHECK-NEXT:    fcvtzs z5.s, p0/m, z1.s
 ; CHECK-NEXT:    fcmgt p3.s, p0/z, z0.s, z4.s
 ; CHECK-NEXT:    fcmgt p4.s, p0/z, z1.s, z4.s
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
@@ -134,17 +134,17 @@ define <vscale x 8 x i16> @test_signed_v8f32_v8i16(<vscale x 8 x float> %f) {
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, #-956301312 // =0xc7000000
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzs z4.s, p0/m, z0.s
+; CHECK-NEXT:    mov z5.s, #32767 // =0x7fff
 ; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    mov w8, #65024 // =0xfe00
-; CHECK-NEXT:    mov z5.s, #32767 // =0x7fff
 ; CHECK-NEXT:    movk w8, #18175, lsl #16
 ; CHECK-NEXT:    mov z3.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z1.s, z2.s
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z2.s
 ; CHECK-NEXT:    movprfx z2, z1
 ; CHECK-NEXT:    fcvtzs z2.s, p0/m, z1.s
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    fcvtzs z4.s, p0/m, z0.s
 ; CHECK-NEXT:    fcmgt p3.s, p0/z, z1.s, z3.s
 ; CHECK-NEXT:    fcmgt p4.s, p0/z, z0.s, z3.s
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
@@ -206,13 +206,13 @@ define <vscale x 4 x i64> @test_signed_v4f32_v4i64(<vscale x 4 x float> %f) {
 ; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
 ; CHECK-NEXT:    mov z3.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z4.s, w8
-; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.s
 ; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z1.s, z2.s
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z2.s
 ; CHECK-NEXT:    movprfx z2, z1
 ; CHECK-NEXT:    fcvtzs z2.d, p0/m, z1.s
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.s
 ; CHECK-NEXT:    fcmgt p3.s, p0/z, z1.s, z4.s
 ; CHECK-NEXT:    fcmgt p4.s, p0/z, z0.s, z4.s
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
@@ -283,13 +283,13 @@ define <vscale x 4 x i32> @test_signed_v4f64_v4i32(<vscale x 4 x double> %f) {
 ; CHECK-NEXT:    mov x8, #281474972516352 // =0xffffffc00000
 ; CHECK-NEXT:    mov z6.d, #0x7fffffff
 ; CHECK-NEXT:    movk x8, #16863, lsl #48
-; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.d
 ; CHECK-NEXT:    mov z4.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, z2.d
 ; CHECK-NEXT:    movprfx z2, z1
 ; CHECK-NEXT:    fcvtzs z2.d, p0/m, z1.d
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.d
 ; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z4.d
 ; CHECK-NEXT:    fcmgt p4.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
@@ -324,24 +324,24 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #-4476578029606273024 // =0xc1e0000000000000
-; CHECK-NEXT:    movprfx z6, z1
-; CHECK-NEXT:    fcvtzs z6.d, p0/m, z1.d
+; CHECK-NEXT:    mov z26.d, #0x7fffffff
 ; CHECK-NEXT:    mov z4.d, x8
 ; CHECK-NEXT:    mov x8, #281474972516352 // =0xffffffc00000
-; CHECK-NEXT:    movprfx z7, z0
-; CHECK-NEXT:    fcvtzs z7.d, p0/m, z0.d
 ; CHECK-NEXT:    movk x8, #16863, lsl #48
-; CHECK-NEXT:    mov z26.d, #0x7fffffff
-; CHECK-NEXT:    movprfx z24, z3
-; CHECK-NEXT:    fcvtzs z24.d, p0/m, z3.d
 ; CHECK-NEXT:    mov z5.d, x8
-; CHECK-NEXT:    movprfx z25, z2
-; CHECK-NEXT:    fcvtzs z25.d, p0/m, z2.d
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, z4.d
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    fcmge p3.d, p0/z, z3.d, z4.d
 ; CHECK-NEXT:    fcmge p4.d, p0/z, z2.d, z4.d
 ; CHECK-NEXT:    mov z4.d, #0xffffffff80000000
+; CHECK-NEXT:    movprfx z6, z1
+; CHECK-NEXT:    fcvtzs z6.d, p0/m, z1.d
+; CHECK-NEXT:    movprfx z7, z0
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z0.d
+; CHECK-NEXT:    movprfx z24, z3
+; CHECK-NEXT:    fcvtzs z24.d, p0/m, z3.d
+; CHECK-NEXT:    movprfx z25, z2
+; CHECK-NEXT:    fcvtzs z25.d, p0/m, z2.d
 ; CHECK-NEXT:    fcmgt p5.d, p0/z, z1.d, z5.d
 ; CHECK-NEXT:    fcmgt p6.d, p0/z, z0.d, z5.d
 ; CHECK-NEXT:    fcmgt p7.d, p0/z, z3.d, z5.d
@@ -389,17 +389,17 @@ define <vscale x 4 x i16> @test_signed_v4f64_v4i16(<vscale x 4 x double> %f) {
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #-4548635623644200960 // =0xc0e0000000000000
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z0.d
+; CHECK-NEXT:    mov z5.d, #32767 // =0x7fff
 ; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    mov x8, #281200098803712 // =0xffc000000000
-; CHECK-NEXT:    mov z5.d, #32767 // =0x7fff
 ; CHECK-NEXT:    movk x8, #16607, lsl #48
 ; CHECK-NEXT:    mov z3.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, z2.d
 ; CHECK-NEXT:    movprfx z2, z1
 ; CHECK-NEXT:    fcvtzs z2.d, p0/m, z1.d
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z0.d
 ; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z3.d
 ; CHECK-NEXT:    fcmgt p4.d, p0/z, z0.d, z3.d
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
@@ -434,23 +434,23 @@ define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #-4548635623644200960 // =0xc0e0000000000000
+; CHECK-NEXT:    mov z25.d, #32767 // =0x7fff
+; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    mov x8, #281200098803712 // =0xffc000000000
+; CHECK-NEXT:    movk x8, #16607, lsl #48
+; CHECK-NEXT:    fcmge p1.d, p0/z, z3.d, z4.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z2.d, z4.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z1.d, z4.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    movprfx z5, z3
 ; CHECK-NEXT:    fcvtzs z5.d, p0/m, z3.d
 ; CHECK-NEXT:    mov z4.d, x8
-; CHECK-NEXT:    mov x8, #281200098803712 // =0xffc000000000
 ; CHECK-NEXT:    movprfx z6, z2
 ; CHECK-NEXT:    fcvtzs z6.d, p0/m, z2.d
-; CHECK-NEXT:    movk x8, #16607, lsl #48
 ; CHECK-NEXT:    movprfx z7, z1
 ; CHECK-NEXT:    fcvtzs z7.d, p0/m, z1.d
 ; CHECK-NEXT:    movprfx z24, z0
 ; CHECK-NEXT:    fcvtzs z24.d, p0/m, z0.d
-; CHECK-NEXT:    mov z25.d, #32767 // =0x7fff
-; CHECK-NEXT:    fcmge p1.d, p0/z, z3.d, z4.d
-; CHECK-NEXT:    fcmge p2.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    fcmge p3.d, p0/z, z1.d, z4.d
-; CHECK-NEXT:    fcmge p4.d, p0/z, z0.d, z4.d
-; CHECK-NEXT:    mov z4.d, x8
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p5.d, p0/z, z3.d, z4.d
 ; CHECK-NEXT:    fcmgt p6.d, p0/z, z2.d, z4.d
@@ -528,12 +528,12 @@ define <vscale x 4 x i64> @test_signed_v4f64_v4i64(<vscale x 4 x double> %f) {
 ; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
 ; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    mov z4.d, x8
-; CHECK-NEXT:    movprfx z5, z1
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z1.d
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z2.d
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    fcvtzs z2.d, p0/m, z0.d
+; CHECK-NEXT:    movprfx z5, z1
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z1.d
 ; CHECK-NEXT:    fcmgt p3.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    fcmgt p4.d, p0/z, z1.d, z4.d
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
@@ -629,13 +629,13 @@ define <vscale x 8 x i32> @test_signed_v8f16_v8i32(<vscale x 8 x half> %f) {
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    mov z3.s, #0x80000000
 ; CHECK-NEXT:    mov z4.h, w8
-; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    fcvtzs z5.s, p0/m, z0.h
 ; CHECK-NEXT:    mov z6.s, #0x7fffffff
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z1.h, z2.h
 ; CHECK-NEXT:    fcmge p2.h, p0/z, z0.h, z2.h
 ; CHECK-NEXT:    movprfx z2, z1
 ; CHECK-NEXT:    fcvtzs z2.s, p0/m, z1.h
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    fcvtzs z5.s, p0/m, z0.h
 ; CHECK-NEXT:    fcmgt p3.h, p0/z, z1.h, z4.h
 ; CHECK-NEXT:    fcmgt p4.h, p0/z, z0.h, z4.h
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
@@ -742,13 +742,13 @@ define <vscale x 4 x i64> @test_signed_v4f16_v4i64(<vscale x 4 x half> %f) {
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    mov z3.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z4.h, w8
-; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.h
 ; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z1.h, z2.h
 ; CHECK-NEXT:    fcmge p2.h, p0/z, z0.h, z2.h
 ; CHECK-NEXT:    movprfx z2, z1
 ; CHECK-NEXT:    fcvtzs z2.d, p0/m, z1.h
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.h
 ; CHECK-NEXT:    fcmgt p3.h, p0/z, z1.h, z4.h
 ; CHECK-NEXT:    fcmgt p4.h, p0/z, z0.h, z4.h
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
diff --git a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll
index c9e06fd9f3414e..c56c0b37888dc0 100644
--- a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll
@@ -16,10 +16,10 @@ define <vscale x 2 x i32> @test_signed_v2f32_v2i32(<vscale x 2 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #1333788671 // =0x4f7fffff
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    fcvtzu z2.d, p0/m, z0.s
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    fcvtzu z2.d, p0/m, z0.s
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z1.s
 ; CHECK-NEXT:    mov z0.d, #0xffffffff
@@ -35,10 +35,10 @@ define <vscale x 4 x i32> @test_signed_v4f32_v4i32(<vscale x 4 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, #1333788671 // =0x4f7fffff
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzu z1.s, p0/m, z0.s
 ; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzu z1.s, p0/m, z0.s
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z2.s
 ; CHECK-NEXT:    mov z1.s, p1/m, #0 // =0x0
@@ -54,13 +54,13 @@ define <vscale x 8 x i32> @test_signed_v8f32_v8i32(<vscale x 8 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, #1333788671 // =0x4f7fffff
+; CHECK-NEXT:    mov z4.s, w8
+; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    fcmge p2.s, p0/z, z1.s, #0.0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    fcvtzu z2.s, p0/m, z0.s
-; CHECK-NEXT:    mov z4.s, w8
 ; CHECK-NEXT:    movprfx z3, z1
 ; CHECK-NEXT:    fcvtzu z3.s, p0/m, z1.s
-; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    fcmge p2.s, p0/z, z1.s, #0.0
 ; CHECK-NEXT:    fcmgt p3.s, p0/z, z0.s, z4.s
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
@@ -81,11 +81,11 @@ define <vscale x 4 x i16> @test_signed_v4f32_v4i16(<vscale x 4 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, #65280 // =0xff00
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    fcvtzu z2.s, p0/m, z0.s
 ; CHECK-NEXT:    movk w8, #18303, lsl #16
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    fcvtzu z2.s, p0/m, z0.s
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z1.s
 ; CHECK-NEXT:    mov z0.s, #65535 // =0xffff
@@ -101,14 +101,14 @@ define <vscale x 8 x i16> @test_signed_v8f32_v8i16(<vscale x 8 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, #65280 // =0xff00
-; CHECK-NEXT:    movprfx z3, z1
-; CHECK-NEXT:    fcvtzu z3.s, p0/m, z1.s
 ; CHECK-NEXT:    movk w8, #18303, lsl #16
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzu z4.s, p0/m, z0.s
 ; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z1.s, #0.0
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    fcvtzu z3.s, p0/m, z1.s
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    fcvtzu z4.s, p0/m, z0.s
 ; CHECK-NEXT:    fcmgt p3.s, p0/z, z1.s, z2.s
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
@@ -129,10 +129,10 @@ define <vscale x 2 x i64> @test_signed_v2f32_v2i64(<vscale x 2 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #1602224127 // =0x5f7fffff
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzu z1.d, p0/m, z0.s
 ; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzu z1.d, p0/m, z0.s
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z2.s
 ; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
@@ -150,13 +150,13 @@ define <vscale x 4 x i64> @test_signed_v4f32_v4i64(<vscale x 4 x float> %f) {
 ; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    uunpkhi z3.d, z0.s
 ; CHECK-NEXT:    mov w8, #1602224127 // =0x5f7fffff
+; CHECK-NEXT:    mov z4.s, w8
+; CHECK-NEXT:    fcmge p1.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    fcmge p2.s, p0/z, z3.s, #0.0
 ; CHECK-NEXT:    movprfx z0, z2
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z2.s
 ; CHECK-NEXT:    movprfx z1, z3
 ; CHECK-NEXT:    fcvtzu z1.d, p0/m, z3.s
-; CHECK-NEXT:    mov z4.s, w8
-; CHECK-NEXT:    fcmge p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    fcmge p2.s, p0/z, z3.s, #0.0
 ; CHECK-NEXT:    fcmgt p3.s, p0/z, z2.s, z4.s
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
@@ -185,11 +185,11 @@ define <vscale x 2 x i32> @test_signed_v2f64_v2i32(<vscale x 2 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #281474974613504 // =0xffffffe00000
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    fcvtzu z2.d, p0/m, z0.d
 ; CHECK-NEXT:    movk x8, #16879, lsl #48
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    fcvtzu z2.d, p0/m, z0.d
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z1.d
 ; CHECK-NEXT:    mov z0.d, #0xffffffff
@@ -205,14 +205,14 @@ define <vscale x 4 x i32> @test_signed_v4f64_v4i32(<vscale x 4 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #281474974613504 // =0xffffffe00000
-; CHECK-NEXT:    movprfx z3, z1
-; CHECK-NEXT:    fcvtzu z3.d, p0/m, z1.d
 ; CHECK-NEXT:    movk x8, #16879, lsl #48
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzu z4.d, p0/m, z0.d
 ; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, #0.0
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    fcvtzu z3.d, p0/m, z1.d
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    fcvtzu z4.d, p0/m, z0.d
 ; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
@@ -240,20 +240,20 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #281474974613504 // =0xffffffe00000
+; CHECK-NEXT:    movk x8, #16879, lsl #48
+; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, #0.0
+; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    fcmge p3.d, p0/z, z3.d, #0.0
+; CHECK-NEXT:    fcmge p4.d, p0/z, z2.d, #0.0
 ; CHECK-NEXT:    movprfx z5, z1
 ; CHECK-NEXT:    fcvtzu z5.d, p0/m, z1.d
-; CHECK-NEXT:    movk x8, #16879, lsl #48
 ; CHECK-NEXT:    movprfx z6, z0
 ; CHECK-NEXT:    fcvtzu z6.d, p0/m, z0.d
 ; CHECK-NEXT:    movprfx z7, z3
 ; CHECK-NEXT:    fcvtzu z7.d, p0/m, z3.d
-; CHECK-NEXT:    mov z4.d, x8
 ; CHECK-NEXT:    movprfx z24, z2
 ; CHECK-NEXT:    fcvtzu z24.d, p0/m, z2.d
-; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, #0.0
-; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    fcmge p3.d, p0/z, z3.d, #0.0
-; CHECK-NEXT:    fcmge p4.d, p0/z, z2.d, #0.0
 ; CHECK-NEXT:    fcmgt p5.d, p0/z, z1.d, z4.d
 ; CHECK-NEXT:    fcmgt p6.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    mov z0.d, #0xffffffff
@@ -288,14 +288,14 @@ define <vscale x 4 x i16> @test_signed_v4f64_v4i16(<vscale x 4 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #281337537757184 // =0xffe000000000
-; CHECK-NEXT:    movprfx z3, z1
-; CHECK-NEXT:    fcvtzu z3.d, p0/m, z1.d
 ; CHECK-NEXT:    movk x8, #16623, lsl #48
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzu z4.d, p0/m, z0.d
 ; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, #0.0
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    fcvtzu z3.d, p0/m, z1.d
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    fcvtzu z4.d, p0/m, z0.d
 ; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
@@ -323,20 +323,20 @@ define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #281337537757184 // =0xffe000000000
+; CHECK-NEXT:    movk x8, #16623, lsl #48
+; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    fcmge p1.d, p0/z, z3.d, #0.0
+; CHECK-NEXT:    fcmge p2.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    fcmge p3.d, p0/z, z1.d, #0.0
+; CHECK-NEXT:    fcmge p4.d, p0/z, z0.d, #0.0
 ; CHECK-NEXT:    movprfx z5, z3
 ; CHECK-NEXT:    fcvtzu z5.d, p0/m, z3.d
-; CHECK-NEXT:    movk x8, #16623, lsl #48
 ; CHECK-NEXT:    movprfx z6, z2
 ; CHECK-NEXT:    fcvtzu z6.d, p0/m, z2.d
 ; CHECK-NEXT:    movprfx z7, z1
 ; CHECK-NEXT:    fcvtzu z7.d, p0/m, z1.d
-; CHECK-NEXT:    mov z4.d, x8
 ; CHECK-NEXT:    movprfx z24, z0
 ; CHECK-NEXT:    fcvtzu z24.d, p0/m, z0.d
-; CHECK-NEXT:    fcmge p1.d, p0/z, z3.d, #0.0
-; CHECK-NEXT:    fcmge p2.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    fcmge p3.d, p0/z, z1.d, #0.0
-; CHECK-NEXT:    fcmge p4.d, p0/z, z0.d, #0.0
 ; CHECK-NEXT:    fcmgt p5.d, p0/z, z3.d, z4.d
 ; CHECK-NEXT:    fcmgt p6.d, p0/z, z2.d, z4.d
 ; CHECK-NEXT:    mov z2.d, #65535 // =0xffff
@@ -372,10 +372,10 @@ define <vscale x 2 x i64> @test_signed_v2f64_v2i64(<vscale x 2 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #4895412794951729151 // =0x43efffffffffffff
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzu z1.d, p0/m, z0.d
 ; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzu z1.d, p0/m, z0.d
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z2.d
 ; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
@@ -391,13 +391,13 @@ define <vscale x 4 x i64> @test_signed_v4f64_v4i64(<vscale x 4 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #4895412794951729151 // =0x43efffffffffffff
+; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, #0.0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    fcvtzu z2.d, p0/m, z0.d
-; CHECK-NEXT:    mov z4.d, x8
 ; CHECK-NEXT:    movprfx z3, z1
 ; CHECK-NEXT:    fcvtzu z3.d, p0/m, z1.d
-; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, #0.0
 ; CHECK-NEXT:    fcmgt p3.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
@@ -429,10 +429,10 @@ define <vscale x 2 x i32> @test_signed_v2f16_v2i32(<vscale x 2 x half> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    fcvtzu z2.d, p0/m, z0.h
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    fcvtzu z2.d, p0/m, z0.h
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT:    mov z0.d, #0xffffffff
@@ -448,10 +448,10 @@ define <vscale x 4 x i32> @test_signed_v4f16_v4i32(<vscale x 4 x half> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzu z1.s, p0/m, z0.h
 ; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzu z1.s, p0/m, z0.h
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z2.h
 ; CHECK-NEXT:    mov z1.s, p1/m, #0 // =0x0
@@ -469,13 +469,13 @@ define <vscale x 8 x i32> @test_signed_v8f16_v8i32(<vscale x 8 x half> %f) {
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    uunpkhi z3.s, z0.h
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    mov z4.h, w8
+; CHECK-NEXT:    fcmge p1.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    fcmge p2.h, p0/z, z3.h, #0.0
 ; CHECK-NEXT:    movprfx z0, z2
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z2.h
 ; CHECK-NEXT:    movprfx z1, z3
 ; CHECK-NEXT:    fcvtzu z1.s, p0/m, z3.h
-; CHECK-NEXT:    mov z4.h, w8
-; CHECK-NEXT:    fcmge p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    fcmge p2.h, p0/z, z3.h, #0.0
 ; CHECK-NEXT:    fcmgt p3.h, p0/z, z2.h, z4.h
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
@@ -494,10 +494,10 @@ define <vscale x 4 x i16> @test_signed_v4f16_v4i16(<vscale x 4 x half> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    fcvtzu z2.s, p0/m, z0.h
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    fcvtzu z2.s, p0/m, z0.h
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT:    mov z0.s, #65535 // =0xffff
@@ -513,10 +513,10 @@ define <vscale x 8 x i16> @test_signed_v8f16_v8i16(<vscale x 8 x half> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzu z1.h, p0/m, z0.h
 ; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzu z1.h, p0/m, z0.h
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z2.h
 ; CHECK-NEXT:    mov z1.h, p1/m, #0 // =0x0
@@ -532,10 +532,10 @@ define <vscale x 2 x i64> @test_signed_v2f16_v2i64(<vscale x 2 x half> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzu z1.d, p0/m, z0.h
 ; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzu z1.d, p0/m, z0.h
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z2.h
 ; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
@@ -553,13 +553,13 @@ define <vscale x 4 x i64> @test_signed_v4f16_v4i64(<vscale x 4 x half> %f) {
 ; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    uunpkhi z3.d, z0.s
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    mov z4.h, w8
+; CHECK-NEXT:    fcmge p1.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    fcmge p2.h, p0/z, z3.h, #0.0
 ; CHECK-NEXT:    movprfx z0, z2
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z2.h
 ; CHECK-NEXT:    movprfx z1, z3
 ; CHECK-NEXT:    fcvtzu z1.d, p0/m, z3.h
-; CHECK-NEXT:    mov z4.h, w8
-; CHECK-NEXT:    fcmge p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    fcmge p2.h, p0/z, z3.h, #0.0
 ; CHECK-NEXT:    fcmgt p3.h, p0/z, z2.h, z4.h
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
diff --git a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll
index 1a6e4814cee6d2..5c4c9463528b87 100644
--- a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll
@@ -71,18 +71,18 @@ define <vscale x 4 x i8> @gather_i8_index_offset_8(ptr %base, i64 %offset, <vsca
 define void @scatter_f16_index_offset_var(ptr %base, i64 %offset, i64 %scale, <vscale x 4 x i1> %pg, <vscale x 4 x half> %data) #0 {
 ; CHECK-LABEL: scatter_f16_index_offset_var:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    index z1.d, #0, #1
 ; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    index z1.d, #0, #1
 ; CHECK-NEXT:    mov z2.d, x1
-; CHECK-NEXT:    movprfx z4, z2
-; CHECK-NEXT:    mla z4.d, p1/m, z1.d, z2.d
 ; CHECK-NEXT:    punpklo p2.h, p0.b
 ; CHECK-NEXT:    uunpklo z3.d, z0.s
-; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    movprfx z4, z2
+; CHECK-NEXT:    mla z4.d, p1/m, z1.d, z2.d
 ; CHECK-NEXT:    incd z1.d
-; CHECK-NEXT:    st1h { z3.d }, p2, [x0, z4.d, lsl #1]
 ; CHECK-NEXT:    mad z1.d, p1/m, z2.d, z2.d
+; CHECK-NEXT:    st1h { z3.d }, p2, [x0, z4.d, lsl #1]
 ; CHECK-NEXT:    st1h { z0.d }, p0, [x0, z1.d, lsl #1]
 ; CHECK-NEXT:    ret
   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
index 3ecfa7e6c0b186..e40d65efb158be 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
@@ -145,10 +145,10 @@ define <vscale x 4 x double> @masked_gather_nxv4f64(ptr %base, <vscale x 4 x i16
 ; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    sxth z1.s, p1/m, z0.s
-; CHECK-NEXT:    sunpklo z0.d, z1.s
 ; CHECK-NEXT:    punpklo p1.h, p0.b
-; CHECK-NEXT:    sunpkhi z1.d, z1.s
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    sunpklo z0.d, z1.s
+; CHECK-NEXT:    sunpkhi z1.d, z1.s
 ; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x0, z0.d, lsl #3]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, z1.d, lsl #3]
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll b/llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll
index 871c74a1de14c2..a2429d4975b7ac 100644
--- a/llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll
+++ b/llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll
@@ -129,9 +129,9 @@ define <vscale x 64 x i8> @smulo_nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i
 ; CHECK-NEXT:    mul z27.b, p0/m, z27.b, z5.b
 ; CHECK-NEXT:    smulh z3.b, p0/m, z3.b, z7.b
 ; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z4.b
-; CHECK-NEXT:    asr z4.b, z25.b, #7
 ; CHECK-NEXT:    smulh z2.b, p0/m, z2.b, z6.b
 ; CHECK-NEXT:    smulh z1.b, p0/m, z1.b, z5.b
+; CHECK-NEXT:    asr z4.b, z25.b, #7
 ; CHECK-NEXT:    asr z5.b, z24.b, #7
 ; CHECK-NEXT:    asr z6.b, z26.b, #7
 ; CHECK-NEXT:    asr z7.b, z27.b, #7
@@ -140,13 +140,13 @@ define <vscale x 64 x i8> @smulo_nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i
 ; CHECK-NEXT:    cmpne p3.b, p0/z, z2.b, z6.b
 ; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, z7.b
 ; CHECK-NEXT:    mov z25.b, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z26.b, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z24.b, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z27.b, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z26.b, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z0.d, z25.d
+; CHECK-NEXT:    mov z2.d, z26.d
 ; CHECK-NEXT:    mov z3.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z27.d
-; CHECK-NEXT:    mov z2.d, z26.d
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 64 x i8>, <vscale x 64 x i1> } @llvm.smul.with.overflow.nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %y)
   %b = extractvalue { <vscale x 64 x i8>, <vscale x 64 x i1> } %a, 0
@@ -262,9 +262,9 @@ define <vscale x 32 x i16> @smulo_nxv32i16(<vscale x 32 x i16> %x, <vscale x 32
 ; CHECK-NEXT:    mul z27.h, p0/m, z27.h, z5.h
 ; CHECK-NEXT:    smulh z3.h, p0/m, z3.h, z7.h
 ; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z4.h
-; CHECK-NEXT:    asr z4.h, z25.h, #15
 ; CHECK-NEXT:    smulh z2.h, p0/m, z2.h, z6.h
 ; CHECK-NEXT:    smulh z1.h, p0/m, z1.h, z5.h
+; CHECK-NEXT:    asr z4.h, z25.h, #15
 ; CHECK-NEXT:    asr z5.h, z24.h, #15
 ; CHECK-NEXT:    asr z6.h, z26.h, #15
 ; CHECK-NEXT:    asr z7.h, z27.h, #15
@@ -273,13 +273,13 @@ define <vscale x 32 x i16> @smulo_nxv32i16(<vscale x 32 x i16> %x, <vscale x 32
 ; CHECK-NEXT:    cmpne p3.h, p0/z, z2.h, z6.h
 ; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, z7.h
 ; CHECK-NEXT:    mov z25.h, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z26.h, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z24.h, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z27.h, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z26.h, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z0.d, z25.d
+; CHECK-NEXT:    mov z2.d, z26.d
 ; CHECK-NEXT:    mov z3.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z27.d
-; CHECK-NEXT:    mov z2.d, z26.d
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 32 x i16>, <vscale x 32 x i1> } @llvm.smul.with.overflow.nxv32i16(<vscale x 32 x i16> %x, <vscale x 32 x i16> %y)
   %b = extractvalue { <vscale x 32 x i16>, <vscale x 32 x i1> } %a, 0
@@ -374,9 +374,9 @@ define <vscale x 16 x i32> @smulo_nxv16i32(<vscale x 16 x i32> %x, <vscale x 16
 ; CHECK-NEXT:    mul z27.s, p0/m, z27.s, z5.s
 ; CHECK-NEXT:    smulh z3.s, p0/m, z3.s, z7.s
 ; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z4.s
-; CHECK-NEXT:    asr z4.s, z25.s, #31
 ; CHECK-NEXT:    smulh z2.s, p0/m, z2.s, z6.s
 ; CHECK-NEXT:    smulh z1.s, p0/m, z1.s, z5.s
+; CHECK-NEXT:    asr z4.s, z25.s, #31
 ; CHECK-NEXT:    asr z5.s, z24.s, #31
 ; CHECK-NEXT:    asr z6.s, z26.s, #31
 ; CHECK-NEXT:    asr z7.s, z27.s, #31
@@ -385,13 +385,13 @@ define <vscale x 16 x i32> @smulo_nxv16i32(<vscale x 16 x i32> %x, <vscale x 16
 ; CHECK-NEXT:    cmpne p3.s, p0/z, z2.s, z6.s
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, z7.s
 ; CHECK-NEXT:    mov z25.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z26.s, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z24.s, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z27.s, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z26.s, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z0.d, z25.d
+; CHECK-NEXT:    mov z2.d, z26.d
 ; CHECK-NEXT:    mov z3.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z27.d
-; CHECK-NEXT:    mov z2.d, z26.d
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 16 x i32>, <vscale x 16 x i1> } @llvm.smul.with.overflow.nxv16i32(<vscale x 16 x i32> %x, <vscale x 16 x i32> %y)
   %b = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i1> } %a, 0
@@ -465,9 +465,9 @@ define <vscale x 8 x i64> @smulo_nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i6
 ; CHECK-NEXT:    mul z27.d, p0/m, z27.d, z5.d
 ; CHECK-NEXT:    smulh z3.d, p0/m, z3.d, z7.d
 ; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z4.d
-; CHECK-NEXT:    asr z4.d, z25.d, #63
 ; CHECK-NEXT:    smulh z2.d, p0/m, z2.d, z6.d
 ; CHECK-NEXT:    smulh z1.d, p0/m, z1.d, z5.d
+; CHECK-NEXT:    asr z4.d, z25.d, #63
 ; CHECK-NEXT:    asr z5.d, z24.d, #63
 ; CHECK-NEXT:    asr z6.d, z26.d, #63
 ; CHECK-NEXT:    asr z7.d, z27.d, #63
@@ -476,13 +476,13 @@ define <vscale x 8 x i64> @smulo_nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i6
 ; CHECK-NEXT:    cmpne p3.d, p0/z, z2.d, z6.d
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, z7.d
 ; CHECK-NEXT:    mov z25.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z26.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z24.d, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z27.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z26.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z0.d, z25.d
+; CHECK-NEXT:    mov z2.d, z26.d
 ; CHECK-NEXT:    mov z3.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z27.d
-; CHECK-NEXT:    mov z2.d, z26.d
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 8 x i64>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y)
   %b = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i1> } %a, 0
diff --git a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
index 44796606e7a1a4..3997409172d03b 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
@@ -6,8 +6,8 @@
 define <vscale x 8 x float> @fcvts_nxv8f16(<vscale x 8 x half> %a) {
 ; CHECK-LABEL: fcvts_nxv8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpklo z1.s, z0.h
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uunpklo z1.s, z0.h
 ; CHECK-NEXT:    uunpkhi z2.s, z0.h
 ; CHECK-NEXT:    movprfx z0, z1
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z1.h
@@ -21,8 +21,8 @@ define <vscale x 8 x float> @fcvts_nxv8f16(<vscale x 8 x half> %a) {
 define <vscale x 4 x double> @fcvtd_nxv4f16(<vscale x 4 x half> %a) {
 ; CHECK-LABEL: fcvtd_nxv4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    uunpkhi z2.d, z0.s
 ; CHECK-NEXT:    movprfx z0, z1
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z1.h
@@ -43,13 +43,13 @@ define <vscale x 8 x double> @fcvtd_nxv8f16(<vscale x 8 x half> %a) {
 ; CHECK-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEXT:    uunpklo z3.d, z0.s
 ; CHECK-NEXT:    uunpkhi z4.d, z0.s
+; CHECK-NEXT:    fcvt z1.d, p0/m, z1.h
 ; CHECK-NEXT:    movprfx z0, z2
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z2.h
 ; CHECK-NEXT:    movprfx z2, z3
 ; CHECK-NEXT:    fcvt z2.d, p0/m, z3.h
 ; CHECK-NEXT:    movprfx z3, z4
 ; CHECK-NEXT:    fcvt z3.d, p0/m, z4.h
-; CHECK-NEXT:    fcvt z1.d, p0/m, z1.h
 ; CHECK-NEXT:    ret
   %res = fpext <vscale x 8 x half> %a to <vscale x 8 x double>
   ret <vscale x 8 x double> %res
@@ -58,8 +58,8 @@ define <vscale x 8 x double> @fcvtd_nxv8f16(<vscale x 8 x half> %a) {
 define <vscale x 4 x double> @fcvtd_nxv4f32(<vscale x 4 x float> %a) {
 ; CHECK-LABEL: fcvtd_nxv4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    uunpkhi z2.d, z0.s
 ; CHECK-NEXT:    movprfx z0, z1
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z1.s
@@ -73,17 +73,17 @@ define <vscale x 4 x double> @fcvtd_nxv4f32(<vscale x 4 x float> %a) {
 define <vscale x 8 x double> @fcvtd_nxv8f32(<vscale x 8 x float> %a) {
 ; CHECK-LABEL: fcvtd_nxv8f32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    uunpkhi z3.d, z0.s
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    uunpklo z4.d, z1.s
+; CHECK-NEXT:    uunpkhi z5.d, z1.s
 ; CHECK-NEXT:    movprfx z0, z2
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z2.s
-; CHECK-NEXT:    movprfx z2, z4
-; CHECK-NEXT:    fcvt z2.d, p0/m, z4.s
-; CHECK-NEXT:    uunpkhi z5.d, z1.s
 ; CHECK-NEXT:    movprfx z1, z3
 ; CHECK-NEXT:    fcvt z1.d, p0/m, z3.s
+; CHECK-NEXT:    movprfx z2, z4
+; CHECK-NEXT:    fcvt z2.d, p0/m, z4.s
 ; CHECK-NEXT:    movprfx z3, z5
 ; CHECK-NEXT:    fcvt z3.d, p0/m, z5.s
 ; CHECK-NEXT:    ret
@@ -195,8 +195,8 @@ define <vscale x 8 x i16> @fcvtzs_h_nxv8f64(<vscale x 8 x double> %a) {
 define <vscale x 4 x i64> @fcvtzs_d_nxv4f32(<vscale x 4 x float> %a) {
 ; CHECK-LABEL: fcvtzs_d_nxv4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    uunpkhi z2.d, z0.s
 ; CHECK-NEXT:    movprfx z0, z1
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z1.s
@@ -210,17 +210,17 @@ define <vscale x 4 x i64> @fcvtzs_d_nxv4f32(<vscale x 4 x float> %a) {
 define <vscale x 16 x i32> @fcvtzs_s_nxv16f16(<vscale x 16 x half> %a) {
 ; CHECK-LABEL: fcvtzs_s_nxv16f16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    uunpkhi z3.s, z0.h
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uunpklo z4.s, z1.h
+; CHECK-NEXT:    uunpkhi z5.s, z1.h
 ; CHECK-NEXT:    movprfx z0, z2
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z2.h
-; CHECK-NEXT:    movprfx z2, z4
-; CHECK-NEXT:    fcvtzs z2.s, p0/m, z4.h
-; CHECK-NEXT:    uunpkhi z5.s, z1.h
 ; CHECK-NEXT:    movprfx z1, z3
 ; CHECK-NEXT:    fcvtzs z1.s, p0/m, z3.h
+; CHECK-NEXT:    movprfx z2, z4
+; CHECK-NEXT:    fcvtzs z2.s, p0/m, z4.h
 ; CHECK-NEXT:    movprfx z3, z5
 ; CHECK-NEXT:    fcvtzs z3.s, p0/m, z5.h
 ; CHECK-NEXT:    ret
@@ -247,8 +247,8 @@ define <vscale x 4 x i32> @fcvtzu_s_nxv4f64(<vscale x 4 x double> %a) {
 define <vscale x 4 x i64> @fcvtzu_d_nxv4f32(<vscale x 4 x float> %a) {
 ; CHECK-LABEL: fcvtzu_d_nxv4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    uunpkhi z2.d, z0.s
 ; CHECK-NEXT:    movprfx z0, z1
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z1.s
@@ -301,13 +301,13 @@ define <vscale x 16 x float> @scvtf_s_nxv16i8(<vscale x 16 x i8> %a) {
 ; CHECK-NEXT:    sunpkhi z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z3.s, z0.h
 ; CHECK-NEXT:    sunpkhi z4.s, z0.h
+; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT:    movprfx z0, z2
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z2.s
 ; CHECK-NEXT:    movprfx z2, z3
 ; CHECK-NEXT:    scvtf z2.s, p0/m, z3.s
 ; CHECK-NEXT:    movprfx z3, z4
 ; CHECK-NEXT:    scvtf z3.s, p0/m, z4.s
-; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
   %res = sitofp <vscale x 16 x i8> %a to <vscale x 16 x float>
   ret <vscale x 16 x float> %res
@@ -316,8 +316,8 @@ define <vscale x 16 x float> @scvtf_s_nxv16i8(<vscale x 16 x i8> %a) {
 define <vscale x 4 x double> @scvtf_d_nxv4i32(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: scvtf_d_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sunpklo z1.d, z0.s
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sunpklo z1.d, z0.s
 ; CHECK-NEXT:    sunpkhi z2.d, z0.s
 ; CHECK-NEXT:    movprfx z0, z1
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z1.d
@@ -378,8 +378,8 @@ define <vscale x 8 x half> @ucvtf_h_nxv8i64(<vscale x 8 x i64> %a) {
 define <vscale x 4 x double> @ucvtf_d_nxv4i32(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: ucvtf_d_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    uunpkhi z2.d, z0.s
 ; CHECK-NEXT:    movprfx z0, z1
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z1.d
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
index 37cc79011a9866..28e02da53af434 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
@@ -239,10 +239,10 @@ define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
 ; CHECK-NEXT:    ld1h { z4.d }, p0/z, [x0, x8, lsl #1]
-; CHECK-NEXT:    fcvt z1.d, p0/m, z1.h
 ; CHECK-NEXT:    mov x8, #6 // =0x6
-; CHECK-NEXT:    fcvt z2.d, p0/m, z2.h
+; CHECK-NEXT:    fcvt z1.d, p0/m, z1.h
 ; CHECK-NEXT:    ld1h { z5.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    fcvt z2.d, p0/m, z2.h
 ; CHECK-NEXT:    mov x8, #2 // =0x2
 ; CHECK-NEXT:    fcvt z3.d, p0/m, z3.h
 ; CHECK-NEXT:    ld1h { z7.d }, p0/z, [x0, x8, lsl #1]
@@ -255,8 +255,8 @@ define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q3, [x1, #64]
 ; CHECK-NEXT:    movprfx z2, z7
 ; CHECK-NEXT:    fcvt z2.d, p0/m, z7.h
-; CHECK-NEXT:    stp q1, q2, [x1]
 ; CHECK-NEXT:    stp q4, q0, [x1, #32]
+; CHECK-NEXT:    stp q1, q2, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fpext <16 x half> %op1 to <16 x double>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index 5e3ce0ddebe2e0..fcf4f21c6ea842 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -750,14 +750,14 @@ define void @udiv_constantsplat_v8i32(ptr %a)  {
 ; SVE-NEXT:    mov z0.s, w8
 ; SVE-NEXT:    movprfx z3, z1
 ; SVE-NEXT:    umulh z3.s, p0/m, z3.s, z0.s
-; SVE-NEXT:    sub z1.s, z1.s, z3.s
 ; SVE-NEXT:    umulh z0.s, p0/m, z0.s, z2.s
-; SVE-NEXT:    lsr z1.s, z1.s, #1
+; SVE-NEXT:    sub z1.s, z1.s, z3.s
 ; SVE-NEXT:    sub z2.s, z2.s, z0.s
-; SVE-NEXT:    add z1.s, z1.s, z3.s
+; SVE-NEXT:    lsr z1.s, z1.s, #1
 ; SVE-NEXT:    lsr z2.s, z2.s, #1
-; SVE-NEXT:    lsr z1.s, z1.s, #6
+; SVE-NEXT:    add z1.s, z1.s, z3.s
 ; SVE-NEXT:    add z0.s, z2.s, z0.s
+; SVE-NEXT:    lsr z1.s, z1.s, #6
 ; SVE-NEXT:    lsr z0.s, z0.s, #6
 ; SVE-NEXT:    stp q1, q0, [x0]
 ; SVE-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
index eb95a410209b45..4a1209b942f4a0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
@@ -139,7 +139,6 @@ define void @srem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sdiv z5.s, p0/m, z5.s, z4.s
 ; CHECK-NEXT:    ldr q3, [x0]
 ; CHECK-NEXT:    ldr q4, [x1]
-; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
 ; CHECK-NEXT:    mov z18.d, z3.d
 ; CHECK-NEXT:    mov z17.d, z4.d
 ; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
@@ -154,6 +153,7 @@ define void @srem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
 ; CHECK-NEXT:    sunpklo z18.s, z18.h
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
 ; CHECK-NEXT:    sunpklo z17.s, z17.h
 ; CHECK-NEXT:    sdivr z19.s, p0/m, z19.s, z20.s
 ; CHECK-NEXT:    sunpklo z20.h, z3.b
@@ -172,21 +172,21 @@ define void @srem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sdivr z18.s, p0/m, z18.s, z20.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    uzp1 z19.h, z21.h, z21.h
+; CHECK-NEXT:    splice z16.h, p0, z16.h, z17.h
 ; CHECK-NEXT:    splice z2.h, p0, z2.h, z5.h
 ; CHECK-NEXT:    splice z6.h, p0, z6.h, z7.h
-; CHECK-NEXT:    splice z16.h, p0, z16.h, z17.h
+; CHECK-NEXT:    uzp1 z5.b, z16.b, z16.b
 ; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
 ; CHECK-NEXT:    uzp1 z6.b, z6.b, z6.b
-; CHECK-NEXT:    uzp1 z5.b, z16.b, z16.b
 ; CHECK-NEXT:    uzp1 z18.h, z18.h, z18.h
 ; CHECK-NEXT:    splice z19.h, p0, z19.h, z18.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
 ; CHECK-NEXT:    uzp1 z7.b, z19.b, z19.b
 ; CHECK-NEXT:    splice z6.b, p0, z6.b, z2.b
 ; CHECK-NEXT:    splice z7.b, p0, z7.b, z5.b
+; CHECK-NEXT:    mls z0.b, p1/m, z6.b, z1.b
 ; CHECK-NEXT:    movprfx z2, z3
 ; CHECK-NEXT:    mls z2.b, p1/m, z7.b, z4.b
-; CHECK-NEXT:    mls z0.b, p1/m, z6.b, z1.b
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
@@ -293,8 +293,8 @@ define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-LABEL: srem_v2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
@@ -308,8 +308,8 @@ define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-LABEL: srem_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
@@ -345,8 +345,8 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-LABEL: srem_v1i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
@@ -360,8 +360,8 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-LABEL: srem_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
@@ -528,7 +528,6 @@ define void @urem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    udiv z5.s, p0/m, z5.s, z4.s
 ; CHECK-NEXT:    ldr q3, [x0]
 ; CHECK-NEXT:    ldr q4, [x1]
-; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
 ; CHECK-NEXT:    mov z18.d, z3.d
 ; CHECK-NEXT:    mov z17.d, z4.d
 ; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
@@ -543,6 +542,7 @@ define void @urem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
 ; CHECK-NEXT:    uunpklo z18.s, z18.h
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
 ; CHECK-NEXT:    uunpklo z17.s, z17.h
 ; CHECK-NEXT:    udivr z19.s, p0/m, z19.s, z20.s
 ; CHECK-NEXT:    uunpklo z20.h, z3.b
@@ -561,21 +561,21 @@ define void @urem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    udivr z18.s, p0/m, z18.s, z20.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    uzp1 z19.h, z21.h, z21.h
+; CHECK-NEXT:    splice z16.h, p0, z16.h, z17.h
 ; CHECK-NEXT:    splice z2.h, p0, z2.h, z5.h
 ; CHECK-NEXT:    splice z6.h, p0, z6.h, z7.h
-; CHECK-NEXT:    splice z16.h, p0, z16.h, z17.h
+; CHECK-NEXT:    uzp1 z5.b, z16.b, z16.b
 ; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
 ; CHECK-NEXT:    uzp1 z6.b, z6.b, z6.b
-; CHECK-NEXT:    uzp1 z5.b, z16.b, z16.b
 ; CHECK-NEXT:    uzp1 z18.h, z18.h, z18.h
 ; CHECK-NEXT:    splice z19.h, p0, z19.h, z18.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
 ; CHECK-NEXT:    uzp1 z7.b, z19.b, z19.b
 ; CHECK-NEXT:    splice z6.b, p0, z6.b, z2.b
 ; CHECK-NEXT:    splice z7.b, p0, z7.b, z5.b
+; CHECK-NEXT:    mls z0.b, p1/m, z6.b, z1.b
 ; CHECK-NEXT:    movprfx z2, z3
 ; CHECK-NEXT:    mls z2.b, p1/m, z7.b, z4.b
-; CHECK-NEXT:    mls z0.b, p1/m, z6.b, z1.b
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
@@ -682,8 +682,8 @@ define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-LABEL: urem_v2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
@@ -697,8 +697,8 @@ define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-LABEL: urem_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
@@ -734,8 +734,8 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-LABEL: urem_v1i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
@@ -749,8 +749,8 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-LABEL: urem_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index c110e89326cc0c..5c5cf68135bf8d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -221,21 +221,21 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ext z6.b, z6.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    ucvtf z2.d, p0/m, z2.d
-; CHECK-NEXT:    ucvtf z3.d, p0/m, z3.d
 ; CHECK-NEXT:    uunpklo z7.d, z7.s
+; CHECK-NEXT:    ucvtf z3.d, p0/m, z3.d
 ; CHECK-NEXT:    uunpklo z5.d, z5.s
+; CHECK-NEXT:    uunpklo z6.d, z6.s
 ; CHECK-NEXT:    ucvtf z4.d, p0/m, z4.d
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    uunpklo z6.d, z6.s
 ; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    ucvtf z5.d, p0/m, z5.d
 ; CHECK-NEXT:    stp q2, q4, [x1, #64]
 ; CHECK-NEXT:    movprfx z2, z6
 ; CHECK-NEXT:    ucvtf z2.d, p0/m, z6.d
-; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    stp q0, q5, [x1, #96]
 ; CHECK-NEXT:    movprfx z0, z7
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z7.d
+; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
@@ -481,10 +481,10 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) {
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
 ; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
 ; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
-; CHECK-NEXT:    movprfx z0, z1
-; CHECK-NEXT:    fcvt z0.h, p1/m, z1.s
 ; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvt z0.h, p1/m, z1.s
 ; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fcvt z1.h, p1/m, z2.s
 ; CHECK-NEXT:    uzp1 z2.h, z0.h, z0.h
@@ -794,21 +794,21 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ext z6.b, z6.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
 ; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
-; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
 ; CHECK-NEXT:    sunpklo z7.d, z7.s
+; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
 ; CHECK-NEXT:    sunpklo z5.d, z5.s
+; CHECK-NEXT:    sunpklo z6.d, z6.s
 ; CHECK-NEXT:    scvtf z4.d, p0/m, z4.d
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    sunpklo z6.d, z6.s
 ; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    scvtf z5.d, p0/m, z5.d
 ; CHECK-NEXT:    stp q2, q4, [x1, #64]
 ; CHECK-NEXT:    movprfx z2, z6
 ; CHECK-NEXT:    scvtf z2.d, p0/m, z6.d
-; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    stp q0, q5, [x1, #96]
 ; CHECK-NEXT:    movprfx z0, z7
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z7.d
+; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
@@ -982,16 +982,16 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
 ; CHECK-NEXT:    ext z6.b, z6.b, z4.b, #8
-; CHECK-NEXT:    sunpklo z4.d, z4.s
 ; CHECK-NEXT:    ext z7.b, z7.b, z5.b, #8
+; CHECK-NEXT:    sunpklo z4.d, z4.s
 ; CHECK-NEXT:    sunpklo z5.d, z5.s
 ; CHECK-NEXT:    sunpklo z2.d, z2.s
 ; CHECK-NEXT:    sunpklo z3.d, z3.s
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    sunpklo z6.d, z6.s
+; CHECK-NEXT:    sunpklo z7.d, z7.s
 ; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    scvtf z4.d, p0/m, z4.d
-; CHECK-NEXT:    sunpklo z7.d, z7.s
 ; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
 ; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
 ; CHECK-NEXT:    stp q1, q3, [x1, #64]
diff --git a/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll b/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll
index 45d796f68c4249..36d64725742e57 100644
--- a/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll
+++ b/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll
@@ -120,16 +120,16 @@ define <vscale x 64 x i8> @umulo_nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i
 ; CHECK-NEXT:    umulh z27.b, p0/m, z27.b, z5.b
 ; CHECK-NEXT:    mul z3.b, p0/m, z3.b, z7.b
 ; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z4.b
-; CHECK-NEXT:    cmpne p1.b, p0/z, z25.b, #0
 ; CHECK-NEXT:    mul z2.b, p0/m, z2.b, z6.b
 ; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z5.b
+; CHECK-NEXT:    cmpne p1.b, p0/z, z25.b, #0
 ; CHECK-NEXT:    cmpne p2.b, p0/z, z24.b, #0
 ; CHECK-NEXT:    cmpne p3.b, p0/z, z26.b, #0
 ; CHECK-NEXT:    cmpne p0.b, p0/z, z27.b, #0
 ; CHECK-NEXT:    mov z0.b, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.b, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z3.b, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.b, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.b, p3/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 64 x i8>, <vscale x 64 x i1> } @llvm.umul.with.overflow.nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %y)
   %b = extractvalue { <vscale x 64 x i8>, <vscale x 64 x i1> } %a, 0
@@ -237,16 +237,16 @@ define <vscale x 32 x i16> @umulo_nxv32i16(<vscale x 32 x i16> %x, <vscale x 32
 ; CHECK-NEXT:    umulh z27.h, p0/m, z27.h, z5.h
 ; CHECK-NEXT:    mul z3.h, p0/m, z3.h, z7.h
 ; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z4.h
-; CHECK-NEXT:    cmpne p1.h, p0/z, z25.h, #0
 ; CHECK-NEXT:    mul z2.h, p0/m, z2.h, z6.h
 ; CHECK-NEXT:    mul z1.h, p0/m, z1.h, z5.h
+; CHECK-NEXT:    cmpne p1.h, p0/z, z25.h, #0
 ; CHECK-NEXT:    cmpne p2.h, p0/z, z24.h, #0
 ; CHECK-NEXT:    cmpne p3.h, p0/z, z26.h, #0
 ; CHECK-NEXT:    cmpne p0.h, p0/z, z27.h, #0
 ; CHECK-NEXT:    mov z0.h, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.h, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z3.h, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.h, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.h, p3/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 32 x i16>, <vscale x 32 x i1> } @llvm.umul.with.overflow.nxv32i16(<vscale x 32 x i16> %x, <vscale x 32 x i16> %y)
   %b = extractvalue { <vscale x 32 x i16>, <vscale x 32 x i1> } %a, 0
@@ -334,16 +334,16 @@ define <vscale x 16 x i32> @umulo_nxv16i32(<vscale x 16 x i32> %x, <vscale x 16
 ; CHECK-NEXT:    umulh z27.s, p0/m, z27.s, z5.s
 ; CHECK-NEXT:    mul z3.s, p0/m, z3.s, z7.s
 ; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z4.s
-; CHECK-NEXT:    cmpne p1.s, p0/z, z25.s, #0
 ; CHECK-NEXT:    mul z2.s, p0/m, z2.s, z6.s
 ; CHECK-NEXT:    mul z1.s, p0/m, z1.s, z5.s
+; CHECK-NEXT:    cmpne p1.s, p0/z, z25.s, #0
 ; CHECK-NEXT:    cmpne p2.s, p0/z, z24.s, #0
 ; CHECK-NEXT:    cmpne p3.s, p0/z, z26.s, #0
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z27.s, #0
 ; CHECK-NEXT:    mov z0.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.s, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z3.s, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.s, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.s, p3/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 16 x i32>, <vscale x 16 x i1> } @llvm.umul.with.overflow.nxv16i32(<vscale x 16 x i32> %x, <vscale x 16 x i32> %y)
   %b = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i1> } %a, 0
@@ -411,16 +411,16 @@ define <vscale x 8 x i64> @umulo_nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i6
 ; CHECK-NEXT:    umulh z27.d, p0/m, z27.d, z5.d
 ; CHECK-NEXT:    mul z3.d, p0/m, z3.d, z7.d
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z4.d
-; CHECK-NEXT:    cmpne p1.d, p0/z, z25.d, #0
 ; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z6.d
 ; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z5.d
+; CHECK-NEXT:    cmpne p1.d, p0/z, z25.d, #0
 ; CHECK-NEXT:    cmpne p2.d, p0/z, z24.d, #0
 ; CHECK-NEXT:    cmpne p3.d, p0/z, z26.d, #0
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z27.d, #0
 ; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z3.d, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 8 x i64>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y)
   %b = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i1> } %a, 0
diff --git a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
index 4d0d2a991d0e56..194d1071301d4d 100644
--- a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
@@ -31,8 +31,8 @@ define i32 @test(<vscale x 32 x i8> %bin.rdx, <vscale x 32 x i8> %bin.rdx2)  {
 ; CHECK-NEXT:    sunpkhi z24.s, z3.h
 ; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z2.s
 ; CHECK-NEXT:    sunpkhi z2.s, z7.h
-; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sunpklo z7.s, z7.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    mla z0.s, p0/m, z25.s, z24.s
 ; CHECK-NEXT:    mad z2.s, p0/m, z6.s, z4.s
 ; CHECK-NEXT:    mad z1.s, p0/m, z3.s, z26.s
diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll
index 62680522bab93a..e297ade6b9ae1f 100644
--- a/llvm/test/CodeGen/AArch64/sve2-xar.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll
@@ -155,8 +155,8 @@ define <vscale x 2 x i64> @xar_nxv2i64_l_neg1(<vscale x 2 x i64> %x, <vscale x 2
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    subr z2.d, z2.d, #0 // =0x0
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
-; CHECK-NEXT:    and z2.d, z2.d, #0x3f
 ; CHECK-NEXT:    and z3.d, z3.d, #0x3f
+; CHECK-NEXT:    and z2.d, z2.d, #0x3f
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    lsl z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z2.d