[llvm] [DAG] Handle truncated splat in isBoolConstant (PR #145473)

David Green via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 10 08:52:06 PDT 2025


https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/145473

From da133f3f08ed5c1ae38d39d05852a4b2917457ed Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Thu, 10 Jul 2025 16:51:13 +0100
Subject: [PATCH] [DAG] Handle truncated splat in isBoolConstant

This teaches isBoolConstant to look through truncated splats / buildvectors,
so that certain `not` patterns can be recognized post-legalization and
vselect can be optimized.
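
For illustration, here is a toy model (plain C++, not LLVM code) of the new
behaviour: the splatted constant is truncated to the scalar width of the
result type before being interpreted under the target's boolean contents, so
a wide splat of -1 seen through a truncate still reads as "true":
```
#include <cstdint>
#include <iostream>
#include <optional>

enum class BooleanContent { ZeroOrOne, ZeroOrNegativeOne };

// Truncate Val to Bits bits, then decide whether it is the canonical
// true/false value for the given boolean content (the patch does the same
// with APInt::trunc on the splatted APInt value).
std::optional<bool> isBoolConstant(uint64_t Val, unsigned Bits,
                                   BooleanContent BC) {
  uint64_t Mask = Bits >= 64 ? ~0ull : ((1ull << Bits) - 1);
  uint64_t Trunc = Val & Mask;
  switch (BC) {
  case BooleanContent::ZeroOrOne:
    if (Trunc == 1)
      return true;
    break;
  case BooleanContent::ZeroOrNegativeOne:
    if (Trunc == Mask) // all ones at the truncated width
      return true;
    break;
  }
  if (Trunc == 0)
    return false;
  return std::nullopt; // not a canonical boolean constant
}

int main() {
  // A 64-bit splat of -1 viewed through a truncate to 1-bit elements:
  // previously rejected by isBoolConstant, now recognized as "true".
  auto R = isBoolConstant(~0ull, 1, BooleanContent::ZeroOrNegativeOne);
  std::cout << (R && *R) << '\n'; // prints 1
}
```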

An override for x86 AVX512 predicated vectors is required to avoid an
infinite recursion between this combine and the code that detects zero
vectors. From:
```
  // Check if the first operand is all zeros and Cond type is vXi1.
  // If this an avx512 target we can improve the use of zero masking by
  // swapping the operands and inverting the condition.
```
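
In other words, without the override the generic combiner rewrites
`vselect (not C), X, 0` into `vselect C, 0, X`, and the AVX512 zero-masking
combine quoted above immediately rewrites it back, so the two transforms
ping-pong forever. A toy sketch of that interaction (assumed behaviour,
heavily simplified, not the actual LLVM implementation):
```
#include <cassert>

// A vselect is modelled by whether its condition is wrapped in a `not` and
// which operand holds the all-zeros vector.
struct VSelect {
  bool NotCond;     // condition is (not C)
  bool ZerosAtTrue; // true: vselect C, 0, X; false: vselect C, X, 0
};

// Generic DAGCombiner: vselect (not C), N1, N2 -> vselect C, N2, N1.
VSelect genericFlip(VSelect N) { return {false, !N.ZerosAtTrue}; }

// AVX512 combine: vselect C, 0, X -> vselect (not C), X, 0 (zero masking).
VSelect avx512ZeroMask(VSelect N) { return {!N.NotCond, false}; }

// The new hook: on AVX512, zeros in the false operand is already canonical,
// so the generic flip should leave the node alone.
bool isTargetCanonicalSelect(VSelect N) { return !N.ZerosAtTrue; }

int main() {
  VSelect N{/*NotCond=*/true, /*ZerosAtTrue=*/false}; // vselect (not C), X, 0
  // Without the hook, the two rewrites undo each other forever:
  VSelect Once = avx512ZeroMask(genericFlip(N));
  assert(Once.NotCond == N.NotCond && Once.ZerosAtTrue == N.ZerosAtTrue);
  // With the hook, the generic flip is skipped and the DAG is stable.
  assert(isTargetCanonicalSelect(N));
}
```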
---
 llvm/include/llvm/CodeGen/SelectionDAG.h      |    3 +-
 llvm/include/llvm/CodeGen/TargetLowering.h    |    5 +
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |    5 +-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |   11 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   10 +
 llvm/lib/Target/X86/X86ISelLowering.h         |    2 +
 llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll   |  575 ++---
 llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll   |  346 ++-
 llvm/test/CodeGen/AArch64/sve-llrint.ll       | 2048 ++++++++---------
 llvm/test/CodeGen/AArch64/sve-lrint.ll        | 2048 ++++++++---------
 .../CodeGen/AArch64/sve-pred-selectop2.ll     |  285 +--
 .../CodeGen/AArch64/sve-pred-selectop3.ll     |  177 +-
 .../RISCV/rvv/fixed-vectors-select-addsub.ll  |   12 +-
 llvm/test/CodeGen/X86/pr78897.ll              |   24 +-
 14 files changed, 2400 insertions(+), 3151 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index b856b4786573b..657951ddafd4f 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -2488,8 +2488,7 @@ class SelectionDAG {
 
   /// Check if a value \op N is a constant using the target's BooleanContent for
   /// its type.
-  LLVM_ABI std::optional<bool>
-  isBoolConstant(SDValue N, bool AllowTruncation = false) const;
+  LLVM_ABI std::optional<bool> isBoolConstant(SDValue N) const;
 
   /// Set CallSiteInfo to be associated with Node.
   void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo) {
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index c9e5d9999138f..a248eb7444b20 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4371,6 +4371,11 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
            Op.getOpcode() == ISD::SPLAT_VECTOR_PARTS;
   }
 
+  /// Return true if the given select/vselect should be considered canonical and
+  /// not be transformed. Currently only used for "vselect (not Cond), N1, N2 ->
+  /// vselect Cond, N2, N1".
+  virtual bool isTargetCanonicalSelect(SDNode *N) const { return false; }
+
   struct DAGCombinerInfo {
     void *DC;  // The DAG Combiner object.
     CombineLevel Level;
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9ffdda28f7899..c32e709e162a9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13194,8 +13194,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
     return V;
 
   // vselect (not Cond), N1, N2 -> vselect Cond, N2, N1
-  if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false))
-    return DAG.getSelect(DL, VT, F, N2, N1);
+  if (!TLI.isTargetCanonicalSelect(N))
+    if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false))
+      return DAG.getSelect(DL, VT, F, N2, N1);
 
   // select (sext m), (add X, C), X --> (add X, (and C, (sext m))))
   if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N2 && N1->hasOneUse() &&
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 5a4cc466d2bce..58be4fb7e8331 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -10459,7 +10459,7 @@ SDValue SelectionDAG::simplifySelect(SDValue Cond, SDValue T, SDValue F) {
 
   // select true, T, F --> T
   // select false, T, F --> F
-  if (auto C = isBoolConstant(Cond, /*AllowTruncation=*/true))
+  if (auto C = isBoolConstant(Cond))
     return *C ? T : F;
 
   // select ?, T, T --> T
@@ -13688,13 +13688,14 @@ bool SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) const {
   return false;
 }
 
-std::optional<bool> SelectionDAG::isBoolConstant(SDValue N,
-                                                 bool AllowTruncation) const {
-  ConstantSDNode *Const = isConstOrConstSplat(N, false, AllowTruncation);
+std::optional<bool> SelectionDAG::isBoolConstant(SDValue N) const {
+  ConstantSDNode *Const =
+      isConstOrConstSplat(N, false, /*AllowTruncation=*/true);
   if (!Const)
     return std::nullopt;
 
-  const APInt &CVal = Const->getAPIntValue();
+  EVT VT = N->getValueType(0);
+  const APInt CVal = Const->getAPIntValue().trunc(VT.getScalarSizeInBits());
   switch (TLI->getBooleanContents(N.getValueType())) {
   case TargetLowering::ZeroOrOneBooleanContent:
     if (CVal.isOne())
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1ad1b47a94d28..5e35d5630d667 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4975,6 +4975,16 @@ X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
   return getTargetConstantFromNode(LD);
 }
 
+bool X86TargetLowering::isTargetCanonicalSelect(SDNode *N) const {
+  // Do not fold (vselect not(C), X, 0s) to (vselect C, 0s, X).
+  SDValue Cond = N->getOperand(0);
+  SDValue RHS = N->getOperand(2);
+  EVT CondVT = Cond.getValueType();
+  return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
+         CondVT.getVectorElementType() == MVT::i1 &&
+         ISD::isBuildVectorAllZeros(RHS.getNode());
+}
+
 // Extract raw constant bits from constant pools.
 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
                                           APInt &UndefElts,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 3039b7eeb38ff..6bcb7a36e91b5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1356,6 +1356,8 @@ namespace llvm {
              TargetLowering::isTargetCanonicalConstantNode(Op);
     }
 
+    bool isTargetCanonicalSelect(SDNode *N) const override;
+
     const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
 
     SDValue unwrapAddress(SDValue N) const override;
diff --git a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
index 584c29ebcfc04..1b6b92af8c64a 100644
--- a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
@@ -16,19 +16,16 @@ define <vscale x 2 x i32> @test_signed_v2f32_v2i32(<vscale x 2 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #-822083584 // =0xcf000000
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z2.d, #0xffffffff80000000
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mov w8, #1325400063 // =0x4effffff
-; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.s
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z3.s
-; CHECK-NEXT:    mov z3.d, #0x7fffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0xffffffff80000000
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    mov z2.d, #0x7fffffff
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.s
+; CHECK-NEXT:    sel z0.d, p2, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i32> @llvm.fptosi.sat.nxv2f32.nxv2i32(<vscale x 2 x float> %f)
@@ -40,19 +37,16 @@ define <vscale x 4 x i32> @test_signed_v4f32_v4i32(<vscale x 4 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #-822083584 // =0xcf000000
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z2.s, #0x80000000
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mov w8, #1325400063 // =0x4effffff
-; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.s, p0/m, z0.s
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z3.s
-; CHECK-NEXT:    mov z3.s, #0x7fffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.s, #0x80000000
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    mov z2.s, #0x7fffffff
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    mov z1.s, p1/m, z2.s
-; CHECK-NEXT:    sel z0.s, p2, z3.s, z1.s
+; CHECK-NEXT:    fcvtzs z1.s, p1/m, z0.s
+; CHECK-NEXT:    sel z0.s, p2, z2.s, z1.s
 ; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i32> @llvm.fptosi.sat.nxv4f32.nxv4i32(<vscale x 4 x float> %f)
@@ -62,39 +56,26 @@ define <vscale x 4 x i32> @test_signed_v4f32_v4i32(<vscale x 4 x float> %f) {
 define <vscale x 8 x i32> @test_signed_v8f32_v8i32(<vscale x 8 x float> %f) {
 ; CHECK-LABEL: test_signed_v8f32_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov w8, #-822083584 // =0xcf000000
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z6.s, #0x7fffffff
+; CHECK-NEXT:    mov z3.s, #0x80000000
 ; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    mov w8, #1325400063 // =0x4effffff
-; CHECK-NEXT:    mov z3.s, w8
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzs z4.s, p0/m, z0.s
-; CHECK-NEXT:    movprfx z5, z1
-; CHECK-NEXT:    fcvtzs z5.s, p0/m, z1.s
+; CHECK-NEXT:    mov z4.s, #0x80000000
+; CHECK-NEXT:    mov z5.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z2.s
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z1.s, z2.s
-; CHECK-NEXT:    mov z2.s, #0x80000000
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z0.s, z3.s
-; CHECK-NEXT:    fcmgt p4.s, p0/z, z1.s, z3.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    sel z3.s, p1, z2.s, z4.s
-; CHECK-NEXT:    fcmuo p1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    mov z2.s, #0x7fffffff
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z1.s, z5.s
+; CHECK-NEXT:    fcvtzs z3.s, p1/m, z0.s
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z0.s, z5.s
+; CHECK-NEXT:    fcvtzs z4.s, p2/m, z1.s
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z0.s, z0.s
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z1.s, z1.s
-; CHECK-NEXT:    sel z2.s, p2, z2.s, z5.s
-; CHECK-NEXT:    sel z0.s, p3, z6.s, z3.s
-; CHECK-NEXT:    sel z1.s, p4, z6.s, z2.s
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.s, p1, z2.s, z3.s
+; CHECK-NEXT:    sel z1.s, p3, z2.s, z4.s
+; CHECK-NEXT:    mov z0.s, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.s, p0/m, #0 // =0x0
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i32> @llvm.fptosi.sat.nxv8f32.nxv8i32(<vscale x 8 x float> %f)
     ret <vscale x 8 x i32> %x
@@ -105,19 +86,17 @@ define <vscale x 4 x i16> @test_signed_v4f32_v4i16(<vscale x 4 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #-956301312 // =0xc7000000
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z2.s, #-32768 // =0xffffffffffff8000
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mov w8, #65024 // =0xfe00
 ; CHECK-NEXT:    movk w8, #18175, lsl #16
-; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.s, p0/m, z0.s
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z2.s
-; CHECK-NEXT:    mov z2.s, #32767 // =0x7fff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    mov z1.s, #32767 // =0x7fff
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    mov z1.s, p1/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    sel z0.s, p2, z2.s, z1.s
+; CHECK-NEXT:    fcvtzs z2.s, p1/m, z0.s
+; CHECK-NEXT:    sel z0.s, p2, z1.s, z2.s
 ; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4f32.nxv4i16(<vscale x 4 x float> %f)
@@ -127,40 +106,28 @@ define <vscale x 4 x i16> @test_signed_v4f32_v4i16(<vscale x 4 x float> %f) {
 define <vscale x 8 x i16> @test_signed_v8f32_v8i16(<vscale x 8 x float> %f) {
 ; CHECK-LABEL: test_signed_v8f32_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov w8, #-956301312 // =0xc7000000
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z5.s, #32767 // =0x7fff
+; CHECK-NEXT:    mov z3.s, #-32768 // =0xffffffffffff8000
 ; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    mov w8, #65024 // =0xfe00
+; CHECK-NEXT:    mov z5.s, #32767 // =0x7fff
 ; CHECK-NEXT:    movk w8, #18175, lsl #16
-; CHECK-NEXT:    movprfx z3, z1
-; CHECK-NEXT:    fcvtzs z3.s, p0/m, z1.s
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzs z4.s, p0/m, z0.s
+; CHECK-NEXT:    mov z4.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z1.s, z2.s
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z2.s
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z1.s, z2.s
-; CHECK-NEXT:    fcmgt p4.s, p0/z, z0.s, z2.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    mov z3.s, p1/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmuo p1.s, p0/z, z1.s, z1.s
+; CHECK-NEXT:    mov z2.s, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z0.s, z4.s
+; CHECK-NEXT:    fcvtzs z3.s, p1/m, z1.s
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z1.s, z4.s
+; CHECK-NEXT:    fcvtzs z2.s, p2/m, z0.s
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z1.s, z1.s
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    mov z4.s, p2/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    sel z0.s, p3, z5.s, z3.s
-; CHECK-NEXT:    sel z1.s, p4, z5.s, z4.s
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.s, p1, z5.s, z3.s
+; CHECK-NEXT:    sel z1.s, p3, z5.s, z2.s
+; CHECK-NEXT:    mov z0.s, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i16> @llvm.fptosi.sat.nxv8f32.nxv8i16(<vscale x 8 x float> %f)
     ret <vscale x 8 x i16> %x
@@ -171,19 +138,16 @@ define <vscale x 2 x i64> @test_signed_v2f32_v2i64(<vscale x 2 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
-; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.s
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z3.s
-; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.s
+; CHECK-NEXT:    sel z0.d, p2, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i64> @llvm.fptosi.sat.nxv2f32.nxv2i64(<vscale x 2 x float> %f)
@@ -193,41 +157,28 @@ define <vscale x 2 x i64> @test_signed_v2f32_v2i64(<vscale x 2 x float> %f) {
 define <vscale x 4 x i64> @test_signed_v4f32_v4i64(<vscale x 4 x float> %f) {
 ; CHECK-LABEL: test_signed_v4f32_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
 ; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
 ; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
-; CHECK-NEXT:    mov z3.s, w8
-; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
+; CHECK-NEXT:    mov z4.d, #0x8000000000000000
+; CHECK-NEXT:    mov z5.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z1.s, z2.s
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z2.s
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
-; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z1.s
-; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.s
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z1.s, z3.s
-; CHECK-NEXT:    fcmgt p4.s, p0/z, z0.s, z3.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    sel z3.d, p1, z2.d, z4.d
-; CHECK-NEXT:    fcmuo p1.s, p0/z, z1.s, z1.s
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z0.s, z5.s
+; CHECK-NEXT:    fcvtzs z3.d, p1/m, z1.s
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z1.s, z5.s
+; CHECK-NEXT:    fcvtzs z4.d, p2/m, z0.s
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z1.s, z1.s
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    sel z2.d, p2, z2.d, z5.d
-; CHECK-NEXT:    sel z0.d, p3, z6.d, z3.d
-; CHECK-NEXT:    sel z1.d, p4, z6.d, z2.d
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z3.d
+; CHECK-NEXT:    sel z1.d, p3, z2.d, z4.d
+; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i64> @llvm.fptosi.sat.nxv4f32.nxv4i64(<vscale x 4 x float> %f)
     ret <vscale x 4 x i64> %x
@@ -248,20 +199,17 @@ define <vscale x 2 x i32> @test_signed_v2f64_v2i32(<vscale x 2 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov x8, #-4476578029606273024 // =0xc1e0000000000000
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z2.d, #0xffffffff80000000
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mov x8, #281474972516352 // =0xffffffc00000
 ; CHECK-NEXT:    movk x8, #16863, lsl #48
-; CHECK-NEXT:    mov z3.d, x8
+; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z3.d
-; CHECK-NEXT:    mov z3.d, #0x7fffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0xffffffff80000000
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z2.d
+; CHECK-NEXT:    mov z2.d, #0x7fffffff
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.d
+; CHECK-NEXT:    sel z0.d, p2, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i32> @llvm.fptosi.sat.nxv2f64.nxv2i32(<vscale x 2 x double> %f)
@@ -271,41 +219,28 @@ define <vscale x 2 x i32> @test_signed_v2f64_v2i32(<vscale x 2 x double> %f) {
 define <vscale x 4 x i32> @test_signed_v4f64_v4i32(<vscale x 4 x double> %f) {
 ; CHECK-LABEL: test_signed_v4f64_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov x8, #-4476578029606273024 // =0xc1e0000000000000
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z6.d, #0x7fffffff
+; CHECK-NEXT:    mov z3.d, #0xffffffff80000000
 ; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    mov x8, #281474972516352 // =0xffffffc00000
+; CHECK-NEXT:    mov z4.d, #0xffffffff80000000
 ; CHECK-NEXT:    movk x8, #16863, lsl #48
-; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z1.d
-; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.d
-; CHECK-NEXT:    mov z3.d, x8
+; CHECK-NEXT:    mov z5.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, z2.d
-; CHECK-NEXT:    mov z2.d, #0xffffffff80000000
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z3.d
-; CHECK-NEXT:    fcmgt p4.d, p0/z, z0.d, z3.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    sel z3.d, p1, z2.d, z4.d
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    mov z2.d, #0x7fffffff
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z0.d, z5.d
+; CHECK-NEXT:    fcvtzs z3.d, p1/m, z1.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z5.d
+; CHECK-NEXT:    fcvtzs z4.d, p2/m, z0.d
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z1.d, z1.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    sel z2.d, p2, z2.d, z5.d
-; CHECK-NEXT:    sel z0.d, p3, z6.d, z3.d
-; CHECK-NEXT:    sel z1.d, p4, z6.d, z2.d
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z3.d
+; CHECK-NEXT:    sel z1.d, p3, z2.d, z4.d
+; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i32> @llvm.fptosi.sat.nxv4f64.nxv4i32(<vscale x 4 x double> %f)
     ret <vscale x 4 x i32> %x
@@ -316,7 +251,6 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
@@ -327,48 +261,38 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
 ; CHECK-NEXT:    mov z5.d, #0xffffffff80000000
 ; CHECK-NEXT:    mov z4.d, x8
 ; CHECK-NEXT:    mov x8, #281474972516352 // =0xffffffc00000
-; CHECK-NEXT:    mov z26.d, #0x7fffffff
+; CHECK-NEXT:    mov z6.d, #0xffffffff80000000
 ; CHECK-NEXT:    movk x8, #16863, lsl #48
-; CHECK-NEXT:    movprfx z7, z0
-; CHECK-NEXT:    fcvtzs z7.d, p0/m, z0.d
-; CHECK-NEXT:    movprfx z24, z3
-; CHECK-NEXT:    fcvtzs z24.d, p0/m, z3.d
-; CHECK-NEXT:    mov z6.d, x8
-; CHECK-NEXT:    movprfx z25, z2
-; CHECK-NEXT:    fcvtzs z25.d, p0/m, z2.d
+; CHECK-NEXT:    mov z7.d, #0xffffffff80000000
+; CHECK-NEXT:    mov z24.d, #0xffffffff80000000
+; CHECK-NEXT:    mov z25.d, x8
+; CHECK-NEXT:    fcmuo p6.d, p0/z, z0.d, z0.d
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, z4.d
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    fcmge p3.d, p0/z, z3.d, z4.d
 ; CHECK-NEXT:    fcmge p4.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z1.d
-; CHECK-NEXT:    fcmgt p5.d, p0/z, z1.d, z6.d
-; CHECK-NEXT:    fcmgt p6.d, p0/z, z0.d, z6.d
-; CHECK-NEXT:    fcmgt p7.d, p0/z, z3.d, z6.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    mov z4.d, p1/m, z5.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z2.d, z6.d
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    sel z6.d, p2, z5.d, z7.d
-; CHECK-NEXT:    fcmuo p2.d, p0/z, z1.d, z1.d
-; CHECK-NEXT:    sel z7.d, p3, z5.d, z24.d
-; CHECK-NEXT:    fcmuo p3.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    sel z5.d, p4, z5.d, z25.d
-; CHECK-NEXT:    fcmuo p4.d, p0/z, z3.d, z3.d
+; CHECK-NEXT:    mov z4.d, #0x7fffffff
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z2.d, z25.d
+; CHECK-NEXT:    fcvtzs z5.d, p1/m, z1.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z25.d
+; CHECK-NEXT:    fcvtzs z6.d, p2/m, z0.d
+; CHECK-NEXT:    fcvtzs z7.d, p3/m, z3.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z25.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z3.d, z25.d
+; CHECK-NEXT:    fcvtzs z24.d, p4/m, z2.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z4.d, z5.d
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z3.d, z3.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z2.d, z2.d
-; CHECK-NEXT:    sel z0.d, p5, z26.d, z4.d
-; CHECK-NEXT:    sel z1.d, p6, z26.d, z6.d
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z2.d, p7, z26.d, z7.d
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z3.d, p1, z26.d, z5.d
+; CHECK-NEXT:    sel z1.d, p2, z4.d, z6.d
+; CHECK-NEXT:    sel z2.d, p3, z4.d, z7.d
+; CHECK-NEXT:    sel z3.d, p5, z4.d, z24.d
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p4/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
 ; CHECK-NEXT:    uzp1 z1.s, z3.s, z2.s
@@ -382,40 +306,28 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
 define <vscale x 4 x i16> @test_signed_v4f64_v4i16(<vscale x 4 x double> %f) {
 ; CHECK-LABEL: test_signed_v4f64_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov x8, #-4548635623644200960 // =0xc0e0000000000000
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z5.d, #32767 // =0x7fff
+; CHECK-NEXT:    mov z3.d, #-32768 // =0xffffffffffff8000
 ; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    mov x8, #281200098803712 // =0xffc000000000
+; CHECK-NEXT:    mov z5.d, #32767 // =0x7fff
 ; CHECK-NEXT:    movk x8, #16607, lsl #48
-; CHECK-NEXT:    movprfx z3, z1
-; CHECK-NEXT:    fcvtzs z3.d, p0/m, z1.d
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z0.d
+; CHECK-NEXT:    mov z4.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, z2.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z2.d
-; CHECK-NEXT:    fcmgt p4.d, p0/z, z0.d, z2.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    mov z3.d, p1/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    mov z2.d, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z0.d, z4.d
+; CHECK-NEXT:    fcvtzs z3.d, p1/m, z1.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z4.d
+; CHECK-NEXT:    fcvtzs z2.d, p2/m, z0.d
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z1.d, z1.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    mov z4.d, p2/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    sel z0.d, p3, z5.d, z3.d
-; CHECK-NEXT:    sel z1.d, p4, z5.d, z4.d
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p1, z5.d, z3.d
+; CHECK-NEXT:    sel z1.d, p3, z5.d, z2.d
+; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4f64.nxv4i16(<vscale x 4 x double> %f)
     ret <vscale x 4 x i16> %x
@@ -426,7 +338,6 @@ define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
@@ -434,50 +345,41 @@ define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov x8, #-4548635623644200960 // =0xc0e0000000000000
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z25.d, #32767 // =0x7fff
+; CHECK-NEXT:    mov z5.d, #-32768 // =0xffffffffffff8000
 ; CHECK-NEXT:    mov z4.d, x8
 ; CHECK-NEXT:    mov x8, #281200098803712 // =0xffc000000000
+; CHECK-NEXT:    mov z6.d, #-32768 // =0xffffffffffff8000
 ; CHECK-NEXT:    movk x8, #16607, lsl #48
-; CHECK-NEXT:    movprfx z6, z2
-; CHECK-NEXT:    fcvtzs z6.d, p0/m, z2.d
-; CHECK-NEXT:    movprfx z7, z1
-; CHECK-NEXT:    fcvtzs z7.d, p0/m, z1.d
-; CHECK-NEXT:    mov z5.d, x8
-; CHECK-NEXT:    movprfx z24, z0
-; CHECK-NEXT:    fcvtzs z24.d, p0/m, z0.d
+; CHECK-NEXT:    mov z7.d, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    mov z25.d, #32767 // =0x7fff
+; CHECK-NEXT:    mov z24.d, x8
+; CHECK-NEXT:    fcmuo p6.d, p0/z, z2.d, z2.d
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z3.d, z4.d
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z2.d, z4.d
 ; CHECK-NEXT:    fcmge p3.d, p0/z, z1.d, z4.d
 ; CHECK-NEXT:    fcmge p4.d, p0/z, z0.d, z4.d
-; CHECK-NEXT:    movprfx z4, z3
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z3.d
-; CHECK-NEXT:    fcmgt p5.d, p0/z, z3.d, z5.d
-; CHECK-NEXT:    fcmgt p6.d, p0/z, z2.d, z5.d
-; CHECK-NEXT:    fcmgt p7.d, p0/z, z1.d, z5.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    mov z4.d, p1/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z5.d
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    mov z6.d, p2/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmuo p2.d, p0/z, z3.d, z3.d
-; CHECK-NEXT:    mov z7.d, p3/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmuo p3.d, p0/z, z2.d, z2.d
-; CHECK-NEXT:    mov z24.d, p4/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmuo p4.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    mov z4.d, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z0.d, z24.d
+; CHECK-NEXT:    fcvtzs z5.d, p1/m, z3.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z3.d, z24.d
+; CHECK-NEXT:    fcvtzs z6.d, p2/m, z2.d
+; CHECK-NEXT:    fcvtzs z7.d, p3/m, z1.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z2.d, z24.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z24.d
+; CHECK-NEXT:    fcvtzs z4.d, p4/m, z0.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z3.d, z3.d
+; CHECK-NEXT:    sel z2.d, p1, z25.d, z5.d
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z1.d, z1.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    sel z2.d, p5, z25.d, z4.d
-; CHECK-NEXT:    sel z0.d, p6, z25.d, z6.d
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z1.d, p7, z25.d, z7.d
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z3.d, p1, z25.d, z24.d
+; CHECK-NEXT:    sel z0.d, p2, z25.d, z6.d
+; CHECK-NEXT:    sel z1.d, p3, z25.d, z7.d
+; CHECK-NEXT:    sel z3.d, p5, z25.d, z4.d
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z2.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
 ; CHECK-NEXT:    uzp1 z1.s, z3.s, z1.s
@@ -494,19 +396,16 @@ define <vscale x 2 x i64> @test_signed_v2f64_v2i64(<vscale x 2 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
-; CHECK-NEXT:    mov z3.d, x8
+; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z3.d
-; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z2.d
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.d
+; CHECK-NEXT:    sel z0.d, p2, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i64> @llvm.fptosi.sat.nxv2f64.nxv2i64(<vscale x 2 x double> %f)
@@ -516,39 +415,26 @@ define <vscale x 2 x i64> @test_signed_v2f64_v2i64(<vscale x 2 x double> %f) {
 define <vscale x 4 x i64> @test_signed_v4f64_v4i64(<vscale x 4 x double> %f) {
 ; CHECK-LABEL: test_signed_v4f64_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
-; CHECK-NEXT:    mov z3.d, x8
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z0.d
-; CHECK-NEXT:    movprfx z5, z1
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z1.d
+; CHECK-NEXT:    mov z4.d, #0x8000000000000000
+; CHECK-NEXT:    mov z5.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z2.d
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, z2.d
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z0.d, z3.d
-; CHECK-NEXT:    fcmgt p4.d, p0/z, z1.d, z3.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    sel z3.d, p1, z2.d, z4.d
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z5.d
+; CHECK-NEXT:    fcvtzs z3.d, p1/m, z0.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z5.d
+; CHECK-NEXT:    fcvtzs z4.d, p2/m, z1.d
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z0.d, z0.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z1.d, z1.d
-; CHECK-NEXT:    sel z2.d, p2, z2.d, z5.d
-; CHECK-NEXT:    sel z0.d, p3, z6.d, z3.d
-; CHECK-NEXT:    sel z1.d, p4, z6.d, z2.d
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z3.d
+; CHECK-NEXT:    sel z1.d, p3, z2.d, z4.d
+; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i64> @llvm.fptosi.sat.nxv4f64.nxv4i64(<vscale x 4 x double> %f)
     ret <vscale x 4 x i64> %x
@@ -570,19 +456,16 @@ define <vscale x 2 x i32> @test_signed_v2f16_v2i32(<vscale x 2 x half> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #64511 // =0xfbff
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z2.d, #0xffffffff80000000
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    mov z3.h, w8
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.h
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z3.h
-; CHECK-NEXT:    mov z3.d, #0x7fffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0xffffffff80000000
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    mov z2.d, #0x7fffffff
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.h
+; CHECK-NEXT:    sel z0.d, p2, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i32> @llvm.fptosi.sat.nxv2f16.nxv2i32(<vscale x 2 x half> %f)
@@ -594,19 +477,16 @@ define <vscale x 4 x i32> @test_signed_v4f16_v4i32(<vscale x 4 x half> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #64511 // =0xfbff
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z2.s, #0x80000000
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    mov z3.h, w8
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.s, p0/m, z0.h
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z3.h
-; CHECK-NEXT:    mov z3.s, #0x7fffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.s, #0x80000000
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    mov z2.s, #0x7fffffff
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    mov z1.s, p1/m, z2.s
-; CHECK-NEXT:    sel z0.s, p2, z3.s, z1.s
+; CHECK-NEXT:    fcvtzs z1.s, p1/m, z0.h
+; CHECK-NEXT:    sel z0.s, p2, z2.s, z1.s
 ; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i32> @llvm.fptosi.sat.nxv4f16.nxv4i32(<vscale x 4 x half> %f)
@@ -616,41 +496,28 @@ define <vscale x 4 x i32> @test_signed_v4f16_v4i32(<vscale x 4 x half> %f) {
 define <vscale x 8 x i32> @test_signed_v8f16_v8i32(<vscale x 8 x half> %f) {
 ; CHECK-LABEL: test_signed_v8f16_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov w8, #64511 // =0xfbff
 ; CHECK-NEXT:    uunpklo z1.s, z0.h
 ; CHECK-NEXT:    uunpkhi z0.s, z0.h
 ; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    mov z3.h, w8
-; CHECK-NEXT:    mov z6.s, #0x7fffffff
+; CHECK-NEXT:    mov z3.s, #0x80000000
+; CHECK-NEXT:    mov z4.s, #0x80000000
+; CHECK-NEXT:    mov z5.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z1.h, z2.h
 ; CHECK-NEXT:    fcmge p2.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z2.s, #0x80000000
-; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    fcvtzs z4.s, p0/m, z1.h
-; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    fcvtzs z5.s, p0/m, z0.h
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z1.h, z3.h
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z0.h, z3.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    sel z3.s, p1, z2.s, z4.s
-; CHECK-NEXT:    fcmuo p1.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    mov z2.s, #0x7fffffff
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z0.h, z5.h
+; CHECK-NEXT:    fcvtzs z3.s, p1/m, z1.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z5.h
+; CHECK-NEXT:    fcvtzs z4.s, p2/m, z0.h
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z1.h, z1.h
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    sel z2.s, p2, z2.s, z5.s
-; CHECK-NEXT:    sel z0.s, p3, z6.s, z3.s
-; CHECK-NEXT:    sel z1.s, p4, z6.s, z2.s
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.s, p1, z2.s, z3.s
+; CHECK-NEXT:    sel z1.s, p3, z2.s, z4.s
+; CHECK-NEXT:    mov z0.s, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.s, p0/m, #0 // =0x0
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i32> @llvm.fptosi.sat.nxv8f16.nxv8i32(<vscale x 8 x half> %f)
     ret <vscale x 8 x i32> %x
@@ -661,18 +528,16 @@ define <vscale x 4 x i16> @test_signed_v4f16_v4i16(<vscale x 4 x half> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #63488 // =0xf800
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z2.s, #-32768 // =0xffffffffffff8000
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #30719 // =0x77ff
-; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.s, p0/m, z0.h
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z2.s, #32767 // =0x7fff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z1.s, #32767 // =0x7fff
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    mov z1.s, p1/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    sel z0.s, p2, z2.s, z1.s
+; CHECK-NEXT:    fcvtzs z2.s, p1/m, z0.h
+; CHECK-NEXT:    sel z0.s, p2, z1.s, z2.s
 ; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4f16.nxv4i16(<vscale x 4 x half> %f)
@@ -684,18 +549,16 @@ define <vscale x 8 x i16> @test_signed_v8f16_v8i16(<vscale x 8 x half> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #63488 // =0xf800
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z2.h, #-32768 // =0xffffffffffff8000
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #30719 // =0x77ff
-; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.h, p0/m, z0.h
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z2.h, #32767 // =0x7fff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z1.h, #32767 // =0x7fff
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    mov z1.h, p1/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    sel z0.h, p2, z2.h, z1.h
+; CHECK-NEXT:    fcvtzs z2.h, p1/m, z0.h
+; CHECK-NEXT:    sel z0.h, p2, z1.h, z2.h
 ; CHECK-NEXT:    mov z0.h, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i16> @llvm.fptosi.sat.nxv8f16.nxv8i16(<vscale x 8 x half> %f)
@@ -707,19 +570,16 @@ define <vscale x 2 x i64> @test_signed_v2f16_v2i64(<vscale x 2 x half> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #64511 // =0xfbff
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    mov z3.h, w8
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.h
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z3.h
-; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.h
+; CHECK-NEXT:    sel z0.d, p2, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i64> @llvm.fptosi.sat.nxv2f16.nxv2i64(<vscale x 2 x half> %f)
@@ -729,41 +589,28 @@ define <vscale x 2 x i64> @test_signed_v2f16_v2i64(<vscale x 2 x half> %f) {
 define <vscale x 4 x i64> @test_signed_v4f16_v4i64(<vscale x 4 x half> %f) {
 ; CHECK-LABEL: test_signed_v4f16_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov w8, #64511 // =0xfbff
 ; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
 ; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    mov z3.h, w8
-; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
+; CHECK-NEXT:    mov z4.d, #0x8000000000000000
+; CHECK-NEXT:    mov z5.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z1.h, z2.h
 ; CHECK-NEXT:    fcmge p2.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
-; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z1.h
-; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.h
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z1.h, z3.h
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z0.h, z3.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    sel z3.d, p1, z2.d, z4.d
-; CHECK-NEXT:    fcmuo p1.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z0.h, z5.h
+; CHECK-NEXT:    fcvtzs z3.d, p1/m, z1.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z5.h
+; CHECK-NEXT:    fcvtzs z4.d, p2/m, z0.h
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z1.h, z1.h
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    sel z2.d, p2, z2.d, z5.d
-; CHECK-NEXT:    sel z0.d, p3, z6.d, z3.d
-; CHECK-NEXT:    sel z1.d, p4, z6.d, z2.d
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z3.d
+; CHECK-NEXT:    sel z1.d, p3, z2.d, z4.d
+; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i64> @llvm.fptosi.sat.nxv4f16.nxv4i64(<vscale x 4 x half> %f)
     ret <vscale x 4 x i64> %x
diff --git a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll
index ed352ffec339f..b3aefb8460985 100644
--- a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll
@@ -16,15 +16,13 @@ define <vscale x 2 x i32> @test_signed_v2f32_v2i32(<vscale x 2 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #1333788671 // =0x4f7fffff
-; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    fcvtzu z2.d, p0/m, z0.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    fcvtzu z1.d, p1/m, z0.s
 ; CHECK-NEXT:    mov z0.d, #0xffffffff
-; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z2.d
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i32> @llvm.fptoui.sat.nxv2f32.nxv2i32(<vscale x 2 x float> %f)
     ret <vscale x 2 x i32> %x
@@ -35,13 +33,11 @@ define <vscale x 4 x i32> @test_signed_v4f32_v4i32(<vscale x 4 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, #1333788671 // =0x4f7fffff
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzu z1.s, p0/m, z0.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z2.s
-; CHECK-NEXT:    mov z1.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcvtzu z1.s, p1/m, z0.s
 ; CHECK-NEXT:    mov z1.s, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -53,21 +49,17 @@ define <vscale x 8 x i32> @test_signed_v8f32_v8i32(<vscale x 8 x float> %f) {
 ; CHECK-LABEL: test_signed_v8f32_v8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    mov w8, #1333788671 // =0x4f7fffff
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    mov z4.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z1.s, #0.0
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    fcvtzu z2.s, p0/m, z0.s
-; CHECK-NEXT:    movprfx z3, z1
-; CHECK-NEXT:    fcvtzu z3.s, p0/m, z1.s
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z0.s, z4.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    fcvtzu z2.s, p1/m, z0.s
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z0.s, z4.s
 ; CHECK-NEXT:    fcmgt p0.s, p0/z, z1.s, z4.s
-; CHECK-NEXT:    mov z2.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.s, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.s, p3/m, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    fcvtzu z3.s, p2/m, z1.s
+; CHECK-NEXT:    mov z2.s, p1/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z3.s, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    mov z1.d, z3.d
@@ -81,16 +73,14 @@ define <vscale x 4 x i16> @test_signed_v4f32_v4i16(<vscale x 4 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, #65280 // =0xff00
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    movk w8, #18303, lsl #16
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    mov z1.s, w8
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    fcvtzu z2.s, p0/m, z0.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    fcvtzu z1.s, p1/m, z0.s
 ; CHECK-NEXT:    mov z0.s, #65535 // =0xffff
-; CHECK-NEXT:    mov z2.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z2.s
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i16> @llvm.fptoui.sat.nxv4f32.nxv4i16(<vscale x 4 x float> %f)
     ret <vscale x 4 x i16> %x
@@ -101,24 +91,20 @@ define <vscale x 8 x i16> @test_signed_v8f32_v8i16(<vscale x 8 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, #65280 // =0xff00
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    movk w8, #18303, lsl #16
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z1.s, #0.0
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    movprfx z3, z1
-; CHECK-NEXT:    fcvtzu z3.s, p0/m, z1.s
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzu z4.s, p0/m, z0.s
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z1.s, z2.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z2.s
-; CHECK-NEXT:    mov z0.s, #65535 // =0xffff
-; CHECK-NEXT:    mov z3.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z4.s, p2/m, #0 // =0x0
-; CHECK-NEXT:    sel z1.s, p3, z0.s, z3.s
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z4.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    mov z4.s, w8
+; CHECK-NEXT:    fcvtzu z2.s, p1/m, z1.s
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z1.s, z4.s
+; CHECK-NEXT:    mov z1.s, #65535 // =0xffff
+; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z4.s
+; CHECK-NEXT:    fcvtzu z3.s, p2/m, z0.s
+; CHECK-NEXT:    sel z0.s, p1, z1.s, z2.s
+; CHECK-NEXT:    sel z1.s, p0, z1.s, z3.s
+; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i16> @llvm.fptoui.sat.nxv8f32.nxv8i16(<vscale x 8 x float> %f)
     ret <vscale x 8 x i16> %x
@@ -129,13 +115,11 @@ define <vscale x 2 x i64> @test_signed_v2f32_v2i64(<vscale x 2 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #1602224127 // =0x5f7fffff
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzu z1.d, p0/m, z0.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z2.s
-; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcvtzu z1.d, p1/m, z0.s
 ; CHECK-NEXT:    mov z1.d, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -150,20 +134,16 @@ define <vscale x 4 x i64> @test_signed_v4f32_v4i64(<vscale x 4 x float> %f) {
 ; CHECK-NEXT:    uunpkhi z3.d, z0.s
 ; CHECK-NEXT:    mov w8, #1602224127 // =0x5f7fffff
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    mov z4.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z2.s, #0.0
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z3.s, #0.0
-; CHECK-NEXT:    movprfx z0, z2
-; CHECK-NEXT:    fcvtzu z0.d, p0/m, z2.s
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    fcvtzu z1.d, p0/m, z3.s
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z2.s, z4.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    fcvtzu z0.d, p1/m, z2.s
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z2.s, z4.s
 ; CHECK-NEXT:    fcmgt p0.s, p0/z, z3.s, z4.s
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p3/m, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    fcvtzu z1.d, p2/m, z3.s
+; CHECK-NEXT:    mov z0.d, p1/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.d, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i64> @llvm.fptoui.sat.nxv4f32.nxv4i64(<vscale x 4 x float> %f)
@@ -185,16 +165,14 @@ define <vscale x 2 x i32> @test_signed_v2f64_v2i32(<vscale x 2 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #281474974613504 // =0xffffffe00000
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    movk x8, #16879, lsl #48
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    mov z1.d, x8
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    fcvtzu z2.d, p0/m, z0.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z2.d
+; CHECK-NEXT:    fcvtzu z1.d, p1/m, z0.d
 ; CHECK-NEXT:    mov z0.d, #0xffffffff
-; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z2.d
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i32> @llvm.fptoui.sat.nxv2f64.nxv2i32(<vscale x 2 x double> %f)
     ret <vscale x 2 x i32> %x
@@ -205,24 +183,20 @@ define <vscale x 4 x i32> @test_signed_v4f64_v4i32(<vscale x 4 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #281474974613504 // =0xffffffe00000
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    movk x8, #16879, lsl #48
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, #0.0
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    movprfx z3, z1
-; CHECK-NEXT:    fcvtzu z3.d, p0/m, z1.d
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzu z4.d, p0/m, z0.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z2.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z2.d
-; CHECK-NEXT:    mov z0.d, #0xffffffff
-; CHECK-NEXT:    mov z3.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z4.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    sel z1.d, p3, z0.d, z3.d
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z4.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    fcvtzu z2.d, p1/m, z1.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z4.d
+; CHECK-NEXT:    mov z1.d, #0xffffffff
+; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z4.d
+; CHECK-NEXT:    fcvtzu z3.d, p2/m, z0.d
+; CHECK-NEXT:    sel z0.d, p1, z1.d, z2.d
+; CHECK-NEXT:    sel z1.d, p0, z1.d, z3.d
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i32> @llvm.fptoui.sat.nxv4f64.nxv4i32(<vscale x 4 x double> %f)
     ret <vscale x 4 x i32> %x
@@ -233,47 +207,35 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #281474974613504 // =0xffffffe00000
+; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-NEXT:    movi v6.2d, #0000000000000000
 ; CHECK-NEXT:    movk x8, #16879, lsl #48
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, #0.0
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, #0.0
 ; CHECK-NEXT:    fcmge p3.d, p0/z, z3.d, #0.0
+; CHECK-NEXT:    movi v7.2d, #0000000000000000
 ; CHECK-NEXT:    fcmge p4.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    movprfx z5, z1
-; CHECK-NEXT:    fcvtzu z5.d, p0/m, z1.d
-; CHECK-NEXT:    mov z4.d, x8
-; CHECK-NEXT:    movprfx z6, z0
-; CHECK-NEXT:    fcvtzu z6.d, p0/m, z0.d
-; CHECK-NEXT:    movprfx z7, z3
-; CHECK-NEXT:    fcvtzu z7.d, p0/m, z3.d
-; CHECK-NEXT:    movprfx z24, z2
-; CHECK-NEXT:    fcvtzu z24.d, p0/m, z2.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p5.d, p0/z, z1.d, z4.d
-; CHECK-NEXT:    fcmgt p6.d, p0/z, z0.d, z4.d
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    mov z24.d, x8
+; CHECK-NEXT:    fcvtzu z4.d, p1/m, z1.d
+; CHECK-NEXT:    fcvtzu z5.d, p2/m, z0.d
+; CHECK-NEXT:    fcvtzu z6.d, p3/m, z3.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z24.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z24.d
 ; CHECK-NEXT:    mov z0.d, #0xffffffff
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    mov z5.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z3.d, z4.d
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    fcmgt p0.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    mov z6.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z7.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z24.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    sel z1.d, p5, z0.d, z5.d
-; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z2.d, p6, z0.d, z6.d
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z3.d, p1, z0.d, z7.d
+; CHECK-NEXT:    fcvtzu z7.d, p4/m, z2.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z3.d, z24.d
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z4.d, p0, z0.d, z24.d
+; CHECK-NEXT:    fcmgt p0.d, p0/z, z2.d, z24.d
+; CHECK-NEXT:    sel z1.d, p1, z0.d, z4.d
+; CHECK-NEXT:    sel z2.d, p2, z0.d, z5.d
+; CHECK-NEXT:    sel z3.d, p3, z0.d, z6.d
+; CHECK-NEXT:    sel z4.d, p0, z0.d, z7.d
 ; CHECK-NEXT:    uzp1 z0.s, z2.s, z1.s
 ; CHECK-NEXT:    uzp1 z1.s, z4.s, z3.s
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -288,24 +250,20 @@ define <vscale x 4 x i16> @test_signed_v4f64_v4i16(<vscale x 4 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #281337537757184 // =0xffe000000000
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    movk x8, #16623, lsl #48
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, #0.0
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    movprfx z3, z1
-; CHECK-NEXT:    fcvtzu z3.d, p0/m, z1.d
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzu z4.d, p0/m, z0.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z2.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z2.d
-; CHECK-NEXT:    mov z0.d, #65535 // =0xffff
-; CHECK-NEXT:    mov z3.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z4.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    sel z1.d, p3, z0.d, z3.d
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z4.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    fcvtzu z2.d, p1/m, z1.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z4.d
+; CHECK-NEXT:    mov z1.d, #65535 // =0xffff
+; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z4.d
+; CHECK-NEXT:    fcvtzu z3.d, p2/m, z0.d
+; CHECK-NEXT:    sel z0.d, p1, z1.d, z2.d
+; CHECK-NEXT:    sel z1.d, p0, z1.d, z3.d
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i16> @llvm.fptoui.sat.nxv4f64.nxv4i16(<vscale x 4 x double> %f)
     ret <vscale x 4 x i16> %x
@@ -316,47 +274,35 @@ define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #281337537757184 // =0xffe000000000
+; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-NEXT:    movi v6.2d, #0000000000000000
 ; CHECK-NEXT:    movk x8, #16623, lsl #48
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z3.d, #0.0
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z2.d, #0.0
 ; CHECK-NEXT:    fcmge p3.d, p0/z, z1.d, #0.0
+; CHECK-NEXT:    movi v7.2d, #0000000000000000
 ; CHECK-NEXT:    fcmge p4.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    movprfx z5, z3
-; CHECK-NEXT:    fcvtzu z5.d, p0/m, z3.d
-; CHECK-NEXT:    mov z4.d, x8
-; CHECK-NEXT:    movprfx z6, z2
-; CHECK-NEXT:    fcvtzu z6.d, p0/m, z2.d
-; CHECK-NEXT:    movprfx z7, z1
-; CHECK-NEXT:    fcvtzu z7.d, p0/m, z1.d
-; CHECK-NEXT:    movprfx z24, z0
-; CHECK-NEXT:    fcvtzu z24.d, p0/m, z0.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p5.d, p0/z, z3.d, z4.d
-; CHECK-NEXT:    fcmgt p6.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    mov z24.d, x8
+; CHECK-NEXT:    fcvtzu z4.d, p1/m, z3.d
+; CHECK-NEXT:    fcvtzu z5.d, p2/m, z2.d
+; CHECK-NEXT:    fcvtzu z6.d, p3/m, z1.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z3.d, z24.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z2.d, z24.d
 ; CHECK-NEXT:    mov z2.d, #65535 // =0xffff
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    mov z5.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z4.d
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z4.d
-; CHECK-NEXT:    mov z6.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z7.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z24.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    sel z0.d, p5, z2.d, z5.d
-; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z1.d, p6, z2.d, z6.d
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z3.d, p1, z2.d, z7.d
+; CHECK-NEXT:    fcvtzu z7.d, p4/m, z0.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z24.d
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z2.d, p0, z2.d, z24.d
+; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z24.d
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z4.d
+; CHECK-NEXT:    sel z1.d, p2, z2.d, z5.d
+; CHECK-NEXT:    sel z3.d, p3, z2.d, z6.d
+; CHECK-NEXT:    sel z2.d, p0, z2.d, z7.d
 ; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
 ; CHECK-NEXT:    uzp1 z1.s, z2.s, z3.s
 ; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
@@ -372,13 +318,11 @@ define <vscale x 2 x i64> @test_signed_v2f64_v2i64(<vscale x 2 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #4895412794951729151 // =0x43efffffffffffff
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzu z1.d, p0/m, z0.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z2.d
-; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcvtzu z1.d, p1/m, z0.d
 ; CHECK-NEXT:    mov z1.d, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -390,21 +334,17 @@ define <vscale x 4 x i64> @test_signed_v4f64_v4i64(<vscale x 4 x double> %f) {
 ; CHECK-LABEL: test_signed_v4f64_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    mov x8, #4895412794951729151 // =0x43efffffffffffff
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    mov z4.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, #0.0
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    fcvtzu z2.d, p0/m, z0.d
-; CHECK-NEXT:    movprfx z3, z1
-; CHECK-NEXT:    fcvtzu z3.d, p0/m, z1.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z0.d, z4.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    fcvtzu z2.d, p1/m, z0.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    fcmgt p0.d, p0/z, z1.d, z4.d
-; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.d, p3/m, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    fcvtzu z3.d, p2/m, z1.d
+; CHECK-NEXT:    mov z2.d, p1/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z3.d, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    mov z1.d, z3.d
@@ -429,15 +369,13 @@ define <vscale x 2 x i32> @test_signed_v2f16_v2i32(<vscale x 2 x half> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    fcvtzu z2.d, p0/m, z0.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    fcvtzu z1.d, p1/m, z0.h
 ; CHECK-NEXT:    mov z0.d, #0xffffffff
-; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z2.d
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i32> @llvm.fptoui.sat.nxv2f16.nxv2i32(<vscale x 2 x half> %f)
     ret <vscale x 2 x i32> %x
@@ -448,13 +386,11 @@ define <vscale x 4 x i32> @test_signed_v4f16_v4i32(<vscale x 4 x half> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzu z1.s, p0/m, z0.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z1.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcvtzu z1.s, p1/m, z0.h
 ; CHECK-NEXT:    mov z1.s, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -469,20 +405,16 @@ define <vscale x 8 x i32> @test_signed_v8f16_v8i32(<vscale x 8 x half> %f) {
 ; CHECK-NEXT:    uunpkhi z3.s, z0.h
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    mov z4.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z2.h, #0.0
 ; CHECK-NEXT:    fcmge p2.h, p0/z, z3.h, #0.0
-; CHECK-NEXT:    movprfx z0, z2
-; CHECK-NEXT:    fcvtzu z0.s, p0/m, z2.h
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    fcvtzu z1.s, p0/m, z3.h
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z2.h, z4.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    fcvtzu z0.s, p1/m, z2.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z2.h, z4.h
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z3.h, z4.h
-; CHECK-NEXT:    mov z0.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.s, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.s, p3/m, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    fcvtzu z1.s, p2/m, z3.h
+; CHECK-NEXT:    mov z0.s, p1/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.s, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i32> @llvm.fptoui.sat.nxv8f16.nxv8i32(<vscale x 8 x half> %f)
@@ -494,15 +426,13 @@ define <vscale x 4 x i16> @test_signed_v4f16_v4i16(<vscale x 4 x half> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    fcvtzu z2.s, p0/m, z0.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    fcvtzu z1.s, p1/m, z0.h
 ; CHECK-NEXT:    mov z0.s, #65535 // =0xffff
-; CHECK-NEXT:    mov z2.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z2.s
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i16> @llvm.fptoui.sat.nxv4f16.nxv4i16(<vscale x 4 x half> %f)
     ret <vscale x 4 x i16> %x
@@ -513,13 +443,11 @@ define <vscale x 8 x i16> @test_signed_v8f16_v8i16(<vscale x 8 x half> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzu z1.h, p0/m, z0.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z1.h, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcvtzu z1.h, p1/m, z0.h
 ; CHECK-NEXT:    mov z1.h, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -532,13 +460,11 @@ define <vscale x 2 x i64> @test_signed_v2f16_v2i64(<vscale x 2 x half> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzu z1.d, p0/m, z0.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcvtzu z1.d, p1/m, z0.h
 ; CHECK-NEXT:    mov z1.d, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -553,20 +479,16 @@ define <vscale x 4 x i64> @test_signed_v4f16_v4i64(<vscale x 4 x half> %f) {
 ; CHECK-NEXT:    uunpkhi z3.d, z0.s
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    mov z4.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z2.h, #0.0
 ; CHECK-NEXT:    fcmge p2.h, p0/z, z3.h, #0.0
-; CHECK-NEXT:    movprfx z0, z2
-; CHECK-NEXT:    fcvtzu z0.d, p0/m, z2.h
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    fcvtzu z1.d, p0/m, z3.h
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z2.h, z4.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    fcvtzu z0.d, p1/m, z2.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z2.h, z4.h
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z3.h, z4.h
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p3/m, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    fcvtzu z1.d, p2/m, z3.h
+; CHECK-NEXT:    mov z0.d, p1/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.d, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i64> @llvm.fptoui.sat.nxv4f16.nxv4i64(<vscale x 4 x half> %f)
diff --git a/llvm/test/CodeGen/AArch64/sve-llrint.ll b/llvm/test/CodeGen/AArch64/sve-llrint.ll
index 16e0e0c4661b6..b0198cf9d1247 100644
--- a/llvm/test/CodeGen/AArch64/sve-llrint.ll
+++ b/llvm/test/CodeGen/AArch64/sve-llrint.ll
@@ -6,20 +6,17 @@ define <vscale x 1 x i64> @llrint_v1i64_v1f16(<vscale x 1 x half> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #64511 // =0xfbff
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
-; CHECK-NEXT:    mov z3.h, w8
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.h
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z3.h
-; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 1 x i64> @llvm.llrint.nxv1i64.nxv1f16(<vscale x 1 x half> %x)
@@ -32,20 +29,17 @@ define <vscale x 2 x i64> @llrint_v1i64_v2f16(<vscale x 2 x half> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #64511 // =0xfbff
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
-; CHECK-NEXT:    mov z3.h, w8
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.h
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z3.h
-; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x i64> @llvm.llrint.nxv2i64.nxv2f16(<vscale x 2 x half> %x)
@@ -56,43 +50,30 @@ declare <vscale x 2 x i64> @llvm.llrint.nxv2i64.nxv2f16(<vscale x 2 x half>)
 define <vscale x 4 x i64> @llrint_v4i64_v4f16(<vscale x 4 x half> %x) {
 ; CHECK-LABEL: llrint_v4i64_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
 ; CHECK-NEXT:    mov w8, #64511 // =0xfbff
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    mov z3.h, w8
-; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
+; CHECK-NEXT:    mov z4.d, #0x8000000000000000
+; CHECK-NEXT:    mov z5.h, w8
 ; CHECK-NEXT:    frintx z1.h, p0/m, z1.h
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z1.h, z2.h
 ; CHECK-NEXT:    fcmge p2.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
-; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z1.h
-; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.h
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z1.h, z3.h
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z0.h, z3.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    sel z3.d, p1, z2.d, z4.d
-; CHECK-NEXT:    fcmuo p1.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmuo p3.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    fcvtzs z3.d, p1/m, z1.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z5.h
+; CHECK-NEXT:    fcvtzs z4.d, p2/m, z0.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z5.h
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    sel z2.d, p2, z2.d, z5.d
-; CHECK-NEXT:    sel z0.d, p3, z6.d, z3.d
-; CHECK-NEXT:    sel z1.d, p4, z6.d, z2.d
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z3.d
+; CHECK-NEXT:    sel z1.d, p2, z2.d, z4.d
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x i64> @llvm.llrint.nxv4i64.nxv4f16(<vscale x 4 x half> %x)
   ret <vscale x 4 x i64> %a
@@ -104,7 +85,6 @@ define <vscale x 8 x i64> @llrint_v8i64_v8f16(<vscale x 8 x half> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
@@ -116,8 +96,10 @@ define <vscale x 8 x i64> @llrint_v8i64_v8f16(<vscale x 8 x half> %x) {
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z4.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    mov z6.h, w8
-; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z6.d, #0x8000000000000000
+; CHECK-NEXT:    mov z25.h, w8
+; CHECK-NEXT:    mov z7.d, #0x8000000000000000
+; CHECK-NEXT:    mov z24.d, #0x8000000000000000
 ; CHECK-NEXT:    uunpklo z2.d, z1.s
 ; CHECK-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEXT:    uunpklo z3.d, z0.s
@@ -132,41 +114,29 @@ define <vscale x 8 x i64> @llrint_v8i64_v8f16(<vscale x 8 x half> %x) {
 ; CHECK-NEXT:    fcmge p2.h, p0/z, z1.h, z4.h
 ; CHECK-NEXT:    fcmge p3.h, p0/z, z3.h, z4.h
 ; CHECK-NEXT:    fcmge p4.h, p0/z, z5.h, z4.h
-; CHECK-NEXT:    movprfx z4, z2
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z2.h
-; CHECK-NEXT:    movprfx z7, z1
-; CHECK-NEXT:    fcvtzs z7.d, p0/m, z1.h
-; CHECK-NEXT:    movprfx z24, z3
-; CHECK-NEXT:    fcvtzs z24.d, p0/m, z3.h
-; CHECK-NEXT:    movprfx z25, z5
-; CHECK-NEXT:    fcvtzs z25.d, p0/m, z5.h
-; CHECK-NEXT:    fcmgt p7.h, p0/z, z3.h, z6.h
-; CHECK-NEXT:    fcmgt p5.h, p0/z, z2.h, z6.h
-; CHECK-NEXT:    fcmgt p6.h, p0/z, z1.h, z6.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    mov z4.d, p1/m, z0.d
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z5.h, z6.h
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    sel z6.d, p2, z0.d, z7.d
-; CHECK-NEXT:    fcmuo p2.h, p0/z, z2.h, z2.h
-; CHECK-NEXT:    sel z7.d, p3, z0.d, z24.d
-; CHECK-NEXT:    fcmuo p3.h, p0/z, z1.h, z1.h
-; CHECK-NEXT:    sel z24.d, p4, z0.d, z25.d
-; CHECK-NEXT:    fcmuo p4.h, p0/z, z3.h, z3.h
+; CHECK-NEXT:    mov z4.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z5.h, z25.h
+; CHECK-NEXT:    fcmuo p6.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    fcvtzs z0.d, p1/m, z2.h
+; CHECK-NEXT:    fcvtzs z6.d, p2/m, z1.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z2.h, z25.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z1.h, z25.h
+; CHECK-NEXT:    fcvtzs z7.d, p3/m, z3.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z3.h, z25.h
+; CHECK-NEXT:    fcvtzs z24.d, p4/m, z5.h
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z2.h, z2.h
+; CHECK-NEXT:    mov z0.d, p1/m, z4.d
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z3.h, z3.h
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z5.h, z5.h
-; CHECK-NEXT:    sel z0.d, p5, z26.d, z4.d
-; CHECK-NEXT:    sel z1.d, p6, z26.d, z6.d
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z2.d, p7, z26.d, z7.d
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z3.d, p1, z26.d, z24.d
+; CHECK-NEXT:    sel z1.d, p2, z4.d, z6.d
+; CHECK-NEXT:    sel z2.d, p3, z4.d, z7.d
+; CHECK-NEXT:    sel z3.d, p5, z4.d, z24.d
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p4/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -180,7 +150,7 @@ define <vscale x 16 x i64> @llrint_v16i64_v16f16(<vscale x 16 x half> %x) {
 ; CHECK-LABEL: llrint_v16i64_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
@@ -188,124 +158,110 @@ define <vscale x 16 x i64> @llrint_v16i64_v16f16(<vscale x 16 x half> %x) {
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    uunpkhi z0.s, z0.h
 ; CHECK-NEXT:    mov w8, #64511 // =0xfbff
-; CHECK-NEXT:    uunpklo z4.s, z1.h
-; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z3.s, z1.h
 ; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    mov z5.h, w8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z24.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    mov z25.d, #0x8000000000000000
-; CHECK-NEXT:    mov z27.h, w8
-; CHECK-NEXT:    mov z7.d, #0x7fffffffffffffff
-; CHECK-NEXT:    uunpklo z3.d, z2.s
+; CHECK-NEXT:    mov z7.d, #0x8000000000000000
+; CHECK-NEXT:    mov z27.d, #0x8000000000000000
+; CHECK-NEXT:    mov z28.d, #0x8000000000000000
+; CHECK-NEXT:    mov z30.d, #0x8000000000000000
+; CHECK-NEXT:    uunpklo z4.d, z2.s
 ; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    uunpklo z6.d, z0.s
+; CHECK-NEXT:    uunpklo z5.d, z0.s
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
-; CHECK-NEXT:    uunpklo z24.d, z4.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    uunpklo z26.d, z1.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpklo z6.d, z3.s
+; CHECK-NEXT:    uunpklo z25.d, z1.s
+; CHECK-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-NEXT:    mov z8.d, #0x8000000000000000
+; CHECK-NEXT:    mov z31.d, #0x8000000000000000
+; CHECK-NEXT:    mov z10.d, #0x7fffffffffffffff
+; CHECK-NEXT:    frintx z4.h, p0/m, z4.h
 ; CHECK-NEXT:    frintx z2.h, p0/m, z2.h
-; CHECK-NEXT:    frintx z3.h, p0/m, z3.h
+; CHECK-NEXT:    frintx z5.h, p0/m, z5.h
+; CHECK-NEXT:    movprfx z26, z0
+; CHECK-NEXT:    frintx z26.h, p0/m, z0.h
+; CHECK-NEXT:    uunpkhi z0.d, z1.s
 ; CHECK-NEXT:    frintx z6.h, p0/m, z6.h
-; CHECK-NEXT:    movprfx z28, z0
-; CHECK-NEXT:    frintx z28.h, p0/m, z0.h
-; CHECK-NEXT:    movprfx z29, z4
-; CHECK-NEXT:    frintx z29.h, p0/m, z4.h
-; CHECK-NEXT:    frintx z24.h, p0/m, z24.h
-; CHECK-NEXT:    movprfx z30, z1
-; CHECK-NEXT:    frintx z30.h, p0/m, z1.h
-; CHECK-NEXT:    frintx z26.h, p0/m, z26.h
-; CHECK-NEXT:    fcmge p5.h, p0/z, z2.h, z5.h
-; CHECK-NEXT:    fcmge p2.h, p0/z, z3.h, z5.h
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z2.h
-; CHECK-NEXT:    movprfx z0, z3
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z3.h
-; CHECK-NEXT:    fcmge p6.h, p0/z, z6.h, z5.h
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z3.h, z27.h
-; CHECK-NEXT:    fcmuo p1.h, p0/z, z3.h, z3.h
-; CHECK-NEXT:    fcmge p7.h, p0/z, z28.h, z5.h
-; CHECK-NEXT:    movprfx z3, z6
-; CHECK-NEXT:    fcvtzs z3.d, p0/m, z6.h
-; CHECK-NEXT:    fcmge p8.h, p0/z, z24.h, z5.h
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z2.h, z27.h
-; CHECK-NEXT:    fcmge p9.h, p0/z, z26.h, z5.h
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    movprfx z4, z24
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z24.h
-; CHECK-NEXT:    fcmge p10.h, p0/z, z30.h, z5.h
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    movprfx z31, z26
-; CHECK-NEXT:    fcvtzs z31.d, p0/m, z26.h
-; CHECK-NEXT:    movprfx z8, z30
-; CHECK-NEXT:    fcvtzs z8.d, p0/m, z30.h
-; CHECK-NEXT:    mov z1.d, p5/m, z25.d
-; CHECK-NEXT:    fcmge p5.h, p0/z, z29.h, z5.h
-; CHECK-NEXT:    not p6.b, p0/z, p6.b
-; CHECK-NEXT:    mov z0.d, p2/m, z25.d
-; CHECK-NEXT:    fcmuo p2.h, p0/z, z2.h, z2.h
-; CHECK-NEXT:    movprfx z2, z28
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z28.h
-; CHECK-NEXT:    movprfx z5, z29
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z29.h
-; CHECK-NEXT:    not p7.b, p0/z, p7.b
-; CHECK-NEXT:    mov z3.d, p6/m, z25.d
-; CHECK-NEXT:    not p6.b, p0/z, p8.b
-; CHECK-NEXT:    fcmgt p8.h, p0/z, z6.h, z27.h
-; CHECK-NEXT:    mov z1.d, p4/m, z7.d
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    mov z0.d, p3/m, z7.d
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z29.h, z27.h
-; CHECK-NEXT:    sel z9.d, p7, z25.d, z2.d
-; CHECK-NEXT:    not p7.b, p0/z, p9.b
-; CHECK-NEXT:    mov z4.d, p6/m, z25.d
-; CHECK-NEXT:    not p6.b, p0/z, p10.b
-; CHECK-NEXT:    fcmgt p10.h, p0/z, z28.h, z27.h
-; CHECK-NEXT:    mov z5.d, p5/m, z25.d
-; CHECK-NEXT:    fcmgt p5.h, p0/z, z24.h, z27.h
-; CHECK-NEXT:    fcmuo p9.h, p0/z, z6.h, z6.h
-; CHECK-NEXT:    sel z6.d, p7, z25.d, z31.d
-; CHECK-NEXT:    sel z25.d, p6, z25.d, z8.d
-; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    fcmgt p6.h, p0/z, z26.h, z27.h
-; CHECK-NEXT:    fcmgt p7.h, p0/z, z30.h, z27.h
-; CHECK-NEXT:    fcmuo p4.h, p0/z, z28.h, z28.h
-; CHECK-NEXT:    sel z2.d, p8, z7.d, z3.d
-; CHECK-NEXT:    sel z3.d, p10, z7.d, z9.d
-; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    fcmuo p8.h, p0/z, z29.h, z29.h
-; CHECK-NEXT:    mov z4.d, p5/m, z7.d
-; CHECK-NEXT:    fcmuo p5.h, p0/z, z24.h, z24.h
-; CHECK-NEXT:    fcmuo p10.h, p0/z, z26.h, z26.h
-; CHECK-NEXT:    mov z5.d, p3/m, z7.d
-; CHECK-NEXT:    mov z6.d, p6/m, z7.d
+; CHECK-NEXT:    movprfx z29, z3
+; CHECK-NEXT:    frintx z29.h, p0/m, z3.h
+; CHECK-NEXT:    frintx z25.h, p0/m, z25.h
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p1.h, p0/z, z4.h, z24.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z2.h, z24.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z5.h, z24.h
+; CHECK-NEXT:    movprfx z9, z0
+; CHECK-NEXT:    frintx z9.h, p0/m, z0.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z26.h, z24.h
+; CHECK-NEXT:    fcmge p5.h, p0/z, z6.h, z24.h
+; CHECK-NEXT:    fcmge p7.h, p0/z, z25.h, z24.h
+; CHECK-NEXT:    fcmge p6.h, p0/z, z29.h, z24.h
+; CHECK-NEXT:    fcmgt p8.h, p0/z, z6.h, z1.h
+; CHECK-NEXT:    fcmgt p10.h, p0/z, z25.h, z1.h
+; CHECK-NEXT:    fcmuo p9.h, p0/z, z5.h, z5.h
+; CHECK-NEXT:    fcvtzs z7.d, p1/m, z4.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z4.h, z1.h
+; CHECK-NEXT:    fcvtzs z27.d, p2/m, z2.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z9.h, z24.h
+; CHECK-NEXT:    mov z24.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z28.d, p3/m, z5.h
+; CHECK-NEXT:    fcvtzs z3.d, p4/m, z26.h
+; CHECK-NEXT:    fcvtzs z30.d, p5/m, z6.h
+; CHECK-NEXT:    fcvtzs z8.d, p7/m, z25.h
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z2.h, z1.h
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z5.h, z1.h
+; CHECK-NEXT:    fcmgt p7.h, p0/z, z26.h, z1.h
+; CHECK-NEXT:    fcvtzs z31.d, p6/m, z29.h
+; CHECK-NEXT:    sel z0.d, p1, z10.d, z7.d
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z29.h, z1.h
+; CHECK-NEXT:    fcvtzs z24.d, p2/m, z9.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z9.h, z1.h
+; CHECK-NEXT:    fcmuo p3.h, p0/z, z4.h, z4.h
+; CHECK-NEXT:    fcmuo p6.h, p0/z, z2.h, z2.h
+; CHECK-NEXT:    sel z4.d, p8, z10.d, z30.d
+; CHECK-NEXT:    fcmuo p8.h, p0/z, z25.h, z25.h
+; CHECK-NEXT:    sel z1.d, p4, z10.d, z27.d
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z26.h, z26.h
+; CHECK-NEXT:    sel z2.d, p5, z10.d, z28.d
+; CHECK-NEXT:    mov z3.d, p7/m, z10.d
+; CHECK-NEXT:    fcmuo p5.h, p0/z, z6.h, z6.h
+; CHECK-NEXT:    fcmuo p7.h, p0/z, z29.h, z29.h
+; CHECK-NEXT:    sel z5.d, p1, z10.d, z31.d
+; CHECK-NEXT:    sel z6.d, p10, z10.d, z8.d
+; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z9.h, z9.h
+; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    sel z7.d, p2, z10.d, z24.d
+; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p6/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    fcmuo p0.h, p0/z, z30.h, z30.h
-; CHECK-NEXT:    sel z7.d, p7, z7.d, z25.d
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z2.d, p9/m, #0 // =0x0
-; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z3.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z4.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    mov z5.d, p8/m, #0 // =0x0
+; CHECK-NEXT:    mov z5.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z6.d, p8/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z6.d, p10/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z7.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %a = call <vscale x 16 x i64> @llvm.llrint.nxv16i64.nxv16f16(<vscale x 16 x half> %x)
@@ -318,6 +274,8 @@ define <vscale x 32 x i64> @llrint_v32i64_v32f16(<vscale x 32 x half> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-17
+; CHECK-NEXT:    str p11, [sp] // 2-byte Folded Spill
+; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
@@ -340,8 +298,8 @@ define <vscale x 32 x i64> @llrint_v32i64_v32f16(<vscale x 32 x half> %x) {
 ; CHECK-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-3
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
@@ -352,230 +310,191 @@ define <vscale x 32 x i64> @llrint_v32i64_v32f16(<vscale x 32 x half> %x) {
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
 ; CHECK-NEXT:    uunpklo z4.s, z0.h
-; CHECK-NEXT:    uunpkhi z5.s, z0.h
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
 ; CHECK-NEXT:    mov w9, #64511 // =0xfbff
+; CHECK-NEXT:    uunpklo z25.s, z1.h
+; CHECK-NEXT:    uunpkhi z10.s, z1.h
+; CHECK-NEXT:    uunpklo z9.s, z2.h
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpklo z6.s, z1.h
-; CHECK-NEXT:    mov z26.h, w9
-; CHECK-NEXT:    uunpkhi z25.s, z1.h
+; CHECK-NEXT:    uunpkhi z12.s, z3.h
+; CHECK-NEXT:    mov z27.h, w9
 ; CHECK-NEXT:    mov w9, #31743 // =0x7bff
-; CHECK-NEXT:    mov z27.d, #0x8000000000000000
-; CHECK-NEXT:    uunpklo z31.s, z2.h
-; CHECK-NEXT:    uunpkhi z12.s, z2.h
-; CHECK-NEXT:    mov z17.d, z3.d
-; CHECK-NEXT:    uunpklo z0.d, z4.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    uunpklo z7.d, z5.s
-; CHECK-NEXT:    uunpkhi z24.d, z5.s
-; CHECK-NEXT:    uunpklo z28.d, z6.s
-; CHECK-NEXT:    uunpkhi z29.d, z6.s
-; CHECK-NEXT:    uunpklo z8.d, z25.s
-; CHECK-NEXT:    uunpkhi z9.d, z25.s
-; CHECK-NEXT:    uunpklo z16.s, z17.h
-; CHECK-NEXT:    uunpklo z11.d, z31.s
-; CHECK-NEXT:    uunpkhi z14.d, z31.s
-; CHECK-NEXT:    uunpkhi z17.s, z17.h
-; CHECK-NEXT:    movprfx z30, z4
-; CHECK-NEXT:    frintx z30.h, p0/m, z4.h
-; CHECK-NEXT:    movprfx z4, z7
-; CHECK-NEXT:    frintx z4.h, p0/m, z7.h
-; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
-; CHECK-NEXT:    movprfx z6, z24
-; CHECK-NEXT:    frintx z6.h, p0/m, z24.h
-; CHECK-NEXT:    movprfx z7, z28
-; CHECK-NEXT:    frintx z7.h, p0/m, z28.h
-; CHECK-NEXT:    movprfx z25, z29
-; CHECK-NEXT:    frintx z25.h, p0/m, z29.h
-; CHECK-NEXT:    movprfx z3, z9
-; CHECK-NEXT:    frintx z3.h, p0/m, z9.h
-; CHECK-NEXT:    mov z5.h, w9
-; CHECK-NEXT:    movprfx z31, z11
-; CHECK-NEXT:    frintx z31.h, p0/m, z11.h
-; CHECK-NEXT:    movprfx z9, z14
-; CHECK-NEXT:    frintx z9.h, p0/m, z14.h
-; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z26.h
-; CHECK-NEXT:    fcmge p4.h, p0/z, z4.h, z26.h
-; CHECK-NEXT:    movprfx z24, z0
-; CHECK-NEXT:    fcvtzs z24.d, p0/m, z0.h
-; CHECK-NEXT:    fcmge p2.h, p0/z, z30.h, z26.h
-; CHECK-NEXT:    movprfx z29, z4
-; CHECK-NEXT:    fcvtzs z29.d, p0/m, z4.h
-; CHECK-NEXT:    fcmge p6.h, p0/z, z6.h, z26.h
-; CHECK-NEXT:    movprfx z28, z30
-; CHECK-NEXT:    fcvtzs z28.d, p0/m, z30.h
-; CHECK-NEXT:    movprfx z10, z6
-; CHECK-NEXT:    fcvtzs z10.d, p0/m, z6.h
-; CHECK-NEXT:    str z0, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    fcmge p3.h, p0/z, z7.h, z26.h
-; CHECK-NEXT:    movprfx z13, z7
-; CHECK-NEXT:    fcvtzs z13.d, p0/m, z7.h
-; CHECK-NEXT:    movprfx z15, z25
-; CHECK-NEXT:    fcvtzs z15.d, p0/m, z25.h
-; CHECK-NEXT:    not p5.b, p0/z, p1.b
-; CHECK-NEXT:    movprfx z18, z3
-; CHECK-NEXT:    fcvtzs z18.d, p0/m, z3.h
-; CHECK-NEXT:    movprfx z20, z31
-; CHECK-NEXT:    fcvtzs z20.d, p0/m, z31.h
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    movprfx z21, z9
-; CHECK-NEXT:    fcvtzs z21.d, p0/m, z9.h
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z30.h, z5.h
-; CHECK-NEXT:    sel z0.d, p5, z27.d, z24.d
-; CHECK-NEXT:    not p7.b, p0/z, p2.b
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z4.h, z5.h
-; CHECK-NEXT:    mov z29.d, p4/m, z27.d
-; CHECK-NEXT:    fcmge p4.h, p0/z, z25.h, z26.h
-; CHECK-NEXT:    not p5.b, p0/z, p6.b
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    fcmge p6.h, p0/z, z9.h, z26.h
-; CHECK-NEXT:    fcmgt p9.h, p0/z, z6.h, z5.h
-; CHECK-NEXT:    str z0, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    sel z0.d, p7, z27.d, z28.d
-; CHECK-NEXT:    movprfx z28, z8
-; CHECK-NEXT:    frintx z28.h, p0/m, z8.h
-; CHECK-NEXT:    sel z8.d, p5, z27.d, z10.d
-; CHECK-NEXT:    uunpklo z10.d, z12.s
-; CHECK-NEXT:    uunpkhi z12.d, z12.s
-; CHECK-NEXT:    not p5.b, p0/z, p4.b
-; CHECK-NEXT:    sel z11.d, p3, z27.d, z13.d
-; CHECK-NEXT:    uunpklo z13.d, z16.s
-; CHECK-NEXT:    fcmge p3.h, p0/z, z3.h, z26.h
-; CHECK-NEXT:    not p6.b, p0/z, p6.b
-; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    sel z24.d, p5, z27.d, z15.d
-; CHECK-NEXT:    uunpkhi z15.d, z16.s
-; CHECK-NEXT:    movprfx z14, z28
-; CHECK-NEXT:    fcvtzs z14.d, p0/m, z28.h
-; CHECK-NEXT:    frintx z10.h, p0/m, z10.h
-; CHECK-NEXT:    uunpklo z16.d, z17.s
-; CHECK-NEXT:    frintx z12.h, p0/m, z12.h
-; CHECK-NEXT:    uunpkhi z17.d, z17.s
-; CHECK-NEXT:    movprfx z19, z13
-; CHECK-NEXT:    frintx z19.h, p0/m, z13.h
-; CHECK-NEXT:    fcmge p4.h, p0/z, z28.h, z26.h
-; CHECK-NEXT:    fcmge p5.h, p0/z, z31.h, z26.h
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    frintx z15.h, p0/m, z15.h
-; CHECK-NEXT:    fcmge p7.h, p0/z, z10.h, z26.h
+; CHECK-NEXT:    uunpkhi z14.s, z2.h
+; CHECK-NEXT:    uunpklo z15.s, z3.h
+; CHECK-NEXT:    uunpklo z7.d, z0.s
+; CHECK-NEXT:    uunpklo z5.d, z4.s
+; CHECK-NEXT:    uunpkhi z6.d, z4.s
+; CHECK-NEXT:    uunpklo z29.d, z25.s
+; CHECK-NEXT:    uunpkhi z26.d, z0.s
+; CHECK-NEXT:    uunpklo z8.d, z10.s
+; CHECK-NEXT:    uunpkhi z11.d, z10.s
+; CHECK-NEXT:    uunpklo z10.d, z9.s
+; CHECK-NEXT:    uunpkhi z13.d, z9.s
+; CHECK-NEXT:    mov z31.d, #0x8000000000000000
+; CHECK-NEXT:    uunpklo z16.d, z12.s
+; CHECK-NEXT:    uunpklo z18.d, z14.s
+; CHECK-NEXT:    movprfx z1, z7
+; CHECK-NEXT:    frintx z1.h, p0/m, z7.h
+; CHECK-NEXT:    movprfx z4, z5
+; CHECK-NEXT:    frintx z4.h, p0/m, z5.h
+; CHECK-NEXT:    movprfx z5, z6
+; CHECK-NEXT:    frintx z5.h, p0/m, z6.h
+; CHECK-NEXT:    movprfx z7, z29
+; CHECK-NEXT:    frintx z7.h, p0/m, z29.h
+; CHECK-NEXT:    movprfx z6, z26
+; CHECK-NEXT:    frintx z6.h, p0/m, z26.h
+; CHECK-NEXT:    mov z29.d, #0x8000000000000000
+; CHECK-NEXT:    movprfx z9, z11
+; CHECK-NEXT:    frintx z9.h, p0/m, z11.h
+; CHECK-NEXT:    movprfx z3, z10
+; CHECK-NEXT:    frintx z3.h, p0/m, z10.h
+; CHECK-NEXT:    movprfx z10, z13
+; CHECK-NEXT:    frintx z10.h, p0/m, z13.h
+; CHECK-NEXT:    uunpkhi z26.d, z25.s
+; CHECK-NEXT:    uunpkhi z13.d, z12.s
+; CHECK-NEXT:    frintx z8.h, p0/m, z8.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z1.h, z27.h
+; CHECK-NEXT:    uunpkhi z14.d, z14.s
+; CHECK-NEXT:    mov z0.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p2.h, p0/z, z7.h, z27.h
+; CHECK-NEXT:    mov z28.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p4.h, p0/z, z4.h, z27.h
+; CHECK-NEXT:    fcmge p5.h, p0/z, z5.h, z27.h
+; CHECK-NEXT:    uunpklo z19.d, z15.s
+; CHECK-NEXT:    uunpkhi z15.d, z15.s
+; CHECK-NEXT:    movprfx z20, z13
+; CHECK-NEXT:    frintx z20.h, p0/m, z13.h
+; CHECK-NEXT:    mov z30.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p1.h, p0/z, z6.h, z27.h
+; CHECK-NEXT:    frintx z26.h, p0/m, z26.h
+; CHECK-NEXT:    fcvtzs z29.d, p3/m, z1.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z9.h, z27.h
+; CHECK-NEXT:    mov z11.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z31.d, p2/m, z7.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z8.h, z27.h
+; CHECK-NEXT:    mov z17.d, #0x8000000000000000
 ; CHECK-NEXT:    frintx z16.h, p0/m, z16.h
-; CHECK-NEXT:    fcmge p8.h, p0/z, z12.h, z26.h
-; CHECK-NEXT:    frintx z17.h, p0/m, z17.h
-; CHECK-NEXT:    movprfx z23, z19
-; CHECK-NEXT:    fcvtzs z23.d, p0/m, z19.h
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    sel z13.d, p3, z27.d, z18.d
-; CHECK-NEXT:    fcmge p3.h, p0/z, z19.h, z26.h
-; CHECK-NEXT:    movprfx z0, z15
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z15.h
-; CHECK-NEXT:    sel z22.d, p4, z27.d, z14.d
-; CHECK-NEXT:    sel z18.d, p6, z27.d, z21.d
-; CHECK-NEXT:    movprfx z21, z12
-; CHECK-NEXT:    fcvtzs z21.d, p0/m, z12.h
-; CHECK-NEXT:    movprfx z1, z16
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z16.h
-; CHECK-NEXT:    sel z14.d, p5, z27.d, z20.d
-; CHECK-NEXT:    fcmge p4.h, p0/z, z15.h, z26.h
-; CHECK-NEXT:    movprfx z20, z10
-; CHECK-NEXT:    fcvtzs z20.d, p0/m, z10.h
-; CHECK-NEXT:    movprfx z2, z17
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z17.h
-; CHECK-NEXT:    not p5.b, p0/z, p7.b
-; CHECK-NEXT:    fcmge p6.h, p0/z, z16.h, z26.h
-; CHECK-NEXT:    not p7.b, p0/z, p8.b
-; CHECK-NEXT:    fcmge p8.h, p0/z, z17.h, z26.h
-; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    mov z20.d, p5/m, z27.d
-; CHECK-NEXT:    mov z21.d, p7/m, z27.d
-; CHECK-NEXT:    not p5.b, p0/z, p6.b
-; CHECK-NEXT:    mov z23.d, p3/m, z27.d
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z17.h, z5.h
-; CHECK-NEXT:    not p6.b, p0/z, p8.b
-; CHECK-NEXT:    mov z0.d, p4/m, z27.d
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z16.h, z5.h
-; CHECK-NEXT:    mov z1.d, p5/m, z27.d
-; CHECK-NEXT:    fcmuo p5.h, p0/z, z16.h, z16.h
-; CHECK-NEXT:    mov z29.d, p2/m, z26.d
-; CHECK-NEXT:    mov z2.d, p6/m, z27.d
-; CHECK-NEXT:    ldr z27, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    fcmgt p6.h, p0/z, z7.h, z5.h
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z12.h, z5.h
-; CHECK-NEXT:    fcmuo p8.h, p0/z, z17.h, z17.h
-; CHECK-NEXT:    fcmgt p7.h, p0/z, z28.h, z5.h
-; CHECK-NEXT:    mov z1.d, p4/m, z26.d
-; CHECK-NEXT:    fcmuo p4.h, p0/z, z15.h, z15.h
-; CHECK-NEXT:    mov z8.d, p9/m, z26.d
-; CHECK-NEXT:    mov z27.d, p1/m, z26.d
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z15.h, z5.h
-; CHECK-NEXT:    mov z2.d, p3/m, z26.d
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z19.h, z5.h
-; CHECK-NEXT:    mov z11.d, p6/m, z26.d
-; CHECK-NEXT:    fcmuo p6.h, p0/z, z19.h, z19.h
-; CHECK-NEXT:    mov z1.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p5.h, p0/z, z9.h, z5.h
-; CHECK-NEXT:    sel z15.d, p2, z26.d, z21.d
-; CHECK-NEXT:    fcmuo p2.h, p0/z, z12.h, z12.h
-; CHECK-NEXT:    mov z2.d, p8/m, #0 // =0x0
-; CHECK-NEXT:    sel z16.d, p7, z26.d, z22.d
-; CHECK-NEXT:    mov z0.d, p1/m, z26.d
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z10.h, z5.h
-; CHECK-NEXT:    str z1, [x8, #14, mul vl]
-; CHECK-NEXT:    sel z17.d, p3, z26.d, z23.d
-; CHECK-NEXT:    fcmuo p3.h, p0/z, z10.h, z10.h
-; CHECK-NEXT:    str z2, [x8, #15, mul vl]
-; CHECK-NEXT:    sel z2.d, p5, z26.d, z18.d
-; CHECK-NEXT:    fcmuo p5.h, p0/z, z9.h, z9.h
-; CHECK-NEXT:    mov z0.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z3.h, z5.h
-; CHECK-NEXT:    mov z15.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    sel z1.d, p1, z26.d, z20.d
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z31.h, z5.h
-; CHECK-NEXT:    mov z17.d, p6/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p2.h, p0/z, z31.h, z31.h
+; CHECK-NEXT:    frintx z14.h, p0/m, z14.h
+; CHECK-NEXT:    fcvtzs z0.d, p4/m, z4.h
+; CHECK-NEXT:    fcvtzs z28.d, p5/m, z5.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z3.h, z27.h
+; CHECK-NEXT:    mov z12.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p5.h, p0/z, z10.h, z27.h
+; CHECK-NEXT:    mov z13.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z19.h, p0/m, z19.h
+; CHECK-NEXT:    frintx z15.h, p0/m, z15.h
+; CHECK-NEXT:    mov z24.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z30.d, p1/m, z6.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z26.h, z27.h
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    frintx z18.h, p0/m, z18.h
+; CHECK-NEXT:    fcvtzs z11.d, p3/m, z9.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z20.h, z27.h
+; CHECK-NEXT:    mov z25.h, w9
+; CHECK-NEXT:    fcvtzs z17.d, p2/m, z8.h
+; CHECK-NEXT:    fcmge p6.h, p0/z, z16.h, z27.h
+; CHECK-NEXT:    mov z21.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p2.h, p0/z, z14.h, z27.h
+; CHECK-NEXT:    mov z22.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z12.d, p4/m, z3.h
+; CHECK-NEXT:    fcvtzs z13.d, p5/m, z10.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z19.h, z27.h
+; CHECK-NEXT:    mov z23.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p5.h, p0/z, z15.h, z27.h
+; CHECK-NEXT:    mov z0.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z2.d, p1/m, z26.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z18.h, z27.h
+; CHECK-NEXT:    fcvtzs z24.d, p3/m, z20.h
+; CHECK-NEXT:    mov z27.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p11.h, p0/z, z20.h, z25.h
+; CHECK-NEXT:    fcvtzs z21.d, p6/m, z16.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z16.h, z25.h
+; CHECK-NEXT:    fcmuo p6.h, p0/z, z16.h, z16.h
+; CHECK-NEXT:    fcvtzs z22.d, p2/m, z14.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z8.h, z25.h
+; CHECK-NEXT:    mov z16.d, #0x8000000000000000
+; CHECK-NEXT:    fcmgt p7.h, p0/z, z5.h, z25.h
+; CHECK-NEXT:    fcvtzs z23.d, p4/m, z19.h
+; CHECK-NEXT:    fcvtzs z0.d, p5/m, z15.h
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z20.h, z20.h
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z15.h, z25.h
+; CHECK-NEXT:    mov z24.d, p11/m, z27.d
+; CHECK-NEXT:    sel z20.d, p3, z27.d, z21.d
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z19.h, z25.h
+; CHECK-NEXT:    fcmgt p8.h, p0/z, z1.h, z25.h
+; CHECK-NEXT:    mov z17.d, p2/m, z27.d
+; CHECK-NEXT:    fcvtzs z16.d, p1/m, z18.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z18.h, z25.h
+; CHECK-NEXT:    mov z28.d, p7/m, z27.d
+; CHECK-NEXT:    fcmgt p7.h, p0/z, z14.h, z25.h
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z15.h, z15.h
+; CHECK-NEXT:    mov z0.d, p5/m, z27.d
+; CHECK-NEXT:    mov z24.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z10.h, z25.h
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z19.h, z19.h
+; CHECK-NEXT:    sel z19.d, p3, z27.d, z23.d
+; CHECK-NEXT:    fcmuo p3.h, p0/z, z14.h, z14.h
+; CHECK-NEXT:    mov z20.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p9.h, p0/z, z6.h, z25.h
+; CHECK-NEXT:    fcmgt p10.h, p0/z, z7.h, z25.h
+; CHECK-NEXT:    str z24, [x8, #15, mul vl]
+; CHECK-NEXT:    sel z24.d, p2, z27.d, z16.d
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z3.h, z25.h
+; CHECK-NEXT:    sel z15.d, p7, z27.d, z22.d
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z13.d, p5/m, z27.d
+; CHECK-NEXT:    str z20, [x8, #14, mul vl]
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z9.h, z25.h
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z18.h, z18.h
+; CHECK-NEXT:    mov z19.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z10.h, z10.h
+; CHECK-NEXT:    mov z29.d, p8/m, z27.d
 ; CHECK-NEXT:    str z0, [x8, #13, mul vl]
-; CHECK-NEXT:    mov z2.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p5.h, p0/z, z25.h, z25.h
-; CHECK-NEXT:    str z17, [x8, #12, mul vl]
-; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z25.h, z5.h
+; CHECK-NEXT:    mov z15.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p3.h, p0/z, z3.h, z3.h
+; CHECK-NEXT:    sel z0.d, p2, z27.d, z12.d
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z9.h, z9.h
+; CHECK-NEXT:    mov z30.d, p9/m, z27.d
+; CHECK-NEXT:    str z19, [x8, #12, mul vl]
+; CHECK-NEXT:    sel z3.d, p5, z27.d, z11.d
+; CHECK-NEXT:    mov z24.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    str z15, [x8, #11, mul vl]
-; CHECK-NEXT:    sel z0.d, p1, z26.d, z14.d
-; CHECK-NEXT:    fcmuo p1.h, p0/z, z3.h, z3.h
-; CHECK-NEXT:    sel z3.d, p4, z26.d, z13.d
-; CHECK-NEXT:    fcmuo p4.h, p0/z, z28.h, z28.h
-; CHECK-NEXT:    str z1, [x8, #10, mul vl]
-; CHECK-NEXT:    sel z1.d, p3, z26.d, z24.d
-; CHECK-NEXT:    fcmuo p3.h, p0/z, z7.h, z7.h
-; CHECK-NEXT:    ldr z7, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    str z2, [x8, #9, mul vl]
-; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p1.h, p0/z, z6.h, z6.h
-; CHECK-NEXT:    mov z16.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p4.h, p0/z, z4.h, z4.h
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z7.h, z5.h
-; CHECK-NEXT:    mov z1.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p5.h, p0/z, z30.h, z30.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z26.h, z25.h
+; CHECK-NEXT:    mov z13.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p5.h, p0/z, z8.h, z8.h
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z26.h, z26.h
+; CHECK-NEXT:    str z24, [x8, #10, mul vl]
+; CHECK-NEXT:    mov z3.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z4.h, z25.h
+; CHECK-NEXT:    str z13, [x8, #9, mul vl]
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z6.h, z6.h
+; CHECK-NEXT:    mov z31.d, p10/m, z27.d
 ; CHECK-NEXT:    str z0, [x8, #8, mul vl]
-; CHECK-NEXT:    fcmuo p0.h, p0/z, z7.h, z7.h
-; CHECK-NEXT:    mov z11.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.d, p1/m, z27.d
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z7.h, z7.h
 ; CHECK-NEXT:    str z3, [x8, #7, mul vl]
-; CHECK-NEXT:    ldr z0, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    str z16, [x8, #6, mul vl]
-; CHECK-NEXT:    mov z8.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    str z1, [x8, #5, mul vl]
-; CHECK-NEXT:    mov z29.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    mov z27.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    str z11, [x8, #4, mul vl]
-; CHECK-NEXT:    str z8, [x8, #3, mul vl]
-; CHECK-NEXT:    mov z0.d, p2/m, z26.d
-; CHECK-NEXT:    str z29, [x8, #2, mul vl]
-; CHECK-NEXT:    str z27, [x8, #1, mul vl]
+; CHECK-NEXT:    mov z17.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p5.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z5.h, z5.h
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z4.h, z4.h
+; CHECK-NEXT:    str z17, [x8, #6, mul vl]
+; CHECK-NEXT:    mov z31.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z30.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p3/m, z27.d
+; CHECK-NEXT:    mov z29.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    str z2, [x8, #5, mul vl]
+; CHECK-NEXT:    str z31, [x8, #4, mul vl]
+; CHECK-NEXT:    mov z28.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    str z30, [x8, #3, mul vl]
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    str z29, [x8, #2, mul vl]
+; CHECK-NEXT:    str z28, [x8, #1, mul vl]
 ; CHECK-NEXT:    str z0, [x8]
-; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -592,6 +511,8 @@ define <vscale x 32 x i64> @llrint_v32i64_v32f16(<vscale x 32 x half> %x) {
 ; CHECK-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
@@ -611,20 +532,17 @@ define <vscale x 1 x i64> @llrint_v1i64_v1f32(<vscale x 1 x float> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
-; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.s
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z3.s
-; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.s
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 1 x i64> @llvm.llrint.nxv1i64.nxv1f32(<vscale x 1 x float> %x)
@@ -637,20 +555,17 @@ define <vscale x 2 x i64> @llrint_v2i64_v2f32(<vscale x 2 x float> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
-; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.s
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z3.s
-; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.s
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x i64> @llvm.llrint.nxv2i64.nxv2f32(<vscale x 2 x float> %x)
@@ -661,43 +576,30 @@ declare <vscale x 2 x i64> @llvm.llrint.nxv2i64.nxv2f32(<vscale x 2 x float>)
 define <vscale x 4 x i64> @llrint_v4i64_v4f32(<vscale x 4 x float> %x) {
 ; CHECK-LABEL: llrint_v4i64_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
 ; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
-; CHECK-NEXT:    mov z3.s, w8
-; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
+; CHECK-NEXT:    mov z4.d, #0x8000000000000000
+; CHECK-NEXT:    mov z5.s, w8
 ; CHECK-NEXT:    frintx z1.s, p0/m, z1.s
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z1.s, z2.s
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z2.s
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
-; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z1.s
-; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.s
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z1.s, z3.s
-; CHECK-NEXT:    fcmgt p4.s, p0/z, z0.s, z3.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    sel z3.d, p1, z2.d, z4.d
-; CHECK-NEXT:    fcmuo p1.s, p0/z, z1.s, z1.s
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmuo p3.s, p0/z, z1.s, z1.s
+; CHECK-NEXT:    fcvtzs z3.d, p1/m, z1.s
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z1.s, z5.s
+; CHECK-NEXT:    fcvtzs z4.d, p2/m, z0.s
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z5.s
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    sel z2.d, p2, z2.d, z5.d
-; CHECK-NEXT:    sel z0.d, p3, z6.d, z3.d
-; CHECK-NEXT:    sel z1.d, p4, z6.d, z2.d
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z3.d
+; CHECK-NEXT:    sel z1.d, p2, z2.d, z4.d
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x i64> @llvm.llrint.nxv4i64.nxv4f32(<vscale x 4 x float> %x)
   ret <vscale x 4 x i64> %a
@@ -709,7 +611,6 @@ define <vscale x 8 x i64> @llrint_v8i64_v8f32(<vscale x 8 x float> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
@@ -718,57 +619,47 @@ define <vscale x 8 x i64> @llrint_v8i64_v8f32(<vscale x 8 x float> %x) {
 ; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
 ; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
-; CHECK-NEXT:    uunpklo z3.d, z1.s
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z3.d, z1.s
 ; CHECK-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEXT:    mov z4.s, w8
 ; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
 ; CHECK-NEXT:    mov z5.d, #0x8000000000000000
-; CHECK-NEXT:    mov z6.s, w8
-; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z6.d, #0x8000000000000000
+; CHECK-NEXT:    mov z25.s, w8
+; CHECK-NEXT:    mov z7.d, #0x8000000000000000
 ; CHECK-NEXT:    frintx z2.s, p0/m, z2.s
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-NEXT:    mov z24.d, #0x8000000000000000
 ; CHECK-NEXT:    frintx z3.s, p0/m, z3.s
 ; CHECK-NEXT:    frintx z1.s, p0/m, z1.s
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z2.s, z4.s
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z4.s
-; CHECK-NEXT:    movprfx z7, z0
-; CHECK-NEXT:    fcvtzs z7.d, p0/m, z0.s
+; CHECK-NEXT:    fcmuo p6.s, p0/z, z0.s, z0.s
 ; CHECK-NEXT:    fcmge p3.s, p0/z, z3.s, z4.s
 ; CHECK-NEXT:    fcmge p4.s, p0/z, z1.s, z4.s
-; CHECK-NEXT:    movprfx z4, z2
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z2.s
-; CHECK-NEXT:    movprfx z24, z3
-; CHECK-NEXT:    fcvtzs z24.d, p0/m, z3.s
-; CHECK-NEXT:    movprfx z25, z1
-; CHECK-NEXT:    fcvtzs z25.d, p0/m, z1.s
-; CHECK-NEXT:    fcmgt p7.s, p0/z, z3.s, z6.s
-; CHECK-NEXT:    fcmgt p5.s, p0/z, z2.s, z6.s
-; CHECK-NEXT:    fcmgt p6.s, p0/z, z0.s, z6.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    mov z4.d, p1/m, z5.d
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z1.s, z6.s
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    sel z6.d, p2, z5.d, z7.d
-; CHECK-NEXT:    fcmuo p2.s, p0/z, z2.s, z2.s
-; CHECK-NEXT:    sel z7.d, p3, z5.d, z24.d
-; CHECK-NEXT:    fcmuo p3.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    sel z5.d, p4, z5.d, z25.d
-; CHECK-NEXT:    fcmuo p4.s, p0/z, z3.s, z3.s
+; CHECK-NEXT:    mov z4.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z1.s, z25.s
+; CHECK-NEXT:    fcvtzs z5.d, p1/m, z2.s
+; CHECK-NEXT:    fcvtzs z6.d, p2/m, z0.s
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z2.s, z25.s
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z25.s
+; CHECK-NEXT:    fcvtzs z7.d, p3/m, z3.s
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z3.s, z25.s
+; CHECK-NEXT:    fcvtzs z24.d, p4/m, z1.s
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z2.s, z2.s
+; CHECK-NEXT:    sel z0.d, p1, z4.d, z5.d
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z3.s, z3.s
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z1.s, z1.s
-; CHECK-NEXT:    sel z0.d, p5, z26.d, z4.d
-; CHECK-NEXT:    sel z1.d, p6, z26.d, z6.d
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z2.d, p7, z26.d, z7.d
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z3.d, p1, z26.d, z5.d
+; CHECK-NEXT:    sel z1.d, p2, z4.d, z6.d
+; CHECK-NEXT:    sel z2.d, p3, z4.d, z7.d
+; CHECK-NEXT:    sel z3.d, p5, z4.d, z24.d
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p4/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -782,7 +673,7 @@ define <vscale x 16 x i64> @llrint_v16i64_v16f32(<vscale x 16 x float> %x) {
 ; CHECK-LABEL: llrint_v16i64_v16f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
@@ -790,119 +681,106 @@ define <vscale x 16 x i64> @llrint_v16i64_v16f32(<vscale x 16 x float> %x) {
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
 ; CHECK-NEXT:    uunpklo z4.d, z0.s
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
 ; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpklo z7.d, z1.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    uunpklo z24.d, z2.s
+; CHECK-NEXT:    uunpklo z6.d, z2.s
 ; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    uunpklo z25.d, z3.s
-; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
-; CHECK-NEXT:    movprfx z5, z4
-; CHECK-NEXT:    frintx z5.s, p0/m, z4.s
-; CHECK-NEXT:    movprfx z6, z0
-; CHECK-NEXT:    frintx z6.s, p0/m, z0.s
-; CHECK-NEXT:    mov z4.s, w8
-; CHECK-NEXT:    frintx z7.s, p0/m, z7.s
-; CHECK-NEXT:    movprfx z28, z1
-; CHECK-NEXT:    frintx z28.s, p0/m, z1.s
+; CHECK-NEXT:    uunpklo z5.d, z1.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpklo z7.d, z3.s
+; CHECK-NEXT:    mov z24.s, w8
 ; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
-; CHECK-NEXT:    mov z0.d, #0x8000000000000000
-; CHECK-NEXT:    frintx z24.s, p0/m, z24.s
-; CHECK-NEXT:    movprfx z29, z2
-; CHECK-NEXT:    frintx z29.s, p0/m, z2.s
-; CHECK-NEXT:    frintx z25.s, p0/m, z25.s
-; CHECK-NEXT:    movprfx z30, z3
-; CHECK-NEXT:    frintx z30.s, p0/m, z3.s
-; CHECK-NEXT:    mov z27.s, w8
-; CHECK-NEXT:    fcmge p1.s, p0/z, z5.s, z4.s
-; CHECK-NEXT:    fcmge p2.s, p0/z, z6.s, z4.s
-; CHECK-NEXT:    movprfx z1, z5
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z5.s
-; CHECK-NEXT:    movprfx z2, z6
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z6.s
-; CHECK-NEXT:    fcmge p5.s, p0/z, z7.s, z4.s
-; CHECK-NEXT:    fcmge p6.s, p0/z, z28.s, z4.s
-; CHECK-NEXT:    movprfx z3, z7
-; CHECK-NEXT:    fcvtzs z3.d, p0/m, z7.s
-; CHECK-NEXT:    fcmge p8.s, p0/z, z29.s, z4.s
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z5.s, z27.s
-; CHECK-NEXT:    fcmgt p7.s, p0/z, z6.s, z27.s
-; CHECK-NEXT:    fcmge p9.s, p0/z, z25.s, z4.s
-; CHECK-NEXT:    movprfx z31, z25
-; CHECK-NEXT:    fcvtzs z31.d, p0/m, z25.s
-; CHECK-NEXT:    not p4.b, p0/z, p1.b
-; CHECK-NEXT:    fcmuo p1.s, p0/z, z5.s, z5.s
-; CHECK-NEXT:    movprfx z5, z28
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z28.s
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    fcmge p10.s, p0/z, z30.s, z4.s
-; CHECK-NEXT:    movprfx z8, z30
-; CHECK-NEXT:    fcvtzs z8.d, p0/m, z30.s
-; CHECK-NEXT:    mov z1.d, p4/m, z0.d
-; CHECK-NEXT:    fcmge p4.s, p0/z, z24.s, z4.s
-; CHECK-NEXT:    movprfx z4, z29
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z29.s
-; CHECK-NEXT:    mov z2.d, p2/m, z0.d
-; CHECK-NEXT:    fcmuo p2.s, p0/z, z6.s, z6.s
-; CHECK-NEXT:    movprfx z6, z24
-; CHECK-NEXT:    fcvtzs z6.d, p0/m, z24.s
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    not p6.b, p0/z, p6.b
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    mov z3.d, p5/m, z0.d
-; CHECK-NEXT:    not p5.b, p0/z, p8.b
-; CHECK-NEXT:    mov z5.d, p6/m, z0.d
-; CHECK-NEXT:    fcmgt p8.s, p0/z, z7.s, z27.s
-; CHECK-NEXT:    not p6.b, p0/z, p9.b
-; CHECK-NEXT:    mov z6.d, p4/m, z0.d
-; CHECK-NEXT:    fcmuo p9.s, p0/z, z7.s, z7.s
-; CHECK-NEXT:    not p4.b, p0/z, p10.b
-; CHECK-NEXT:    fcmgt p10.s, p0/z, z28.s, z27.s
-; CHECK-NEXT:    sel z7.d, p5, z0.d, z4.d
-; CHECK-NEXT:    fcmgt p5.s, p0/z, z24.s, z27.s
-; CHECK-NEXT:    mov z31.d, p6/m, z0.d
-; CHECK-NEXT:    fcmgt p6.s, p0/z, z30.s, z27.s
-; CHECK-NEXT:    mov z8.d, p4/m, z0.d
-; CHECK-NEXT:    sel z0.d, p3, z26.d, z1.d
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z29.s, z27.s
-; CHECK-NEXT:    fcmgt p4.s, p0/z, z25.s, z27.s
-; CHECK-NEXT:    sel z1.d, p7, z26.d, z2.d
-; CHECK-NEXT:    fcmuo p7.s, p0/z, z28.s, z28.s
-; CHECK-NEXT:    sel z2.d, p8, z26.d, z3.d
-; CHECK-NEXT:    sel z3.d, p10, z26.d, z5.d
-; CHECK-NEXT:    fcmuo p8.s, p0/z, z29.s, z29.s
-; CHECK-NEXT:    sel z4.d, p5, z26.d, z6.d
-; CHECK-NEXT:    fcmuo p5.s, p0/z, z24.s, z24.s
-; CHECK-NEXT:    fcmuo p10.s, p0/z, z25.s, z25.s
-; CHECK-NEXT:    sel z5.d, p3, z26.d, z7.d
-; CHECK-NEXT:    fcmuo p0.s, p0/z, z30.s, z30.s
-; CHECK-NEXT:    sel z7.d, p6, z26.d, z8.d
-; CHECK-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z26.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z4.s, p0/m, z4.s
+; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-NEXT:    mov z30.s, w8
+; CHECK-NEXT:    movprfx z27, z2
+; CHECK-NEXT:    frintx z27.s, p0/m, z2.s
+; CHECK-NEXT:    uunpkhi z2.d, z3.s
+; CHECK-NEXT:    frintx z6.s, p0/m, z6.s
+; CHECK-NEXT:    movprfx z25, z1
+; CHECK-NEXT:    frintx z25.s, p0/m, z1.s
+; CHECK-NEXT:    frintx z5.s, p0/m, z5.s
+; CHECK-NEXT:    frintx z7.s, p0/m, z7.s
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
+; CHECK-NEXT:    mov z28.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p1.s, p0/z, z4.s, z24.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z24.s
+; CHECK-NEXT:    mov z29.d, #0x8000000000000000
+; CHECK-NEXT:    movprfx z9, z2
+; CHECK-NEXT:    frintx z9.s, p0/m, z2.s
+; CHECK-NEXT:    fcmge p5.s, p0/z, z6.s, z24.s
+; CHECK-NEXT:    mov z8.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p3.s, p0/z, z5.s, z24.s
+; CHECK-NEXT:    fcmge p4.s, p0/z, z25.s, z24.s
+; CHECK-NEXT:    fcmge p7.s, p0/z, z7.s, z24.s
+; CHECK-NEXT:    fcmge p6.s, p0/z, z27.s, z24.s
+; CHECK-NEXT:    mov z31.d, #0x8000000000000000
+; CHECK-NEXT:    mov z10.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z4.s
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z4.s, z30.s
+; CHECK-NEXT:    fcvtzs z26.d, p2/m, z0.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z9.s, z24.s
+; CHECK-NEXT:    mov z24.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z29.d, p5/m, z6.s
+; CHECK-NEXT:    fcvtzs z3.d, p3/m, z5.s
+; CHECK-NEXT:    fcvtzs z28.d, p4/m, z25.s
+; CHECK-NEXT:    fcvtzs z8.d, p7/m, z7.s
+; CHECK-NEXT:    fcmgt p4.s, p0/z, z0.s, z30.s
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z5.s, z30.s
+; CHECK-NEXT:    fcmgt p7.s, p0/z, z25.s, z30.s
+; CHECK-NEXT:    fcmgt p8.s, p0/z, z6.s, z30.s
+; CHECK-NEXT:    fcvtzs z31.d, p6/m, z27.s
+; CHECK-NEXT:    fcmuo p6.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    sel z0.d, p1, z10.d, z1.d
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z27.s, z30.s
+; CHECK-NEXT:    fcmgt p10.s, p0/z, z7.s, z30.s
+; CHECK-NEXT:    fcvtzs z24.d, p2/m, z9.s
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z9.s, z30.s
+; CHECK-NEXT:    fcmuo p3.s, p0/z, z4.s, z4.s
+; CHECK-NEXT:    fcmuo p9.s, p0/z, z5.s, z5.s
+; CHECK-NEXT:    sel z1.d, p4, z10.d, z26.d
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z25.s, z25.s
+; CHECK-NEXT:    sel z2.d, p5, z10.d, z3.d
+; CHECK-NEXT:    sel z3.d, p7, z10.d, z28.d
+; CHECK-NEXT:    sel z4.d, p8, z10.d, z29.d
+; CHECK-NEXT:    fcmuo p5.s, p0/z, z6.s, z6.s
+; CHECK-NEXT:    fcmuo p7.s, p0/z, z27.s, z27.s
+; CHECK-NEXT:    fcmuo p8.s, p0/z, z7.s, z7.s
+; CHECK-NEXT:    sel z5.d, p1, z10.d, z31.d
+; CHECK-NEXT:    sel z6.d, p10, z10.d, z8.d
+; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z9.s, z9.s
+; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    sel z7.d, p2, z10.d, z24.d
+; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p6/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z6.d, p4, z26.d, z31.d
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z2.d, p9/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    mov z3.d, p4/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z4.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    mov z5.d, p8/m, #0 // =0x0
+; CHECK-NEXT:    mov z5.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z6.d, p8/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z6.d, p10/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z7.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %a = call <vscale x 16 x i64> @llvm.llrint.nxv16i64.nxv16f32(<vscale x 16 x float> %x)
@@ -915,6 +793,8 @@ define <vscale x 32 x i64> @llrint_v32i64_v32f32(<vscale x 32 x float> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-17
+; CHECK-NEXT:    str p11, [sp] // 2-byte Folded Spill
+; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
@@ -937,8 +817,8 @@ define <vscale x 32 x i64> @llrint_v32i64_v32f32(<vscale x 32 x float> %x) {
 ; CHECK-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-3
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
@@ -949,224 +829,185 @@ define <vscale x 32 x i64> @llrint_v32i64_v32f32(<vscale x 32 x float> %x) {
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
 ; CHECK-NEXT:    uunpklo z24.d, z0.s
-; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpkhi z25.d, z0.s
 ; CHECK-NEXT:    mov w9, #-553648128 // =0xdf000000
 ; CHECK-NEXT:    uunpklo z26.d, z1.s
-; CHECK-NEXT:    uunpkhi z25.d, z0.s
-; CHECK-NEXT:    uunpkhi z28.d, z1.s
-; CHECK-NEXT:    mov z29.s, w9
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z27.d, z2.s
+; CHECK-NEXT:    uunpkhi z9.d, z2.s
+; CHECK-NEXT:    uunpklo z11.d, z3.s
+; CHECK-NEXT:    uunpkhi z12.d, z3.s
+; CHECK-NEXT:    mov z10.s, w9
 ; CHECK-NEXT:    mov w9, #1593835519 // =0x5effffff
-; CHECK-NEXT:    mov z17.d, z5.d
-; CHECK-NEXT:    mov z27.d, #0x8000000000000000
-; CHECK-NEXT:    uunpkhi z30.d, z2.s
-; CHECK-NEXT:    uunpklo z8.d, z3.s
 ; CHECK-NEXT:    movprfx z0, z24
 ; CHECK-NEXT:    frintx z0.s, p0/m, z24.s
-; CHECK-NEXT:    uunpkhi z9.d, z3.s
+; CHECK-NEXT:    movprfx z24, z25
+; CHECK-NEXT:    frintx z24.s, p0/m, z25.s
+; CHECK-NEXT:    uunpklo z13.d, z4.s
+; CHECK-NEXT:    movprfx z25, z26
+; CHECK-NEXT:    frintx z25.s, p0/m, z26.s
+; CHECK-NEXT:    movprfx z26, z1
+; CHECK-NEXT:    frintx z26.s, p0/m, z1.s
 ; CHECK-NEXT:    uunpkhi z14.d, z4.s
-; CHECK-NEXT:    movprfx z24, z26
-; CHECK-NEXT:    frintx z24.s, p0/m, z26.s
-; CHECK-NEXT:    movprfx z1, z25
-; CHECK-NEXT:    frintx z1.s, p0/m, z25.s
-; CHECK-NEXT:    movprfx z5, z28
-; CHECK-NEXT:    frintx z5.s, p0/m, z28.s
-; CHECK-NEXT:    uunpklo z26.d, z2.s
-; CHECK-NEXT:    uunpklo z16.d, z17.s
-; CHECK-NEXT:    mov z25.s, w9
-; CHECK-NEXT:    movprfx z28, z30
-; CHECK-NEXT:    frintx z28.s, p0/m, z30.s
-; CHECK-NEXT:    movprfx z30, z8
-; CHECK-NEXT:    frintx z30.s, p0/m, z8.s
-; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z29.s
-; CHECK-NEXT:    movprfx z31, z0
-; CHECK-NEXT:    fcvtzs z31.d, p0/m, z0.s
-; CHECK-NEXT:    str z0, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    fcmge p2.s, p0/z, z1.s, z29.s
-; CHECK-NEXT:    fcmge p3.s, p0/z, z24.s, z29.s
-; CHECK-NEXT:    fcmge p5.s, p0/z, z5.s, z29.s
-; CHECK-NEXT:    frintx z26.s, p0/m, z26.s
-; CHECK-NEXT:    movprfx z10, z1
-; CHECK-NEXT:    fcvtzs z10.d, p0/m, z1.s
-; CHECK-NEXT:    movprfx z11, z24
-; CHECK-NEXT:    fcvtzs z11.d, p0/m, z24.s
-; CHECK-NEXT:    movprfx z12, z5
-; CHECK-NEXT:    fcvtzs z12.d, p0/m, z5.s
-; CHECK-NEXT:    movprfx z15, z28
-; CHECK-NEXT:    fcvtzs z15.d, p0/m, z28.s
-; CHECK-NEXT:    str z1, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    not p4.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z1.s, z25.s
-; CHECK-NEXT:    fcmgt p9.s, p0/z, z5.s, z25.s
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    sel z0.d, p4, z27.d, z31.d
-; CHECK-NEXT:    fcmge p4.s, p0/z, z26.s, z29.s
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    movprfx z13, z26
-; CHECK-NEXT:    fcvtzs z13.d, p0/m, z26.s
-; CHECK-NEXT:    sel z31.d, p2, z27.d, z10.d
-; CHECK-NEXT:    uunpklo z10.d, z4.s
-; CHECK-NEXT:    sel z8.d, p3, z27.d, z11.d
-; CHECK-NEXT:    fcmge p3.s, p0/z, z28.s, z29.s
-; CHECK-NEXT:    sel z11.d, p5, z27.d, z12.d
-; CHECK-NEXT:    movprfx z4, z9
-; CHECK-NEXT:    frintx z4.s, p0/m, z9.s
-; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    not p5.b, p0/z, p4.b
-; CHECK-NEXT:    fcmge p4.s, p0/z, z30.s, z29.s
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z24.s, z25.s
-; CHECK-NEXT:    sel z12.d, p5, z27.d, z13.d
-; CHECK-NEXT:    uunpkhi z13.d, z17.s
-; CHECK-NEXT:    movprfx z9, z10
-; CHECK-NEXT:    frintx z9.s, p0/m, z10.s
-; CHECK-NEXT:    movprfx z10, z14
-; CHECK-NEXT:    frintx z10.s, p0/m, z14.s
-; CHECK-NEXT:    uunpkhi z17.d, z6.s
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    uunpklo z14.d, z6.s
-; CHECK-NEXT:    movprfx z6, z16
-; CHECK-NEXT:    frintx z6.s, p0/m, z16.s
-; CHECK-NEXT:    uunpklo z16.d, z7.s
+; CHECK-NEXT:    movprfx z2, z27
+; CHECK-NEXT:    frintx z2.s, p0/m, z27.s
+; CHECK-NEXT:    mov z31.d, #0x8000000000000000
+; CHECK-NEXT:    movprfx z27, z9
+; CHECK-NEXT:    frintx z27.s, p0/m, z9.s
+; CHECK-NEXT:    movprfx z9, z11
+; CHECK-NEXT:    frintx z9.s, p0/m, z11.s
+; CHECK-NEXT:    movprfx z11, z12
+; CHECK-NEXT:    frintx z11.s, p0/m, z12.s
+; CHECK-NEXT:    uunpklo z15.d, z7.s
 ; CHECK-NEXT:    uunpkhi z7.d, z7.s
-; CHECK-NEXT:    sel z3.d, p3, z27.d, z15.d
-; CHECK-NEXT:    fcmge p3.s, p0/z, z4.s, z29.s
-; CHECK-NEXT:    frintx z13.s, p0/m, z13.s
-; CHECK-NEXT:    movprfx z15, z30
-; CHECK-NEXT:    fcvtzs z15.d, p0/m, z30.s
-; CHECK-NEXT:    fcmge p5.s, p0/z, z9.s, z29.s
-; CHECK-NEXT:    fcmge p6.s, p0/z, z10.s, z29.s
-; CHECK-NEXT:    frintx z17.s, p0/m, z17.s
-; CHECK-NEXT:    movprfx z18, z4
-; CHECK-NEXT:    fcvtzs z18.d, p0/m, z4.s
-; CHECK-NEXT:    movprfx z20, z10
-; CHECK-NEXT:    fcvtzs z20.d, p0/m, z10.s
-; CHECK-NEXT:    frintx z16.s, p0/m, z16.s
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    movprfx z19, z14
-; CHECK-NEXT:    frintx z19.s, p0/m, z14.s
-; CHECK-NEXT:    movprfx z14, z9
-; CHECK-NEXT:    fcvtzs z14.d, p0/m, z9.s
-; CHECK-NEXT:    fcmge p7.s, p0/z, z6.s, z29.s
-; CHECK-NEXT:    fcmge p8.s, p0/z, z13.s, z29.s
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    mov z29.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p3.s, p0/z, z26.s, z10.s
+; CHECK-NEXT:    mov z30.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p5.s, p0/z, z0.s, z10.s
+; CHECK-NEXT:    mov z8.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p1.s, p0/z, z24.s, z10.s
+; CHECK-NEXT:    movprfx z12, z13
+; CHECK-NEXT:    frintx z12.s, p0/m, z13.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z25.s, z10.s
+; CHECK-NEXT:    fcmge p4.s, p0/z, z2.s, z10.s
+; CHECK-NEXT:    movprfx z13, z14
+; CHECK-NEXT:    frintx z13.s, p0/m, z14.s
+; CHECK-NEXT:    uunpklo z17.d, z5.s
+; CHECK-NEXT:    uunpkhi z18.d, z5.s
 ; CHECK-NEXT:    movprfx z21, z7
 ; CHECK-NEXT:    frintx z21.s, p0/m, z7.s
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    not p6.b, p0/z, p6.b
-; CHECK-NEXT:    mov z15.d, p4/m, z27.d
-; CHECK-NEXT:    fcmge p4.s, p0/z, z17.s, z29.s
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    sel z7.d, p3, z27.d, z18.d
-; CHECK-NEXT:    movprfx z0, z17
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z17.s
-; CHECK-NEXT:    sel z18.d, p6, z27.d, z20.d
-; CHECK-NEXT:    movprfx z20, z6
-; CHECK-NEXT:    fcvtzs z20.d, p0/m, z6.s
-; CHECK-NEXT:    fcmge p6.s, p0/z, z16.s, z29.s
-; CHECK-NEXT:    fcmge p3.s, p0/z, z19.s, z29.s
-; CHECK-NEXT:    mov z14.d, p5/m, z27.d
-; CHECK-NEXT:    not p5.b, p0/z, p7.b
-; CHECK-NEXT:    not p7.b, p0/z, p8.b
-; CHECK-NEXT:    fcmge p8.s, p0/z, z21.s, z29.s
-; CHECK-NEXT:    movprfx z1, z16
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z16.s
-; CHECK-NEXT:    movprfx z22, z13
-; CHECK-NEXT:    fcvtzs z22.d, p0/m, z13.s
-; CHECK-NEXT:    movprfx z23, z19
-; CHECK-NEXT:    fcvtzs z23.d, p0/m, z19.s
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    movprfx z2, z21
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z21.s
-; CHECK-NEXT:    mov z29.d, #0x7fffffffffffffff
-; CHECK-NEXT:    mov z20.d, p5/m, z27.d
-; CHECK-NEXT:    not p5.b, p0/z, p6.b
-; CHECK-NEXT:    mov z0.d, p4/m, z27.d
-; CHECK-NEXT:    fcmgt p4.s, p0/z, z16.s, z25.s
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    not p6.b, p0/z, p8.b
-; CHECK-NEXT:    mov z1.d, p5/m, z27.d
-; CHECK-NEXT:    mov z22.d, p7/m, z27.d
-; CHECK-NEXT:    mov z23.d, p3/m, z27.d
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z21.s, z25.s
-; CHECK-NEXT:    fcmuo p5.s, p0/z, z16.s, z16.s
-; CHECK-NEXT:    mov z2.d, p6/m, z27.d
-; CHECK-NEXT:    sel z27.d, p1, z29.d, z31.d
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z17.s, z25.s
-; CHECK-NEXT:    mov z1.d, p4/m, z29.d
-; CHECK-NEXT:    fcmgt p6.s, p0/z, z26.s, z25.s
-; CHECK-NEXT:    fcmgt p7.s, p0/z, z30.s, z25.s
-; CHECK-NEXT:    sel z31.d, p2, z29.d, z8.d
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z13.s, z25.s
-; CHECK-NEXT:    fcmuo p8.s, p0/z, z21.s, z21.s
-; CHECK-NEXT:    mov z2.d, p3/m, z29.d
-; CHECK-NEXT:    fcmuo p4.s, p0/z, z17.s, z17.s
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z19.s, z25.s
-; CHECK-NEXT:    mov z0.d, p1/m, z29.d
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z6.s, z25.s
-; CHECK-NEXT:    mov z1.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    sel z8.d, p9, z29.d, z11.d
-; CHECK-NEXT:    sel z11.d, p6, z29.d, z12.d
-; CHECK-NEXT:    sel z12.d, p7, z29.d, z15.d
-; CHECK-NEXT:    fcmgt p5.s, p0/z, z10.s, z25.s
-; CHECK-NEXT:    sel z15.d, p2, z29.d, z22.d
-; CHECK-NEXT:    fcmuo p2.s, p0/z, z13.s, z13.s
-; CHECK-NEXT:    str z1, [x8, #14, mul vl]
-; CHECK-NEXT:    mov z2.d, p8/m, #0 // =0x0
+; CHECK-NEXT:    uunpklo z19.d, z6.s
+; CHECK-NEXT:    uunpkhi z20.d, z6.s
+; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    fcvtzs z31.d, p3/m, z26.s
+; CHECK-NEXT:    fcmge p3.s, p0/z, z11.s, z10.s
+; CHECK-NEXT:    mov z5.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z15.s, p0/m, z15.s
+; CHECK-NEXT:    fcvtzs z1.d, p5/m, z0.s
+; CHECK-NEXT:    fcvtzs z29.d, p1/m, z24.s
+; CHECK-NEXT:    fcvtzs z30.d, p2/m, z25.s
+; CHECK-NEXT:    fcvtzs z8.d, p4/m, z2.s
+; CHECK-NEXT:    fcmge p1.s, p0/z, z27.s, z10.s
+; CHECK-NEXT:    mov z4.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p2.s, p0/z, z9.s, z10.s
+; CHECK-NEXT:    mov z16.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p4.s, p0/z, z12.s, z10.s
+; CHECK-NEXT:    mov z6.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p5.s, p0/z, z13.s, z10.s
+; CHECK-NEXT:    mov z14.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z17.s, p0/m, z17.s
+; CHECK-NEXT:    frintx z18.s, p0/m, z18.s
+; CHECK-NEXT:    frintx z19.s, p0/m, z19.s
+; CHECK-NEXT:    frintx z20.s, p0/m, z20.s
+; CHECK-NEXT:    mov z28.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z5.d, p3/m, z11.s
+; CHECK-NEXT:    fcmge p3.s, p0/z, z21.s, z10.s
+; CHECK-NEXT:    mov z3.s, w9
+; CHECK-NEXT:    fcmge p6.s, p0/z, z15.s, z10.s
+; CHECK-NEXT:    mov z22.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z4.d, p1/m, z27.s
+; CHECK-NEXT:    fcvtzs z16.d, p2/m, z9.s
+; CHECK-NEXT:    fcvtzs z6.d, p4/m, z12.s
+; CHECK-NEXT:    fcvtzs z14.d, p5/m, z13.s
+; CHECK-NEXT:    fcmge p1.s, p0/z, z17.s, z10.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z18.s, z10.s
+; CHECK-NEXT:    mov z23.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p4.s, p0/z, z19.s, z10.s
+; CHECK-NEXT:    fcmge p5.s, p0/z, z20.s, z10.s
+; CHECK-NEXT:    mov z10.d, #0x8000000000000000
+; CHECK-NEXT:    mov z0.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z28.d, p3/m, z21.s
+; CHECK-NEXT:    mov z7.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p11.s, p0/z, z21.s, z3.s
+; CHECK-NEXT:    fcvtzs z22.d, p6/m, z15.s
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z15.s, z3.s
+; CHECK-NEXT:    fcmuo p6.s, p0/z, z15.s, z15.s
+; CHECK-NEXT:    mov z15.d, #0x8000000000000000
+; CHECK-NEXT:    fcmgt p7.s, p0/z, z24.s, z3.s
+; CHECK-NEXT:    fcvtzs z23.d, p2/m, z18.s
+; CHECK-NEXT:    fcvtzs z10.d, p5/m, z20.s
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z9.s, z3.s
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z20.s, z3.s
+; CHECK-NEXT:    fcvtzs z0.d, p4/m, z19.s
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z21.s, z21.s
+; CHECK-NEXT:    mov z28.d, p11/m, z7.d
+; CHECK-NEXT:    sel z21.d, p3, z7.d, z22.d
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z19.s, z3.s
+; CHECK-NEXT:    fcvtzs z15.d, p1/m, z17.s
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z20.s, z20.s
+; CHECK-NEXT:    mov z29.d, p7/m, z7.d
+; CHECK-NEXT:    fcmgt p7.s, p0/z, z18.s, z3.s
+; CHECK-NEXT:    mov z16.d, p2/m, z7.d
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z17.s, z3.s
+; CHECK-NEXT:    mov z10.d, p5/m, z7.d
+; CHECK-NEXT:    mov z28.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z19.s, z19.s
+; CHECK-NEXT:    mov z0.d, p3/m, z7.d
+; CHECK-NEXT:    fcmuo p3.s, p0/z, z18.s, z18.s
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z13.s, z3.s
+; CHECK-NEXT:    mov z21.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p10.s, p0/z, z2.s, z3.s
+; CHECK-NEXT:    fcmgt p8.s, p0/z, z25.s, z3.s
+; CHECK-NEXT:    str z28, [x8, #15, mul vl]
+; CHECK-NEXT:    mov z10.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z17.s, z17.s
+; CHECK-NEXT:    sel z19.d, p7, z7.d, z23.d
+; CHECK-NEXT:    sel z28.d, p2, z7.d, z15.d
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z12.s, z3.s
+; CHECK-NEXT:    str z21, [x8, #14, mul vl]
 ; CHECK-NEXT:    mov z0.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    sel z1.d, p1, z29.d, z20.d
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z9.s, z25.s
-; CHECK-NEXT:    fcmuo p6.s, p0/z, z19.s, z19.s
-; CHECK-NEXT:    sel z16.d, p3, z29.d, z23.d
-; CHECK-NEXT:    fcmuo p3.s, p0/z, z6.s, z6.s
-; CHECK-NEXT:    fcmgt p4.s, p0/z, z4.s, z25.s
-; CHECK-NEXT:    str z2, [x8, #15, mul vl]
-; CHECK-NEXT:    sel z2.d, p5, z29.d, z18.d
-; CHECK-NEXT:    fcmuo p5.s, p0/z, z10.s, z10.s
-; CHECK-NEXT:    str z0, [x8, #13, mul vl]
-; CHECK-NEXT:    mov z15.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p2.s, p0/z, z9.s, z9.s
-; CHECK-NEXT:    sel z0.d, p1, z29.d, z14.d
-; CHECK-NEXT:    mov z16.d, p6/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p1.s, p0/z, z4.s, z4.s
-; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z28.s, z25.s
-; CHECK-NEXT:    sel z4.d, p4, z29.d, z7.d
-; CHECK-NEXT:    str z15, [x8, #11, mul vl]
-; CHECK-NEXT:    mov z2.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p5.s, p0/z, z28.s, z28.s
-; CHECK-NEXT:    str z16, [x8, #12, mul vl]
-; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p4.s, p0/z, z30.s, z30.s
-; CHECK-NEXT:    str z1, [x8, #10, mul vl]
-; CHECK-NEXT:    mov z4.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p1.s, p0/z, z5.s, z5.s
-; CHECK-NEXT:    sel z1.d, p3, z29.d, z3.d
-; CHECK-NEXT:    ldr z3, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    str z2, [x8, #9, mul vl]
+; CHECK-NEXT:    mov z14.d, p5/m, z7.d
+; CHECK-NEXT:    str z10, [x8, #13, mul vl]
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z11.s, z3.s
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z13.s, z13.s
+; CHECK-NEXT:    mov z19.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z28.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z27.s, z3.s
+; CHECK-NEXT:    str z0, [x8, #12, mul vl]
+; CHECK-NEXT:    fcmuo p3.s, p0/z, z12.s, z12.s
+; CHECK-NEXT:    sel z0.d, p2, z7.d, z6.d
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z11.s, z11.s
+; CHECK-NEXT:    fcmgt p9.s, p0/z, z26.s, z3.s
+; CHECK-NEXT:    mov z30.d, p8/m, z7.d
+; CHECK-NEXT:    str z19, [x8, #11, mul vl]
+; CHECK-NEXT:    mov z5.d, p5/m, z7.d
+; CHECK-NEXT:    fcmuo p5.s, p0/z, z9.s, z9.s
+; CHECK-NEXT:    str z28, [x8, #10, mul vl]
+; CHECK-NEXT:    mov z4.d, p1/m, z7.d
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z2.s, z2.s
+; CHECK-NEXT:    ldr z2, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z14.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z27.s, z27.s
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z5.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z26.s, z26.s
+; CHECK-NEXT:    mov z16.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p5.s, p0/z, z25.s, z25.s
+; CHECK-NEXT:    mov z31.d, p9/m, z7.d
+; CHECK-NEXT:    str z14, [x8, #9, mul vl]
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z2.s, z3.s
+; CHECK-NEXT:    mov z8.d, p10/m, z7.d
 ; CHECK-NEXT:    str z0, [x8, #8, mul vl]
-; CHECK-NEXT:    fcmuo p3.s, p0/z, z26.s, z26.s
-; CHECK-NEXT:    ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    str z4, [x8, #7, mul vl]
-; CHECK-NEXT:    mov z12.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z3.s, z25.s
-; CHECK-NEXT:    mov z1.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    mov z4.d, p4/m, #0 // =0x0
 ; CHECK-NEXT:    fcmuo p4.s, p0/z, z24.s, z24.s
+; CHECK-NEXT:    str z5, [x8, #7, mul vl]
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z2.s, z2.s
+; CHECK-NEXT:    mov z31.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    str z16, [x8, #6, mul vl]
 ; CHECK-NEXT:    mov z8.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p5.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    str z12, [x8, #6, mul vl]
-; CHECK-NEXT:    str z1, [x8, #5, mul vl]
-; CHECK-NEXT:    fcmuo p0.s, p0/z, z3.s, z3.s
-; CHECK-NEXT:    mov z11.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    str z8, [x8, #3, mul vl]
-; CHECK-NEXT:    mov z31.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p2/m, z29.d
-; CHECK-NEXT:    str z11, [x8, #4, mul vl]
-; CHECK-NEXT:    mov z27.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    str z31, [x8, #2, mul vl]
+; CHECK-NEXT:    mov z30.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    str z4, [x8, #5, mul vl]
+; CHECK-NEXT:    sel z0.d, p3, z7.d, z1.d
+; CHECK-NEXT:    str z31, [x8, #3, mul vl]
+; CHECK-NEXT:    mov z29.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    str z8, [x8, #4, mul vl]
+; CHECK-NEXT:    str z30, [x8, #2, mul vl]
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    str z27, [x8, #1, mul vl]
+; CHECK-NEXT:    str z29, [x8, #1, mul vl]
 ; CHECK-NEXT:    str z0, [x8]
-; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1183,6 +1024,8 @@ define <vscale x 32 x i64> @llrint_v32i64_v32f32(<vscale x 32 x float> %x) {
 ; CHECK-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
@@ -1202,20 +1045,17 @@ define <vscale x 1 x i64> @llrint_v1i64_v1f64(<vscale x 1 x double> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
 ; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
-; CHECK-NEXT:    mov z3.d, x8
+; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z3.d
-; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z2.d
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 1 x i64> @llvm.llrint.nxv1i64.nxv1f64(<vscale x 1 x double> %x)
@@ -1228,20 +1068,17 @@ define <vscale x 2 x i64> @llrint_v2i64_v2f64(<vscale x 2 x double> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
 ; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
-; CHECK-NEXT:    mov z3.d, x8
+; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z3.d
-; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z2.d
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x i64> @llvm.llrint.nxv2i64.nxv2f64(<vscale x 2 x double> %x)
@@ -1252,41 +1089,28 @@ declare <vscale x 2 x i64> @llvm.llrint.nxv2i64.nxv2f64(<vscale x 2 x double>)
 define <vscale x 4 x i64> @llrint_v4i64_v4f64(<vscale x 4 x double> %x) {
 ; CHECK-LABEL: llrint_v4i64_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
-; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    mov z4.d, #0x8000000000000000
 ; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
 ; CHECK-NEXT:    frintx z1.d, p0/m, z1.d
-; CHECK-NEXT:    mov z3.d, x8
+; CHECK-NEXT:    mov z5.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z2.d
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, z2.d
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z0.d
-; CHECK-NEXT:    movprfx z5, z1
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z1.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z0.d, z3.d
-; CHECK-NEXT:    fcmgt p4.d, p0/z, z1.d, z3.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    sel z3.d, p1, z2.d, z4.d
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    fcvtzs z3.d, p1/m, z0.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z5.d
+; CHECK-NEXT:    fcvtzs z4.d, p2/m, z1.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z1.d, z5.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z1.d, z1.d
-; CHECK-NEXT:    sel z2.d, p2, z2.d, z5.d
-; CHECK-NEXT:    sel z0.d, p3, z6.d, z3.d
-; CHECK-NEXT:    sel z1.d, p4, z6.d, z2.d
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z3.d
+; CHECK-NEXT:    sel z1.d, p2, z2.d, z4.d
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x i64> @llvm.llrint.nxv4i64.nxv4f64(<vscale x 4 x double> %x)
   ret <vscale x 4 x i64> %a
@@ -1298,7 +1122,6 @@ define <vscale x 8 x i64> @llrint_v8i64_v8f64(<vscale x 8 x double> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
@@ -1308,52 +1131,42 @@ define <vscale x 8 x i64> @llrint_v8i64_v8f64(<vscale x 8 x double> %x) {
 ; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
 ; CHECK-NEXT:    mov z5.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    mov z6.d, #0x8000000000000000
 ; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
-; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
 ; CHECK-NEXT:    frintx z1.d, p0/m, z1.d
 ; CHECK-NEXT:    frintx z2.d, p0/m, z2.d
 ; CHECK-NEXT:    frintx z3.d, p0/m, z3.d
-; CHECK-NEXT:    mov z6.d, x8
+; CHECK-NEXT:    mov z25.d, x8
+; CHECK-NEXT:    mov z7.d, #0x8000000000000000
+; CHECK-NEXT:    mov z24.d, #0x8000000000000000
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, z4.d
 ; CHECK-NEXT:    fcmge p3.d, p0/z, z2.d, z4.d
 ; CHECK-NEXT:    fcmge p4.d, p0/z, z3.d, z4.d
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z0.d
-; CHECK-NEXT:    movprfx z7, z1
-; CHECK-NEXT:    fcvtzs z7.d, p0/m, z1.d
-; CHECK-NEXT:    movprfx z24, z2
-; CHECK-NEXT:    fcvtzs z24.d, p0/m, z2.d
-; CHECK-NEXT:    movprfx z25, z3
-; CHECK-NEXT:    fcvtzs z25.d, p0/m, z3.d
-; CHECK-NEXT:    fcmgt p7.d, p0/z, z2.d, z6.d
-; CHECK-NEXT:    fcmgt p5.d, p0/z, z0.d, z6.d
-; CHECK-NEXT:    fcmgt p6.d, p0/z, z1.d, z6.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    mov z4.d, p1/m, z5.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z3.d, z6.d
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    sel z6.d, p2, z5.d, z7.d
-; CHECK-NEXT:    fcmuo p2.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    sel z7.d, p3, z5.d, z24.d
-; CHECK-NEXT:    fcmuo p3.d, p0/z, z1.d, z1.d
-; CHECK-NEXT:    sel z5.d, p4, z5.d, z25.d
-; CHECK-NEXT:    fcmuo p4.d, p0/z, z2.d, z2.d
+; CHECK-NEXT:    mov z4.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z3.d, z25.d
+; CHECK-NEXT:    fcmuo p6.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    fcvtzs z5.d, p1/m, z0.d
+; CHECK-NEXT:    fcvtzs z6.d, p2/m, z1.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z25.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z1.d, z25.d
+; CHECK-NEXT:    fcvtzs z7.d, p3/m, z2.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z2.d, z25.d
+; CHECK-NEXT:    fcvtzs z24.d, p4/m, z3.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    sel z0.d, p1, z4.d, z5.d
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z2.d, z2.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z3.d, z3.d
-; CHECK-NEXT:    sel z0.d, p5, z26.d, z4.d
-; CHECK-NEXT:    sel z1.d, p6, z26.d, z6.d
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z2.d, p7, z26.d, z7.d
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z3.d, p1, z26.d, z5.d
+; CHECK-NEXT:    sel z1.d, p2, z4.d, z6.d
+; CHECK-NEXT:    sel z2.d, p3, z4.d, z7.d
+; CHECK-NEXT:    sel z3.d, p5, z4.d, z24.d
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p4/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1367,7 +1180,7 @@ define <vscale x 16 x i64> @llrint_v16f64(<vscale x 16 x double> %x) {
 ; CHECK-LABEL: llrint_v16f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    addvl sp, sp, #-3
 ; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
@@ -1375,109 +1188,93 @@ define <vscale x 16 x i64> @llrint_v16f64(<vscale x 16 x double> %x) {
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
-; CHECK-NEXT:    mov z24.d, #0x7fffffffffffffff
-; CHECK-NEXT:    mov z25.d, x8
+; CHECK-NEXT:    mov z26.d, #0x8000000000000000
+; CHECK-NEXT:    mov z24.d, x8
 ; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
-; CHECK-NEXT:    movprfx z26, z0
-; CHECK-NEXT:    frintx z26.d, p0/m, z0.d
-; CHECK-NEXT:    movprfx z27, z1
-; CHECK-NEXT:    frintx z27.d, p0/m, z1.d
+; CHECK-NEXT:    mov z27.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    frintx z1.d, p0/m, z1.d
+; CHECK-NEXT:    movprfx z25, z4
+; CHECK-NEXT:    frintx z25.d, p0/m, z4.d
 ; CHECK-NEXT:    frintx z2.d, p0/m, z2.d
-; CHECK-NEXT:    mov z0.d, #0x8000000000000000
-; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    frintx z3.d, p0/m, z3.d
-; CHECK-NEXT:    movprfx z28, z4
-; CHECK-NEXT:    frintx z28.d, p0/m, z4.d
 ; CHECK-NEXT:    frintx z5.d, p0/m, z5.d
 ; CHECK-NEXT:    frintx z6.d, p0/m, z6.d
+; CHECK-NEXT:    mov z30.d, x8
+; CHECK-NEXT:    mov z4.d, #0x8000000000000000
 ; CHECK-NEXT:    frintx z7.d, p0/m, z7.d
-; CHECK-NEXT:    fcmge p1.d, p0/z, z26.d, z25.d
-; CHECK-NEXT:    fcmge p2.d, p0/z, z27.d, z25.d
-; CHECK-NEXT:    movprfx z4, z26
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z26.d
-; CHECK-NEXT:    fcmge p5.d, p0/z, z2.d, z25.d
-; CHECK-NEXT:    movprfx z29, z27
-; CHECK-NEXT:    fcvtzs z29.d, p0/m, z27.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z26.d, z1.d
-; CHECK-NEXT:    fcmge p6.d, p0/z, z3.d, z25.d
-; CHECK-NEXT:    fcmge p8.d, p0/z, z5.d, z25.d
-; CHECK-NEXT:    fcmgt p7.d, p0/z, z27.d, z1.d
-; CHECK-NEXT:    fcmge p9.d, p0/z, z6.d, z25.d
-; CHECK-NEXT:    movprfx z30, z28
-; CHECK-NEXT:    fcvtzs z30.d, p0/m, z28.d
-; CHECK-NEXT:    fcmge p10.d, p0/z, z7.d, z25.d
-; CHECK-NEXT:    not p4.b, p0/z, p1.b
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z26.d, z26.d
-; CHECK-NEXT:    movprfx z26, z2
-; CHECK-NEXT:    fcvtzs z26.d, p0/m, z2.d
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    movprfx z31, z6
-; CHECK-NEXT:    fcvtzs z31.d, p0/m, z6.d
-; CHECK-NEXT:    movprfx z8, z7
-; CHECK-NEXT:    fcvtzs z8.d, p0/m, z7.d
-; CHECK-NEXT:    mov z4.d, p4/m, z0.d
-; CHECK-NEXT:    fcmge p4.d, p0/z, z28.d, z25.d
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    mov z29.d, p2/m, z0.d
-; CHECK-NEXT:    fcmuo p2.d, p0/z, z27.d, z27.d
-; CHECK-NEXT:    movprfx z27, z3
-; CHECK-NEXT:    fcvtzs z27.d, p0/m, z3.d
-; CHECK-NEXT:    sel z25.d, p5, z0.d, z26.d
-; CHECK-NEXT:    movprfx z26, z5
-; CHECK-NEXT:    fcvtzs z26.d, p0/m, z5.d
-; CHECK-NEXT:    not p6.b, p0/z, p6.b
-; CHECK-NEXT:    not p5.b, p0/z, p8.b
-; CHECK-NEXT:    fcmgt p8.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    mov z27.d, p6/m, z0.d
-; CHECK-NEXT:    not p6.b, p0/z, p9.b
+; CHECK-NEXT:    mov z28.d, #0x8000000000000000
+; CHECK-NEXT:    mov z29.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z24.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, z24.d
+; CHECK-NEXT:    fcmge p5.d, p0/z, z25.d, z24.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z2.d, z24.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z3.d, z24.d
+; CHECK-NEXT:    fcmge p7.d, p0/z, z5.d, z24.d
+; CHECK-NEXT:    mov z31.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p6.d, p0/z, z6.d, z24.d
+; CHECK-NEXT:    mov z8.d, #0x8000000000000000
+; CHECK-NEXT:    mov z9.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p8.d, p0/z, z25.d, z30.d
+; CHECK-NEXT:    fcmgt p10.d, p0/z, z6.d, z30.d
+; CHECK-NEXT:    fcvtzs z26.d, p1/m, z0.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z30.d
+; CHECK-NEXT:    fcvtzs z4.d, p2/m, z1.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z7.d, z24.d
+; CHECK-NEXT:    mov z24.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z27.d, p3/m, z2.d
+; CHECK-NEXT:    fcvtzs z28.d, p4/m, z3.d
+; CHECK-NEXT:    fcvtzs z29.d, p5/m, z25.d
+; CHECK-NEXT:    fcvtzs z31.d, p7/m, z5.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z1.d, z30.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z2.d, z30.d
+; CHECK-NEXT:    fcmgt p7.d, p0/z, z3.d, z30.d
+; CHECK-NEXT:    fcvtzs z8.d, p6/m, z6.d
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    sel z0.d, p1, z9.d, z26.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z5.d, z30.d
+; CHECK-NEXT:    fcvtzs z24.d, p2/m, z7.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z7.d, z30.d
+; CHECK-NEXT:    fcmuo p6.d, p0/z, z1.d, z1.d
 ; CHECK-NEXT:    fcmuo p9.d, p0/z, z2.d, z2.d
-; CHECK-NEXT:    mov z30.d, p4/m, z0.d
-; CHECK-NEXT:    not p4.b, p0/z, p10.b
-; CHECK-NEXT:    fcmgt p10.d, p0/z, z3.d, z1.d
-; CHECK-NEXT:    mov z26.d, p5/m, z0.d
-; CHECK-NEXT:    fcmgt p5.d, p0/z, z28.d, z1.d
-; CHECK-NEXT:    mov z31.d, p6/m, z0.d
-; CHECK-NEXT:    mov z8.d, p4/m, z0.d
-; CHECK-NEXT:    sel z0.d, p3, z24.d, z4.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z5.d, z1.d
-; CHECK-NEXT:    fcmgt p4.d, p0/z, z6.d, z1.d
-; CHECK-NEXT:    fcmgt p6.d, p0/z, z7.d, z1.d
-; CHECK-NEXT:    sel z1.d, p7, z24.d, z29.d
-; CHECK-NEXT:    fcmuo p7.d, p0/z, z3.d, z3.d
-; CHECK-NEXT:    sel z2.d, p8, z24.d, z25.d
-; CHECK-NEXT:    sel z3.d, p10, z24.d, z27.d
-; CHECK-NEXT:    sel z4.d, p5, z24.d, z30.d
-; CHECK-NEXT:    fcmuo p5.d, p0/z, z28.d, z28.d
-; CHECK-NEXT:    fcmuo p8.d, p0/z, z5.d, z5.d
-; CHECK-NEXT:    fcmuo p10.d, p0/z, z6.d, z6.d
-; CHECK-NEXT:    sel z5.d, p3, z24.d, z26.d
+; CHECK-NEXT:    sel z1.d, p4, z9.d, z4.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z3.d, z3.d
+; CHECK-NEXT:    sel z2.d, p5, z9.d, z27.d
+; CHECK-NEXT:    sel z3.d, p7, z9.d, z28.d
+; CHECK-NEXT:    sel z4.d, p8, z9.d, z29.d
+; CHECK-NEXT:    fcmuo p5.d, p0/z, z25.d, z25.d
+; CHECK-NEXT:    fcmuo p7.d, p0/z, z5.d, z5.d
+; CHECK-NEXT:    fcmuo p8.d, p0/z, z6.d, z6.d
+; CHECK-NEXT:    sel z5.d, p1, z9.d, z31.d
+; CHECK-NEXT:    sel z6.d, p10, z9.d, z8.d
+; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z7.d, z7.d
-; CHECK-NEXT:    sel z6.d, p4, z24.d, z31.d
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z7.d, p6, z24.d, z8.d
-; CHECK-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    sel z7.d, p2, z9.d, z24.d
+; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p6/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z2.d, p9/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z3.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    mov z3.d, p4/m, #0 // =0x0
 ; CHECK-NEXT:    mov z4.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z5.d, p8/m, #0 // =0x0
-; CHECK-NEXT:    mov z6.d, p10/m, #0 // =0x0
-; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z5.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    mov z6.d, p8/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z7.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #3
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %a = call <vscale x 16 x i64> @llvm.llrint.nxv16i64.nxv16f64(<vscale x 16 x double> %x)
@@ -1490,6 +1287,8 @@ define <vscale x 32 x i64> @llrint_v32f64(<vscale x 32 x double> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-17
+; CHECK-NEXT:    str p11, [sp] // 2-byte Folded Spill
+; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
@@ -1512,8 +1311,8 @@ define <vscale x 32 x i64> @llrint_v32f64(<vscale x 32 x double> %x) {
 ; CHECK-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-3
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
@@ -1526,219 +1325,176 @@ define <vscale x 32 x i64> @llrint_v32f64(<vscale x 32 x double> %x) {
 ; CHECK-NEXT:    ldr z0, [x0]
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    ldr z2, [x0, #2, mul vl]
-; CHECK-NEXT:    mov x9, #-4332462841530417152 // =0xc3e0000000000000
-; CHECK-NEXT:    ldr z24, [x0, #6, mul vl]
 ; CHECK-NEXT:    ldr z1, [x0, #1, mul vl]
-; CHECK-NEXT:    mov z7.d, x9
-; CHECK-NEXT:    mov z26.d, #0x8000000000000000
-; CHECK-NEXT:    ldr z3, [x0, #3, mul vl]
+; CHECK-NEXT:    ldr z6, [x0, #4, mul vl]
+; CHECK-NEXT:    mov x9, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT:    ldr z5, [x0, #3, mul vl]
+; CHECK-NEXT:    mov z25.d, x9
+; CHECK-NEXT:    mov z28.d, #0x8000000000000000
 ; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
-; CHECK-NEXT:    movprfx z30, z2
-; CHECK-NEXT:    frintx z30.d, p0/m, z2.d
-; CHECK-NEXT:    ldr z6, [x0, #5, mul vl]
-; CHECK-NEXT:    movprfx z25, z24
-; CHECK-NEXT:    frintx z25.d, p0/m, z24.d
-; CHECK-NEXT:    movprfx z12, z1
-; CHECK-NEXT:    frintx z12.d, p0/m, z1.d
-; CHECK-NEXT:    ldr z5, [x0, #4, mul vl]
-; CHECK-NEXT:    frintx z3.d, p0/m, z3.d
-; CHECK-NEXT:    mov x9, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    movprfx z4, z2
+; CHECK-NEXT:    frintx z4.d, p0/m, z2.d
+; CHECK-NEXT:    mov z27.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z1.d, p0/m, z1.d
 ; CHECK-NEXT:    frintx z6.d, p0/m, z6.d
-; CHECK-NEXT:    mov z4.d, x9
-; CHECK-NEXT:    fcmge p3.d, p0/z, z0.d, z7.d
-; CHECK-NEXT:    movprfx z24, z0
-; CHECK-NEXT:    fcvtzs z24.d, p0/m, z0.d
-; CHECK-NEXT:    fcmge p5.d, p0/z, z30.d, z7.d
-; CHECK-NEXT:    movprfx z28, z30
-; CHECK-NEXT:    fcvtzs z28.d, p0/m, z30.d
-; CHECK-NEXT:    str z0, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z30.d, #0x8000000000000000
 ; CHECK-NEXT:    frintx z5.d, p0/m, z5.d
-; CHECK-NEXT:    fcmge p4.d, p0/z, z12.d, z7.d
-; CHECK-NEXT:    ldr z8, [x0, #7, mul vl]
-; CHECK-NEXT:    ldr z9, [x0, #15, mul vl]
-; CHECK-NEXT:    movprfx z27, z12
-; CHECK-NEXT:    fcvtzs z27.d, p0/m, z12.d
-; CHECK-NEXT:    fcmge p6.d, p0/z, z3.d, z7.d
-; CHECK-NEXT:    fcmge p9.d, p0/z, z6.d, z7.d
-; CHECK-NEXT:    not p7.b, p0/z, p3.b
-; CHECK-NEXT:    movprfx z31, z3
-; CHECK-NEXT:    fcvtzs z31.d, p0/m, z3.d
-; CHECK-NEXT:    movprfx z15, z6
-; CHECK-NEXT:    fcvtzs z15.d, p0/m, z6.d
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    fcmge p8.d, p0/z, z5.d, z7.d
-; CHECK-NEXT:    movprfx z13, z5
-; CHECK-NEXT:    fcvtzs z13.d, p0/m, z5.d
-; CHECK-NEXT:    sel z0.d, p7, z26.d, z24.d
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    movprfx z17, z25
-; CHECK-NEXT:    fcvtzs z17.d, p0/m, z25.d
-; CHECK-NEXT:    not p3.b, p0/z, p6.b
-; CHECK-NEXT:    fcmge p6.d, p0/z, z25.d, z7.d
-; CHECK-NEXT:    movprfx z22, z9
-; CHECK-NEXT:    frintx z22.d, p0/m, z9.d
-; CHECK-NEXT:    sel z29.d, p4, z26.d, z27.d
-; CHECK-NEXT:    movprfx z27, z8
-; CHECK-NEXT:    frintx z27.d, p0/m, z8.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z12.d, z4.d
-; CHECK-NEXT:    str z0, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    sel z0.d, p5, z26.d, z28.d
-; CHECK-NEXT:    not p4.b, p0/z, p8.b
-; CHECK-NEXT:    ldr z10, [x0, #8, mul vl]
-; CHECK-NEXT:    not p5.b, p0/z, p9.b
-; CHECK-NEXT:    sel z24.d, p3, z26.d, z31.d
-; CHECK-NEXT:    not p3.b, p0/z, p6.b
-; CHECK-NEXT:    movprfx z2, z22
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z22.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z30.d, z4.d
+; CHECK-NEXT:    mov z26.d, #0x8000000000000000
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    mov z13.d, #0x8000000000000000
+; CHECK-NEXT:    mov z12.d, #0x8000000000000000
+; CHECK-NEXT:    mov x10, #4890909195324358655 // =0x43dfffffffffffff
 ; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    fcmge p7.d, p0/z, z27.d, z7.d
-; CHECK-NEXT:    sel z31.d, p5, z26.d, z15.d
-; CHECK-NEXT:    ldr z11, [x0, #9, mul vl]
-; CHECK-NEXT:    movprfx z28, z10
-; CHECK-NEXT:    frintx z28.d, p0/m, z10.d
-; CHECK-NEXT:    ldr z10, [x0, #10, mul vl]
-; CHECK-NEXT:    ldr z18, [x0, #11, mul vl]
-; CHECK-NEXT:    ldr z16, [x0, #13, mul vl]
-; CHECK-NEXT:    ldr z14, [x0, #14, mul vl]
-; CHECK-NEXT:    ldr z19, [x0, #12, mul vl]
-; CHECK-NEXT:    mov z17.d, p3/m, z26.d
-; CHECK-NEXT:    fcmgt p9.d, p0/z, z3.d, z4.d
-; CHECK-NEXT:    movprfx z8, z11
-; CHECK-NEXT:    frintx z8.d, p0/m, z11.d
-; CHECK-NEXT:    sel z11.d, p4, z26.d, z13.d
-; CHECK-NEXT:    frintx z10.d, p0/m, z10.d
-; CHECK-NEXT:    movprfx z13, z18
-; CHECK-NEXT:    frintx z13.d, p0/m, z18.d
-; CHECK-NEXT:    fcmge p5.d, p0/z, z28.d, z7.d
-; CHECK-NEXT:    movprfx z18, z27
-; CHECK-NEXT:    fcvtzs z18.d, p0/m, z27.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z4.d, z25.d
+; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z25.d
+; CHECK-NEXT:    ldr z29, [x0, #7, mul vl]
+; CHECK-NEXT:    ldr z24, [x0, #6, mul vl]
+; CHECK-NEXT:    ldr z10, [x0, #9, mul vl]
+; CHECK-NEXT:    ldr z8, [x0, #8, mul vl]
+; CHECK-NEXT:    ldr z7, [x0, #5, mul vl]
+; CHECK-NEXT:    ldr z14, [x0, #15, mul vl]
+; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, z25.d
+; CHECK-NEXT:    fcmge p5.d, p0/z, z6.d, z25.d
+; CHECK-NEXT:    ldr z15, [x0, #14, mul vl]
+; CHECK-NEXT:    frintx z29.d, p0/m, z29.d
+; CHECK-NEXT:    frintx z24.d, p0/m, z24.d
+; CHECK-NEXT:    movprfx z11, z10
+; CHECK-NEXT:    frintx z11.d, p0/m, z10.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z5.d, z25.d
+; CHECK-NEXT:    movprfx z9, z8
+; CHECK-NEXT:    frintx z9.d, p0/m, z8.d
+; CHECK-NEXT:    ldr z16, [x0, #11, mul vl]
+; CHECK-NEXT:    ldr z20, [x0, #13, mul vl]
+; CHECK-NEXT:    frintx z7.d, p0/m, z7.d
+; CHECK-NEXT:    fcvtzs z28.d, p3/m, z4.d
+; CHECK-NEXT:    mov z10.d, #0x8000000000000000
+; CHECK-NEXT:    ldr z18, [x0, #12, mul vl]
+; CHECK-NEXT:    movprfx z19, z14
+; CHECK-NEXT:    frintx z19.d, p0/m, z14.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z29.d, z25.d
+; CHECK-NEXT:    ldr z17, [x0, #10, mul vl]
+; CHECK-NEXT:    frintx z15.d, p0/m, z15.d
+; CHECK-NEXT:    fcvtzs z27.d, p2/m, z1.d
+; CHECK-NEXT:    fcvtzs z30.d, p5/m, z6.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z24.d, z25.d
+; CHECK-NEXT:    fcmge p5.d, p0/z, z11.d, z25.d
+; CHECK-NEXT:    mov z14.d, #0x8000000000000000
 ; CHECK-NEXT:    frintx z16.d, p0/m, z16.d
-; CHECK-NEXT:    movprfx z15, z19
-; CHECK-NEXT:    frintx z15.d, p0/m, z19.d
-; CHECK-NEXT:    movprfx z19, z28
-; CHECK-NEXT:    fcvtzs z19.d, p0/m, z28.d
-; CHECK-NEXT:    movprfx z21, z14
-; CHECK-NEXT:    frintx z21.d, p0/m, z14.d
-; CHECK-NEXT:    not p4.b, p0/z, p7.b
-; CHECK-NEXT:    fcmge p6.d, p0/z, z8.d, z7.d
-; CHECK-NEXT:    movprfx z20, z8
-; CHECK-NEXT:    fcvtzs z20.d, p0/m, z8.d
-; CHECK-NEXT:    fcmge p7.d, p0/z, z10.d, z7.d
-; CHECK-NEXT:    fcmge p8.d, p0/z, z13.d, z7.d
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    sel z9.d, p4, z26.d, z18.d
-; CHECK-NEXT:    fcmge p4.d, p0/z, z16.d, z7.d
-; CHECK-NEXT:    fcmge p3.d, p0/z, z15.d, z7.d
-; CHECK-NEXT:    movprfx z0, z16
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z16.d
-; CHECK-NEXT:    sel z14.d, p5, z26.d, z19.d
-; CHECK-NEXT:    movprfx z19, z10
-; CHECK-NEXT:    fcvtzs z19.d, p0/m, z10.d
-; CHECK-NEXT:    movprfx z1, z21
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z21.d
-; CHECK-NEXT:    not p6.b, p0/z, p6.b
-; CHECK-NEXT:    movprfx z23, z15
-; CHECK-NEXT:    fcvtzs z23.d, p0/m, z15.d
-; CHECK-NEXT:    not p5.b, p0/z, p7.b
-; CHECK-NEXT:    sel z18.d, p6, z26.d, z20.d
-; CHECK-NEXT:    fcmge p6.d, p0/z, z21.d, z7.d
-; CHECK-NEXT:    not p7.b, p0/z, p8.b
-; CHECK-NEXT:    fcmge p8.d, p0/z, z22.d, z7.d
-; CHECK-NEXT:    movprfx z20, z13
-; CHECK-NEXT:    fcvtzs z20.d, p0/m, z13.d
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    mov z7.d, #0x7fffffffffffffff
-; CHECK-NEXT:    mov z19.d, p5/m, z26.d
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    mov z0.d, p4/m, z26.d
-; CHECK-NEXT:    fcmgt p4.d, p0/z, z21.d, z4.d
-; CHECK-NEXT:    not p5.b, p0/z, p6.b
-; CHECK-NEXT:    mov z23.d, p3/m, z26.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z22.d, z4.d
-; CHECK-NEXT:    not p6.b, p0/z, p8.b
-; CHECK-NEXT:    mov z20.d, p7/m, z26.d
-; CHECK-NEXT:    fcmuo p8.d, p0/z, z22.d, z22.d
-; CHECK-NEXT:    mov z1.d, p5/m, z26.d
-; CHECK-NEXT:    fcmuo p5.d, p0/z, z21.d, z21.d
-; CHECK-NEXT:    fcmgt p7.d, p0/z, z25.d, z4.d
-; CHECK-NEXT:    mov z2.d, p6/m, z26.d
-; CHECK-NEXT:    sel z26.d, p1, z7.d, z29.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z16.d, z4.d
-; CHECK-NEXT:    ldr z29, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    fcmgt p6.d, p0/z, z5.d, z4.d
-; CHECK-NEXT:    mov z24.d, p9/m, z7.d
-; CHECK-NEXT:    mov z1.d, p4/m, z7.d
-; CHECK-NEXT:    fcmuo p4.d, p0/z, z16.d, z16.d
-; CHECK-NEXT:    mov z2.d, p3/m, z7.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z15.d, z4.d
-; CHECK-NEXT:    mov z17.d, p7/m, z7.d
-; CHECK-NEXT:    mov z29.d, p2/m, z7.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z13.d, z4.d
-; CHECK-NEXT:    mov z0.d, p1/m, z7.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z10.d, z4.d
-; CHECK-NEXT:    mov z1.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    mov z11.d, p6/m, z7.d
+; CHECK-NEXT:    frintx z20.d, p0/m, z20.d
+; CHECK-NEXT:    fcvtzs z26.d, p4/m, z5.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z9.d, z25.d
+; CHECK-NEXT:    frintx z18.d, p0/m, z18.d
+; CHECK-NEXT:    mov z31.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z2.d, p1/m, z0.d
+; CHECK-NEXT:    fcmge p1.d, p0/z, z7.d, z25.d
+; CHECK-NEXT:    mov z8.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z17.d, p0/m, z17.d
+; CHECK-NEXT:    fcvtzs z10.d, p3/m, z29.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z19.d, z25.d
+; CHECK-NEXT:    mov z3.d, x10
+; CHECK-NEXT:    fcmge p6.d, p0/z, z15.d, z25.d
+; CHECK-NEXT:    mov z21.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z13.d, p2/m, z24.d
+; CHECK-NEXT:    fcvtzs z14.d, p5/m, z11.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z16.d, z25.d
+; CHECK-NEXT:    mov z22.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p5.d, p0/z, z20.d, z25.d
+; CHECK-NEXT:    mov z0.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z12.d, p4/m, z9.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z18.d, z25.d
+; CHECK-NEXT:    mov z23.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z8.d, p1/m, z7.d
+; CHECK-NEXT:    fcmge p1.d, p0/z, z17.d, z25.d
+; CHECK-NEXT:    fcvtzs z31.d, p3/m, z19.d
+; CHECK-NEXT:    mov z25.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p11.d, p0/z, z19.d, z3.d
+; CHECK-NEXT:    fcvtzs z21.d, p6/m, z15.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z15.d, z3.d
 ; CHECK-NEXT:    fcmuo p6.d, p0/z, z15.d, z15.d
-; CHECK-NEXT:    fcmgt p5.d, p0/z, z8.d, z4.d
-; CHECK-NEXT:    mov z2.d, p8/m, #0 // =0x0
-; CHECK-NEXT:    sel z16.d, p3, z7.d, z23.d
-; CHECK-NEXT:    fcmuo p3.d, p0/z, z10.d, z10.d
-; CHECK-NEXT:    mov z0.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    sel z15.d, p2, z7.d, z20.d
-; CHECK-NEXT:    fcmuo p2.d, p0/z, z13.d, z13.d
-; CHECK-NEXT:    str z1, [x8, #14, mul vl]
-; CHECK-NEXT:    sel z1.d, p1, z7.d, z19.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z28.d, z4.d
-; CHECK-NEXT:    fcmgt p4.d, p0/z, z27.d, z4.d
-; CHECK-NEXT:    str z2, [x8, #15, mul vl]
-; CHECK-NEXT:    sel z2.d, p5, z7.d, z18.d
-; CHECK-NEXT:    mov z16.d, p6/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p5.d, p0/z, z8.d, z8.d
+; CHECK-NEXT:    mov z15.d, #0x8000000000000000
+; CHECK-NEXT:    fcmgt p7.d, p0/z, z1.d, z3.d
+; CHECK-NEXT:    fcvtzs z22.d, p2/m, z16.d
+; CHECK-NEXT:    fcvtzs z0.d, p5/m, z20.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z24.d, z3.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z20.d, z3.d
+; CHECK-NEXT:    fcvtzs z23.d, p4/m, z18.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z19.d, z19.d
+; CHECK-NEXT:    mov z31.d, p11/m, z25.d
+; CHECK-NEXT:    sel z19.d, p3, z25.d, z21.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z18.d, z3.d
+; CHECK-NEXT:    fcvtzs z15.d, p1/m, z17.d
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z20.d, z20.d
+; CHECK-NEXT:    mov z27.d, p7/m, z25.d
+; CHECK-NEXT:    fcmgt p7.d, p0/z, z16.d, z3.d
+; CHECK-NEXT:    mov z13.d, p2/m, z25.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z17.d, z3.d
+; CHECK-NEXT:    mov z0.d, p5/m, z25.d
+; CHECK-NEXT:    mov z31.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z18.d, z18.d
+; CHECK-NEXT:    sel z20.d, p3, z25.d, z23.d
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z16.d, z16.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z11.d, z3.d
+; CHECK-NEXT:    mov z19.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p10.d, p0/z, z6.d, z3.d
+; CHECK-NEXT:    fcmgt p8.d, p0/z, z4.d, z3.d
+; CHECK-NEXT:    str z31, [x8, #15, mul vl]
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z17.d, z17.d
+; CHECK-NEXT:    sel z18.d, p7, z25.d, z22.d
+; CHECK-NEXT:    sel z31.d, p2, z25.d, z15.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z9.d, z3.d
+; CHECK-NEXT:    str z19, [x8, #14, mul vl]
+; CHECK-NEXT:    mov z20.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z11.d, z11.d
 ; CHECK-NEXT:    str z0, [x8, #13, mul vl]
-; CHECK-NEXT:    mov z15.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p2.d, p0/z, z28.d, z28.d
-; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z6.d, z4.d
-; CHECK-NEXT:    sel z0.d, p1, z7.d, z14.d
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z27.d, z27.d
-; CHECK-NEXT:    sel z27.d, p4, z7.d, z9.d
-; CHECK-NEXT:    str z16, [x8, #12, mul vl]
-; CHECK-NEXT:    fcmuo p4.d, p0/z, z25.d, z25.d
-; CHECK-NEXT:    str z15, [x8, #11, mul vl]
-; CHECK-NEXT:    mov z2.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p5.d, p0/z, z6.d, z6.d
-; CHECK-NEXT:    str z1, [x8, #10, mul vl]
-; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    sel z1.d, p3, z7.d, z31.d
-; CHECK-NEXT:    fcmuo p3.d, p0/z, z5.d, z5.d
-; CHECK-NEXT:    ldr z5, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    mov z27.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    str z2, [x8, #9, mul vl]
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z3.d, z3.d
+; CHECK-NEXT:    mov z14.d, p5/m, z25.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z29.d, z3.d
+; CHECK-NEXT:    mov z18.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z31.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z7.d, z3.d
+; CHECK-NEXT:    str z20, [x8, #12, mul vl]
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z9.d, z9.d
+; CHECK-NEXT:    sel z0.d, p2, z25.d, z12.d
+; CHECK-NEXT:    mov z14.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z7.d, z7.d
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z29.d, z29.d
+; CHECK-NEXT:    str z18, [x8, #11, mul vl]
+; CHECK-NEXT:    sel z29.d, p5, z25.d, z10.d
+; CHECK-NEXT:    fcmuo p5.d, p0/z, z24.d, z24.d
+; CHECK-NEXT:    str z31, [x8, #10, mul vl]
+; CHECK-NEXT:    sel z7.d, p1, z25.d, z8.d
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z6.d, z6.d
+; CHECK-NEXT:    ldr z6, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    str z14, [x8, #9, mul vl]
+; CHECK-NEXT:    fcmgt p9.d, p0/z, z5.d, z3.d
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z29.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z5.d, z5.d
+; CHECK-NEXT:    mov z13.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p5.d, p0/z, z4.d, z4.d
+; CHECK-NEXT:    mov z7.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z6.d, z3.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    fcmuo p0.d, p0/z, z6.d, z6.d
 ; CHECK-NEXT:    str z0, [x8, #8, mul vl]
-; CHECK-NEXT:    mov z17.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p4.d, p0/z, z30.d, z30.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z5.d, z4.d
-; CHECK-NEXT:    mov z1.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p5.d, p0/z, z12.d, z12.d
-; CHECK-NEXT:    str z27, [x8, #7, mul vl]
-; CHECK-NEXT:    fcmuo p0.d, p0/z, z5.d, z5.d
-; CHECK-NEXT:    mov z11.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    mov z24.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    str z17, [x8, #6, mul vl]
-; CHECK-NEXT:    mov z29.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    str z1, [x8, #5, mul vl]
-; CHECK-NEXT:    mov z26.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    str z11, [x8, #4, mul vl]
-; CHECK-NEXT:    mov z0.d, p2/m, z7.d
-; CHECK-NEXT:    str z24, [x8, #3, mul vl]
-; CHECK-NEXT:    str z29, [x8, #2, mul vl]
-; CHECK-NEXT:    str z26, [x8, #1, mul vl]
+; CHECK-NEXT:    mov z28.d, p8/m, z25.d
+; CHECK-NEXT:    mov z26.d, p9/m, z25.d
+; CHECK-NEXT:    str z29, [x8, #7, mul vl]
+; CHECK-NEXT:    mov z30.d, p10/m, z25.d
+; CHECK-NEXT:    str z13, [x8, #6, mul vl]
+; CHECK-NEXT:    str z7, [x8, #5, mul vl]
+; CHECK-NEXT:    sel z0.d, p3, z25.d, z2.d
+; CHECK-NEXT:    mov z26.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z30.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z28.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    mov z27.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    str z26, [x8, #3, mul vl]
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    str z30, [x8, #4, mul vl]
+; CHECK-NEXT:    str z28, [x8, #2, mul vl]
+; CHECK-NEXT:    str z27, [x8, #1, mul vl]
 ; CHECK-NEXT:    str z0, [x8]
-; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1755,6 +1511,8 @@ define <vscale x 32 x i64> @llrint_v32f64(<vscale x 32 x double> %x) {
 ; CHECK-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-lrint.ll b/llvm/test/CodeGen/AArch64/sve-lrint.ll
index 908ba2392a437..aa5863901b9d3 100644
--- a/llvm/test/CodeGen/AArch64/sve-lrint.ll
+++ b/llvm/test/CodeGen/AArch64/sve-lrint.ll
@@ -7,20 +7,17 @@ define <vscale x 1 x iXLen> @lrint_v1f16(<vscale x 1 x half> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #64511 // =0xfbff
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
-; CHECK-NEXT:    mov z3.h, w8
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.h
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z3.h
-; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 1 x iXLen> @llvm.lrint.nxv1iXLen.nxv1f16(<vscale x 1 x half> %x)
@@ -33,20 +30,17 @@ define <vscale x 2 x iXLen> @lrint_v2f16(<vscale x 2 x half> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #64511 // =0xfbff
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
-; CHECK-NEXT:    mov z3.h, w8
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.h
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z3.h
-; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x iXLen> @llvm.lrint.nxv2iXLen.nxv2f16(<vscale x 2 x half> %x)
@@ -57,43 +51,30 @@ declare <vscale x 2 x iXLen> @llvm.lrint.nxv2iXLen.nxv2f16(<vscale x 2 x half>)
 define <vscale x 4 x iXLen> @lrint_v4f16(<vscale x 4 x half> %x) {
 ; CHECK-LABEL: lrint_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
 ; CHECK-NEXT:    mov w8, #64511 // =0xfbff
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    mov z3.h, w8
-; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
+; CHECK-NEXT:    mov z4.d, #0x8000000000000000
+; CHECK-NEXT:    mov z5.h, w8
 ; CHECK-NEXT:    frintx z1.h, p0/m, z1.h
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z1.h, z2.h
 ; CHECK-NEXT:    fcmge p2.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
-; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z1.h
-; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.h
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z1.h, z3.h
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z0.h, z3.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    sel z3.d, p1, z2.d, z4.d
-; CHECK-NEXT:    fcmuo p1.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmuo p3.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    fcvtzs z3.d, p1/m, z1.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z5.h
+; CHECK-NEXT:    fcvtzs z4.d, p2/m, z0.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z5.h
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    sel z2.d, p2, z2.d, z5.d
-; CHECK-NEXT:    sel z0.d, p3, z6.d, z3.d
-; CHECK-NEXT:    sel z1.d, p4, z6.d, z2.d
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z3.d
+; CHECK-NEXT:    sel z1.d, p2, z2.d, z4.d
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x iXLen> @llvm.lrint.nxv4iXLen.nxv4f16(<vscale x 4 x half> %x)
   ret <vscale x 4 x iXLen> %a
@@ -105,7 +86,6 @@ define <vscale x 8 x iXLen> @lrint_v8f16(<vscale x 8 x half> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
@@ -117,8 +97,10 @@ define <vscale x 8 x iXLen> @lrint_v8f16(<vscale x 8 x half> %x) {
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z4.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    mov z6.h, w8
-; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z6.d, #0x8000000000000000
+; CHECK-NEXT:    mov z25.h, w8
+; CHECK-NEXT:    mov z7.d, #0x8000000000000000
+; CHECK-NEXT:    mov z24.d, #0x8000000000000000
 ; CHECK-NEXT:    uunpklo z2.d, z1.s
 ; CHECK-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEXT:    uunpklo z3.d, z0.s
@@ -133,41 +115,29 @@ define <vscale x 8 x iXLen> @lrint_v8f16(<vscale x 8 x half> %x) {
 ; CHECK-NEXT:    fcmge p2.h, p0/z, z1.h, z4.h
 ; CHECK-NEXT:    fcmge p3.h, p0/z, z3.h, z4.h
 ; CHECK-NEXT:    fcmge p4.h, p0/z, z5.h, z4.h
-; CHECK-NEXT:    movprfx z4, z2
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z2.h
-; CHECK-NEXT:    movprfx z7, z1
-; CHECK-NEXT:    fcvtzs z7.d, p0/m, z1.h
-; CHECK-NEXT:    movprfx z24, z3
-; CHECK-NEXT:    fcvtzs z24.d, p0/m, z3.h
-; CHECK-NEXT:    movprfx z25, z5
-; CHECK-NEXT:    fcvtzs z25.d, p0/m, z5.h
-; CHECK-NEXT:    fcmgt p7.h, p0/z, z3.h, z6.h
-; CHECK-NEXT:    fcmgt p5.h, p0/z, z2.h, z6.h
-; CHECK-NEXT:    fcmgt p6.h, p0/z, z1.h, z6.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    mov z4.d, p1/m, z0.d
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z5.h, z6.h
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    sel z6.d, p2, z0.d, z7.d
-; CHECK-NEXT:    fcmuo p2.h, p0/z, z2.h, z2.h
-; CHECK-NEXT:    sel z7.d, p3, z0.d, z24.d
-; CHECK-NEXT:    fcmuo p3.h, p0/z, z1.h, z1.h
-; CHECK-NEXT:    sel z24.d, p4, z0.d, z25.d
-; CHECK-NEXT:    fcmuo p4.h, p0/z, z3.h, z3.h
+; CHECK-NEXT:    mov z4.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z5.h, z25.h
+; CHECK-NEXT:    fcmuo p6.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    fcvtzs z0.d, p1/m, z2.h
+; CHECK-NEXT:    fcvtzs z6.d, p2/m, z1.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z2.h, z25.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z1.h, z25.h
+; CHECK-NEXT:    fcvtzs z7.d, p3/m, z3.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z3.h, z25.h
+; CHECK-NEXT:    fcvtzs z24.d, p4/m, z5.h
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z2.h, z2.h
+; CHECK-NEXT:    mov z0.d, p1/m, z4.d
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z3.h, z3.h
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z5.h, z5.h
-; CHECK-NEXT:    sel z0.d, p5, z26.d, z4.d
-; CHECK-NEXT:    sel z1.d, p6, z26.d, z6.d
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z2.d, p7, z26.d, z7.d
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z3.d, p1, z26.d, z24.d
+; CHECK-NEXT:    sel z1.d, p2, z4.d, z6.d
+; CHECK-NEXT:    sel z2.d, p3, z4.d, z7.d
+; CHECK-NEXT:    sel z3.d, p5, z4.d, z24.d
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p4/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -181,7 +151,7 @@ define <vscale x 16 x iXLen> @lrint_v16f16(<vscale x 16 x half> %x) {
 ; CHECK-LABEL: lrint_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
@@ -189,124 +159,110 @@ define <vscale x 16 x iXLen> @lrint_v16f16(<vscale x 16 x half> %x) {
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    uunpkhi z0.s, z0.h
 ; CHECK-NEXT:    mov w8, #64511 // =0xfbff
-; CHECK-NEXT:    uunpklo z4.s, z1.h
-; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z3.s, z1.h
 ; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    mov z5.h, w8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z24.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    mov z25.d, #0x8000000000000000
-; CHECK-NEXT:    mov z27.h, w8
-; CHECK-NEXT:    mov z7.d, #0x7fffffffffffffff
-; CHECK-NEXT:    uunpklo z3.d, z2.s
+; CHECK-NEXT:    mov z7.d, #0x8000000000000000
+; CHECK-NEXT:    mov z27.d, #0x8000000000000000
+; CHECK-NEXT:    mov z28.d, #0x8000000000000000
+; CHECK-NEXT:    mov z30.d, #0x8000000000000000
+; CHECK-NEXT:    uunpklo z4.d, z2.s
 ; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    uunpklo z6.d, z0.s
+; CHECK-NEXT:    uunpklo z5.d, z0.s
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
-; CHECK-NEXT:    uunpklo z24.d, z4.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    uunpklo z26.d, z1.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpklo z6.d, z3.s
+; CHECK-NEXT:    uunpklo z25.d, z1.s
+; CHECK-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-NEXT:    mov z8.d, #0x8000000000000000
+; CHECK-NEXT:    mov z31.d, #0x8000000000000000
+; CHECK-NEXT:    mov z10.d, #0x7fffffffffffffff
+; CHECK-NEXT:    frintx z4.h, p0/m, z4.h
 ; CHECK-NEXT:    frintx z2.h, p0/m, z2.h
-; CHECK-NEXT:    frintx z3.h, p0/m, z3.h
+; CHECK-NEXT:    frintx z5.h, p0/m, z5.h
+; CHECK-NEXT:    movprfx z26, z0
+; CHECK-NEXT:    frintx z26.h, p0/m, z0.h
+; CHECK-NEXT:    uunpkhi z0.d, z1.s
 ; CHECK-NEXT:    frintx z6.h, p0/m, z6.h
-; CHECK-NEXT:    movprfx z28, z0
-; CHECK-NEXT:    frintx z28.h, p0/m, z0.h
-; CHECK-NEXT:    movprfx z29, z4
-; CHECK-NEXT:    frintx z29.h, p0/m, z4.h
-; CHECK-NEXT:    frintx z24.h, p0/m, z24.h
-; CHECK-NEXT:    movprfx z30, z1
-; CHECK-NEXT:    frintx z30.h, p0/m, z1.h
-; CHECK-NEXT:    frintx z26.h, p0/m, z26.h
-; CHECK-NEXT:    fcmge p5.h, p0/z, z2.h, z5.h
-; CHECK-NEXT:    fcmge p2.h, p0/z, z3.h, z5.h
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z2.h
-; CHECK-NEXT:    movprfx z0, z3
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z3.h
-; CHECK-NEXT:    fcmge p6.h, p0/z, z6.h, z5.h
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z3.h, z27.h
-; CHECK-NEXT:    fcmuo p1.h, p0/z, z3.h, z3.h
-; CHECK-NEXT:    fcmge p7.h, p0/z, z28.h, z5.h
-; CHECK-NEXT:    movprfx z3, z6
-; CHECK-NEXT:    fcvtzs z3.d, p0/m, z6.h
-; CHECK-NEXT:    fcmge p8.h, p0/z, z24.h, z5.h
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z2.h, z27.h
-; CHECK-NEXT:    fcmge p9.h, p0/z, z26.h, z5.h
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    movprfx z4, z24
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z24.h
-; CHECK-NEXT:    fcmge p10.h, p0/z, z30.h, z5.h
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    movprfx z31, z26
-; CHECK-NEXT:    fcvtzs z31.d, p0/m, z26.h
-; CHECK-NEXT:    movprfx z8, z30
-; CHECK-NEXT:    fcvtzs z8.d, p0/m, z30.h
-; CHECK-NEXT:    mov z1.d, p5/m, z25.d
-; CHECK-NEXT:    fcmge p5.h, p0/z, z29.h, z5.h
-; CHECK-NEXT:    not p6.b, p0/z, p6.b
-; CHECK-NEXT:    mov z0.d, p2/m, z25.d
-; CHECK-NEXT:    fcmuo p2.h, p0/z, z2.h, z2.h
-; CHECK-NEXT:    movprfx z2, z28
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z28.h
-; CHECK-NEXT:    movprfx z5, z29
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z29.h
-; CHECK-NEXT:    not p7.b, p0/z, p7.b
-; CHECK-NEXT:    mov z3.d, p6/m, z25.d
-; CHECK-NEXT:    not p6.b, p0/z, p8.b
-; CHECK-NEXT:    fcmgt p8.h, p0/z, z6.h, z27.h
-; CHECK-NEXT:    mov z1.d, p4/m, z7.d
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    mov z0.d, p3/m, z7.d
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z29.h, z27.h
-; CHECK-NEXT:    sel z9.d, p7, z25.d, z2.d
-; CHECK-NEXT:    not p7.b, p0/z, p9.b
-; CHECK-NEXT:    mov z4.d, p6/m, z25.d
-; CHECK-NEXT:    not p6.b, p0/z, p10.b
-; CHECK-NEXT:    fcmgt p10.h, p0/z, z28.h, z27.h
-; CHECK-NEXT:    mov z5.d, p5/m, z25.d
-; CHECK-NEXT:    fcmgt p5.h, p0/z, z24.h, z27.h
-; CHECK-NEXT:    fcmuo p9.h, p0/z, z6.h, z6.h
-; CHECK-NEXT:    sel z6.d, p7, z25.d, z31.d
-; CHECK-NEXT:    sel z25.d, p6, z25.d, z8.d
-; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    fcmgt p6.h, p0/z, z26.h, z27.h
-; CHECK-NEXT:    fcmgt p7.h, p0/z, z30.h, z27.h
-; CHECK-NEXT:    fcmuo p4.h, p0/z, z28.h, z28.h
-; CHECK-NEXT:    sel z2.d, p8, z7.d, z3.d
-; CHECK-NEXT:    sel z3.d, p10, z7.d, z9.d
-; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    fcmuo p8.h, p0/z, z29.h, z29.h
-; CHECK-NEXT:    mov z4.d, p5/m, z7.d
-; CHECK-NEXT:    fcmuo p5.h, p0/z, z24.h, z24.h
-; CHECK-NEXT:    fcmuo p10.h, p0/z, z26.h, z26.h
-; CHECK-NEXT:    mov z5.d, p3/m, z7.d
-; CHECK-NEXT:    mov z6.d, p6/m, z7.d
+; CHECK-NEXT:    movprfx z29, z3
+; CHECK-NEXT:    frintx z29.h, p0/m, z3.h
+; CHECK-NEXT:    frintx z25.h, p0/m, z25.h
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p1.h, p0/z, z4.h, z24.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z2.h, z24.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z5.h, z24.h
+; CHECK-NEXT:    movprfx z9, z0
+; CHECK-NEXT:    frintx z9.h, p0/m, z0.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z26.h, z24.h
+; CHECK-NEXT:    fcmge p5.h, p0/z, z6.h, z24.h
+; CHECK-NEXT:    fcmge p7.h, p0/z, z25.h, z24.h
+; CHECK-NEXT:    fcmge p6.h, p0/z, z29.h, z24.h
+; CHECK-NEXT:    fcmgt p8.h, p0/z, z6.h, z1.h
+; CHECK-NEXT:    fcmgt p10.h, p0/z, z25.h, z1.h
+; CHECK-NEXT:    fcmuo p9.h, p0/z, z5.h, z5.h
+; CHECK-NEXT:    fcvtzs z7.d, p1/m, z4.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z4.h, z1.h
+; CHECK-NEXT:    fcvtzs z27.d, p2/m, z2.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z9.h, z24.h
+; CHECK-NEXT:    mov z24.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z28.d, p3/m, z5.h
+; CHECK-NEXT:    fcvtzs z3.d, p4/m, z26.h
+; CHECK-NEXT:    fcvtzs z30.d, p5/m, z6.h
+; CHECK-NEXT:    fcvtzs z8.d, p7/m, z25.h
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z2.h, z1.h
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z5.h, z1.h
+; CHECK-NEXT:    fcmgt p7.h, p0/z, z26.h, z1.h
+; CHECK-NEXT:    fcvtzs z31.d, p6/m, z29.h
+; CHECK-NEXT:    sel z0.d, p1, z10.d, z7.d
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z29.h, z1.h
+; CHECK-NEXT:    fcvtzs z24.d, p2/m, z9.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z9.h, z1.h
+; CHECK-NEXT:    fcmuo p3.h, p0/z, z4.h, z4.h
+; CHECK-NEXT:    fcmuo p6.h, p0/z, z2.h, z2.h
+; CHECK-NEXT:    sel z4.d, p8, z10.d, z30.d
+; CHECK-NEXT:    fcmuo p8.h, p0/z, z25.h, z25.h
+; CHECK-NEXT:    sel z1.d, p4, z10.d, z27.d
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z26.h, z26.h
+; CHECK-NEXT:    sel z2.d, p5, z10.d, z28.d
+; CHECK-NEXT:    mov z3.d, p7/m, z10.d
+; CHECK-NEXT:    fcmuo p5.h, p0/z, z6.h, z6.h
+; CHECK-NEXT:    fcmuo p7.h, p0/z, z29.h, z29.h
+; CHECK-NEXT:    sel z5.d, p1, z10.d, z31.d
+; CHECK-NEXT:    sel z6.d, p10, z10.d, z8.d
+; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z9.h, z9.h
+; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    sel z7.d, p2, z10.d, z24.d
+; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p6/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    fcmuo p0.h, p0/z, z30.h, z30.h
-; CHECK-NEXT:    sel z7.d, p7, z7.d, z25.d
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z2.d, p9/m, #0 // =0x0
-; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z3.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z4.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    mov z5.d, p8/m, #0 // =0x0
+; CHECK-NEXT:    mov z5.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z6.d, p8/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z6.d, p10/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z7.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %a = call <vscale x 16 x iXLen> @llvm.lrint.nxv16iXLen.nxv16f16(<vscale x 16 x half> %x)
@@ -319,6 +275,8 @@ define <vscale x 32 x iXLen> @lrint_v32f16(<vscale x 32 x half> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-17
+; CHECK-NEXT:    str p11, [sp] // 2-byte Folded Spill
+; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
@@ -341,8 +299,8 @@ define <vscale x 32 x iXLen> @lrint_v32f16(<vscale x 32 x half> %x) {
 ; CHECK-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-3
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
@@ -353,230 +311,191 @@ define <vscale x 32 x iXLen> @lrint_v32f16(<vscale x 32 x half> %x) {
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
 ; CHECK-NEXT:    uunpklo z4.s, z0.h
-; CHECK-NEXT:    uunpkhi z5.s, z0.h
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
 ; CHECK-NEXT:    mov w9, #64511 // =0xfbff
+; CHECK-NEXT:    uunpklo z25.s, z1.h
+; CHECK-NEXT:    uunpkhi z10.s, z1.h
+; CHECK-NEXT:    uunpklo z9.s, z2.h
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpklo z6.s, z1.h
-; CHECK-NEXT:    mov z26.h, w9
-; CHECK-NEXT:    uunpkhi z25.s, z1.h
+; CHECK-NEXT:    uunpkhi z12.s, z3.h
+; CHECK-NEXT:    mov z27.h, w9
 ; CHECK-NEXT:    mov w9, #31743 // =0x7bff
-; CHECK-NEXT:    mov z27.d, #0x8000000000000000
-; CHECK-NEXT:    uunpklo z31.s, z2.h
-; CHECK-NEXT:    uunpkhi z12.s, z2.h
-; CHECK-NEXT:    mov z17.d, z3.d
-; CHECK-NEXT:    uunpklo z0.d, z4.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    uunpklo z7.d, z5.s
-; CHECK-NEXT:    uunpkhi z24.d, z5.s
-; CHECK-NEXT:    uunpklo z28.d, z6.s
-; CHECK-NEXT:    uunpkhi z29.d, z6.s
-; CHECK-NEXT:    uunpklo z8.d, z25.s
-; CHECK-NEXT:    uunpkhi z9.d, z25.s
-; CHECK-NEXT:    uunpklo z16.s, z17.h
-; CHECK-NEXT:    uunpklo z11.d, z31.s
-; CHECK-NEXT:    uunpkhi z14.d, z31.s
-; CHECK-NEXT:    uunpkhi z17.s, z17.h
-; CHECK-NEXT:    movprfx z30, z4
-; CHECK-NEXT:    frintx z30.h, p0/m, z4.h
-; CHECK-NEXT:    movprfx z4, z7
-; CHECK-NEXT:    frintx z4.h, p0/m, z7.h
-; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
-; CHECK-NEXT:    movprfx z6, z24
-; CHECK-NEXT:    frintx z6.h, p0/m, z24.h
-; CHECK-NEXT:    movprfx z7, z28
-; CHECK-NEXT:    frintx z7.h, p0/m, z28.h
-; CHECK-NEXT:    movprfx z25, z29
-; CHECK-NEXT:    frintx z25.h, p0/m, z29.h
-; CHECK-NEXT:    movprfx z3, z9
-; CHECK-NEXT:    frintx z3.h, p0/m, z9.h
-; CHECK-NEXT:    mov z5.h, w9
-; CHECK-NEXT:    movprfx z31, z11
-; CHECK-NEXT:    frintx z31.h, p0/m, z11.h
-; CHECK-NEXT:    movprfx z9, z14
-; CHECK-NEXT:    frintx z9.h, p0/m, z14.h
-; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z26.h
-; CHECK-NEXT:    fcmge p4.h, p0/z, z4.h, z26.h
-; CHECK-NEXT:    movprfx z24, z0
-; CHECK-NEXT:    fcvtzs z24.d, p0/m, z0.h
-; CHECK-NEXT:    fcmge p2.h, p0/z, z30.h, z26.h
-; CHECK-NEXT:    movprfx z29, z4
-; CHECK-NEXT:    fcvtzs z29.d, p0/m, z4.h
-; CHECK-NEXT:    fcmge p6.h, p0/z, z6.h, z26.h
-; CHECK-NEXT:    movprfx z28, z30
-; CHECK-NEXT:    fcvtzs z28.d, p0/m, z30.h
-; CHECK-NEXT:    movprfx z10, z6
-; CHECK-NEXT:    fcvtzs z10.d, p0/m, z6.h
-; CHECK-NEXT:    str z0, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    fcmge p3.h, p0/z, z7.h, z26.h
-; CHECK-NEXT:    movprfx z13, z7
-; CHECK-NEXT:    fcvtzs z13.d, p0/m, z7.h
-; CHECK-NEXT:    movprfx z15, z25
-; CHECK-NEXT:    fcvtzs z15.d, p0/m, z25.h
-; CHECK-NEXT:    not p5.b, p0/z, p1.b
-; CHECK-NEXT:    movprfx z18, z3
-; CHECK-NEXT:    fcvtzs z18.d, p0/m, z3.h
-; CHECK-NEXT:    movprfx z20, z31
-; CHECK-NEXT:    fcvtzs z20.d, p0/m, z31.h
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    movprfx z21, z9
-; CHECK-NEXT:    fcvtzs z21.d, p0/m, z9.h
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z30.h, z5.h
-; CHECK-NEXT:    sel z0.d, p5, z27.d, z24.d
-; CHECK-NEXT:    not p7.b, p0/z, p2.b
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z4.h, z5.h
-; CHECK-NEXT:    mov z29.d, p4/m, z27.d
-; CHECK-NEXT:    fcmge p4.h, p0/z, z25.h, z26.h
-; CHECK-NEXT:    not p5.b, p0/z, p6.b
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    fcmge p6.h, p0/z, z9.h, z26.h
-; CHECK-NEXT:    fcmgt p9.h, p0/z, z6.h, z5.h
-; CHECK-NEXT:    str z0, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    sel z0.d, p7, z27.d, z28.d
-; CHECK-NEXT:    movprfx z28, z8
-; CHECK-NEXT:    frintx z28.h, p0/m, z8.h
-; CHECK-NEXT:    sel z8.d, p5, z27.d, z10.d
-; CHECK-NEXT:    uunpklo z10.d, z12.s
-; CHECK-NEXT:    uunpkhi z12.d, z12.s
-; CHECK-NEXT:    not p5.b, p0/z, p4.b
-; CHECK-NEXT:    sel z11.d, p3, z27.d, z13.d
-; CHECK-NEXT:    uunpklo z13.d, z16.s
-; CHECK-NEXT:    fcmge p3.h, p0/z, z3.h, z26.h
-; CHECK-NEXT:    not p6.b, p0/z, p6.b
-; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    sel z24.d, p5, z27.d, z15.d
-; CHECK-NEXT:    uunpkhi z15.d, z16.s
-; CHECK-NEXT:    movprfx z14, z28
-; CHECK-NEXT:    fcvtzs z14.d, p0/m, z28.h
-; CHECK-NEXT:    frintx z10.h, p0/m, z10.h
-; CHECK-NEXT:    uunpklo z16.d, z17.s
-; CHECK-NEXT:    frintx z12.h, p0/m, z12.h
-; CHECK-NEXT:    uunpkhi z17.d, z17.s
-; CHECK-NEXT:    movprfx z19, z13
-; CHECK-NEXT:    frintx z19.h, p0/m, z13.h
-; CHECK-NEXT:    fcmge p4.h, p0/z, z28.h, z26.h
-; CHECK-NEXT:    fcmge p5.h, p0/z, z31.h, z26.h
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    frintx z15.h, p0/m, z15.h
-; CHECK-NEXT:    fcmge p7.h, p0/z, z10.h, z26.h
+; CHECK-NEXT:    uunpkhi z14.s, z2.h
+; CHECK-NEXT:    uunpklo z15.s, z3.h
+; CHECK-NEXT:    uunpklo z7.d, z0.s
+; CHECK-NEXT:    uunpklo z5.d, z4.s
+; CHECK-NEXT:    uunpkhi z6.d, z4.s
+; CHECK-NEXT:    uunpklo z29.d, z25.s
+; CHECK-NEXT:    uunpkhi z26.d, z0.s
+; CHECK-NEXT:    uunpklo z8.d, z10.s
+; CHECK-NEXT:    uunpkhi z11.d, z10.s
+; CHECK-NEXT:    uunpklo z10.d, z9.s
+; CHECK-NEXT:    uunpkhi z13.d, z9.s
+; CHECK-NEXT:    mov z31.d, #0x8000000000000000
+; CHECK-NEXT:    uunpklo z16.d, z12.s
+; CHECK-NEXT:    uunpklo z18.d, z14.s
+; CHECK-NEXT:    movprfx z1, z7
+; CHECK-NEXT:    frintx z1.h, p0/m, z7.h
+; CHECK-NEXT:    movprfx z4, z5
+; CHECK-NEXT:    frintx z4.h, p0/m, z5.h
+; CHECK-NEXT:    movprfx z5, z6
+; CHECK-NEXT:    frintx z5.h, p0/m, z6.h
+; CHECK-NEXT:    movprfx z7, z29
+; CHECK-NEXT:    frintx z7.h, p0/m, z29.h
+; CHECK-NEXT:    movprfx z6, z26
+; CHECK-NEXT:    frintx z6.h, p0/m, z26.h
+; CHECK-NEXT:    mov z29.d, #0x8000000000000000
+; CHECK-NEXT:    movprfx z9, z11
+; CHECK-NEXT:    frintx z9.h, p0/m, z11.h
+; CHECK-NEXT:    movprfx z3, z10
+; CHECK-NEXT:    frintx z3.h, p0/m, z10.h
+; CHECK-NEXT:    movprfx z10, z13
+; CHECK-NEXT:    frintx z10.h, p0/m, z13.h
+; CHECK-NEXT:    uunpkhi z26.d, z25.s
+; CHECK-NEXT:    uunpkhi z13.d, z12.s
+; CHECK-NEXT:    frintx z8.h, p0/m, z8.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z1.h, z27.h
+; CHECK-NEXT:    uunpkhi z14.d, z14.s
+; CHECK-NEXT:    mov z0.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p2.h, p0/z, z7.h, z27.h
+; CHECK-NEXT:    mov z28.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p4.h, p0/z, z4.h, z27.h
+; CHECK-NEXT:    fcmge p5.h, p0/z, z5.h, z27.h
+; CHECK-NEXT:    uunpklo z19.d, z15.s
+; CHECK-NEXT:    uunpkhi z15.d, z15.s
+; CHECK-NEXT:    movprfx z20, z13
+; CHECK-NEXT:    frintx z20.h, p0/m, z13.h
+; CHECK-NEXT:    mov z30.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p1.h, p0/z, z6.h, z27.h
+; CHECK-NEXT:    frintx z26.h, p0/m, z26.h
+; CHECK-NEXT:    fcvtzs z29.d, p3/m, z1.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z9.h, z27.h
+; CHECK-NEXT:    mov z11.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z31.d, p2/m, z7.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z8.h, z27.h
+; CHECK-NEXT:    mov z17.d, #0x8000000000000000
 ; CHECK-NEXT:    frintx z16.h, p0/m, z16.h
-; CHECK-NEXT:    fcmge p8.h, p0/z, z12.h, z26.h
-; CHECK-NEXT:    frintx z17.h, p0/m, z17.h
-; CHECK-NEXT:    movprfx z23, z19
-; CHECK-NEXT:    fcvtzs z23.d, p0/m, z19.h
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    sel z13.d, p3, z27.d, z18.d
-; CHECK-NEXT:    fcmge p3.h, p0/z, z19.h, z26.h
-; CHECK-NEXT:    movprfx z0, z15
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z15.h
-; CHECK-NEXT:    sel z22.d, p4, z27.d, z14.d
-; CHECK-NEXT:    sel z18.d, p6, z27.d, z21.d
-; CHECK-NEXT:    movprfx z21, z12
-; CHECK-NEXT:    fcvtzs z21.d, p0/m, z12.h
-; CHECK-NEXT:    movprfx z1, z16
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z16.h
-; CHECK-NEXT:    sel z14.d, p5, z27.d, z20.d
-; CHECK-NEXT:    fcmge p4.h, p0/z, z15.h, z26.h
-; CHECK-NEXT:    movprfx z20, z10
-; CHECK-NEXT:    fcvtzs z20.d, p0/m, z10.h
-; CHECK-NEXT:    movprfx z2, z17
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z17.h
-; CHECK-NEXT:    not p5.b, p0/z, p7.b
-; CHECK-NEXT:    fcmge p6.h, p0/z, z16.h, z26.h
-; CHECK-NEXT:    not p7.b, p0/z, p8.b
-; CHECK-NEXT:    fcmge p8.h, p0/z, z17.h, z26.h
-; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    mov z20.d, p5/m, z27.d
-; CHECK-NEXT:    mov z21.d, p7/m, z27.d
-; CHECK-NEXT:    not p5.b, p0/z, p6.b
-; CHECK-NEXT:    mov z23.d, p3/m, z27.d
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z17.h, z5.h
-; CHECK-NEXT:    not p6.b, p0/z, p8.b
-; CHECK-NEXT:    mov z0.d, p4/m, z27.d
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z16.h, z5.h
-; CHECK-NEXT:    mov z1.d, p5/m, z27.d
-; CHECK-NEXT:    fcmuo p5.h, p0/z, z16.h, z16.h
-; CHECK-NEXT:    mov z29.d, p2/m, z26.d
-; CHECK-NEXT:    mov z2.d, p6/m, z27.d
-; CHECK-NEXT:    ldr z27, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    fcmgt p6.h, p0/z, z7.h, z5.h
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z12.h, z5.h
-; CHECK-NEXT:    fcmuo p8.h, p0/z, z17.h, z17.h
-; CHECK-NEXT:    fcmgt p7.h, p0/z, z28.h, z5.h
-; CHECK-NEXT:    mov z1.d, p4/m, z26.d
-; CHECK-NEXT:    fcmuo p4.h, p0/z, z15.h, z15.h
-; CHECK-NEXT:    mov z8.d, p9/m, z26.d
-; CHECK-NEXT:    mov z27.d, p1/m, z26.d
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z15.h, z5.h
-; CHECK-NEXT:    mov z2.d, p3/m, z26.d
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z19.h, z5.h
-; CHECK-NEXT:    mov z11.d, p6/m, z26.d
-; CHECK-NEXT:    fcmuo p6.h, p0/z, z19.h, z19.h
-; CHECK-NEXT:    mov z1.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p5.h, p0/z, z9.h, z5.h
-; CHECK-NEXT:    sel z15.d, p2, z26.d, z21.d
-; CHECK-NEXT:    fcmuo p2.h, p0/z, z12.h, z12.h
-; CHECK-NEXT:    mov z2.d, p8/m, #0 // =0x0
-; CHECK-NEXT:    sel z16.d, p7, z26.d, z22.d
-; CHECK-NEXT:    mov z0.d, p1/m, z26.d
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z10.h, z5.h
-; CHECK-NEXT:    str z1, [x8, #14, mul vl]
-; CHECK-NEXT:    sel z17.d, p3, z26.d, z23.d
-; CHECK-NEXT:    fcmuo p3.h, p0/z, z10.h, z10.h
-; CHECK-NEXT:    str z2, [x8, #15, mul vl]
-; CHECK-NEXT:    sel z2.d, p5, z26.d, z18.d
-; CHECK-NEXT:    fcmuo p5.h, p0/z, z9.h, z9.h
-; CHECK-NEXT:    mov z0.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z3.h, z5.h
-; CHECK-NEXT:    mov z15.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    sel z1.d, p1, z26.d, z20.d
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z31.h, z5.h
-; CHECK-NEXT:    mov z17.d, p6/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p2.h, p0/z, z31.h, z31.h
+; CHECK-NEXT:    frintx z14.h, p0/m, z14.h
+; CHECK-NEXT:    fcvtzs z0.d, p4/m, z4.h
+; CHECK-NEXT:    fcvtzs z28.d, p5/m, z5.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z3.h, z27.h
+; CHECK-NEXT:    mov z12.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p5.h, p0/z, z10.h, z27.h
+; CHECK-NEXT:    mov z13.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z19.h, p0/m, z19.h
+; CHECK-NEXT:    frintx z15.h, p0/m, z15.h
+; CHECK-NEXT:    mov z24.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z30.d, p1/m, z6.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z26.h, z27.h
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    frintx z18.h, p0/m, z18.h
+; CHECK-NEXT:    fcvtzs z11.d, p3/m, z9.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z20.h, z27.h
+; CHECK-NEXT:    mov z25.h, w9
+; CHECK-NEXT:    fcvtzs z17.d, p2/m, z8.h
+; CHECK-NEXT:    fcmge p6.h, p0/z, z16.h, z27.h
+; CHECK-NEXT:    mov z21.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p2.h, p0/z, z14.h, z27.h
+; CHECK-NEXT:    mov z22.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z12.d, p4/m, z3.h
+; CHECK-NEXT:    fcvtzs z13.d, p5/m, z10.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z19.h, z27.h
+; CHECK-NEXT:    mov z23.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p5.h, p0/z, z15.h, z27.h
+; CHECK-NEXT:    mov z0.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z2.d, p1/m, z26.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z18.h, z27.h
+; CHECK-NEXT:    fcvtzs z24.d, p3/m, z20.h
+; CHECK-NEXT:    mov z27.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p11.h, p0/z, z20.h, z25.h
+; CHECK-NEXT:    fcvtzs z21.d, p6/m, z16.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z16.h, z25.h
+; CHECK-NEXT:    fcmuo p6.h, p0/z, z16.h, z16.h
+; CHECK-NEXT:    fcvtzs z22.d, p2/m, z14.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z8.h, z25.h
+; CHECK-NEXT:    mov z16.d, #0x8000000000000000
+; CHECK-NEXT:    fcmgt p7.h, p0/z, z5.h, z25.h
+; CHECK-NEXT:    fcvtzs z23.d, p4/m, z19.h
+; CHECK-NEXT:    fcvtzs z0.d, p5/m, z15.h
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z20.h, z20.h
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z15.h, z25.h
+; CHECK-NEXT:    mov z24.d, p11/m, z27.d
+; CHECK-NEXT:    sel z20.d, p3, z27.d, z21.d
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z19.h, z25.h
+; CHECK-NEXT:    fcmgt p8.h, p0/z, z1.h, z25.h
+; CHECK-NEXT:    mov z17.d, p2/m, z27.d
+; CHECK-NEXT:    fcvtzs z16.d, p1/m, z18.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z18.h, z25.h
+; CHECK-NEXT:    mov z28.d, p7/m, z27.d
+; CHECK-NEXT:    fcmgt p7.h, p0/z, z14.h, z25.h
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z15.h, z15.h
+; CHECK-NEXT:    mov z0.d, p5/m, z27.d
+; CHECK-NEXT:    mov z24.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z10.h, z25.h
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z19.h, z19.h
+; CHECK-NEXT:    sel z19.d, p3, z27.d, z23.d
+; CHECK-NEXT:    fcmuo p3.h, p0/z, z14.h, z14.h
+; CHECK-NEXT:    mov z20.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p9.h, p0/z, z6.h, z25.h
+; CHECK-NEXT:    fcmgt p10.h, p0/z, z7.h, z25.h
+; CHECK-NEXT:    str z24, [x8, #15, mul vl]
+; CHECK-NEXT:    sel z24.d, p2, z27.d, z16.d
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z3.h, z25.h
+; CHECK-NEXT:    sel z15.d, p7, z27.d, z22.d
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z13.d, p5/m, z27.d
+; CHECK-NEXT:    str z20, [x8, #14, mul vl]
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z9.h, z25.h
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z18.h, z18.h
+; CHECK-NEXT:    mov z19.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z10.h, z10.h
+; CHECK-NEXT:    mov z29.d, p8/m, z27.d
 ; CHECK-NEXT:    str z0, [x8, #13, mul vl]
-; CHECK-NEXT:    mov z2.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p5.h, p0/z, z25.h, z25.h
-; CHECK-NEXT:    str z17, [x8, #12, mul vl]
-; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z25.h, z5.h
+; CHECK-NEXT:    mov z15.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p3.h, p0/z, z3.h, z3.h
+; CHECK-NEXT:    sel z0.d, p2, z27.d, z12.d
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z9.h, z9.h
+; CHECK-NEXT:    mov z30.d, p9/m, z27.d
+; CHECK-NEXT:    str z19, [x8, #12, mul vl]
+; CHECK-NEXT:    sel z3.d, p5, z27.d, z11.d
+; CHECK-NEXT:    mov z24.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    str z15, [x8, #11, mul vl]
-; CHECK-NEXT:    sel z0.d, p1, z26.d, z14.d
-; CHECK-NEXT:    fcmuo p1.h, p0/z, z3.h, z3.h
-; CHECK-NEXT:    sel z3.d, p4, z26.d, z13.d
-; CHECK-NEXT:    fcmuo p4.h, p0/z, z28.h, z28.h
-; CHECK-NEXT:    str z1, [x8, #10, mul vl]
-; CHECK-NEXT:    sel z1.d, p3, z26.d, z24.d
-; CHECK-NEXT:    fcmuo p3.h, p0/z, z7.h, z7.h
-; CHECK-NEXT:    ldr z7, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    str z2, [x8, #9, mul vl]
-; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p1.h, p0/z, z6.h, z6.h
-; CHECK-NEXT:    mov z16.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p4.h, p0/z, z4.h, z4.h
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z7.h, z5.h
-; CHECK-NEXT:    mov z1.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p5.h, p0/z, z30.h, z30.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z26.h, z25.h
+; CHECK-NEXT:    mov z13.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p5.h, p0/z, z8.h, z8.h
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z26.h, z26.h
+; CHECK-NEXT:    str z24, [x8, #10, mul vl]
+; CHECK-NEXT:    mov z3.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z4.h, z25.h
+; CHECK-NEXT:    str z13, [x8, #9, mul vl]
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z6.h, z6.h
+; CHECK-NEXT:    mov z31.d, p10/m, z27.d
 ; CHECK-NEXT:    str z0, [x8, #8, mul vl]
-; CHECK-NEXT:    fcmuo p0.h, p0/z, z7.h, z7.h
-; CHECK-NEXT:    mov z11.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.d, p1/m, z27.d
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z7.h, z7.h
 ; CHECK-NEXT:    str z3, [x8, #7, mul vl]
-; CHECK-NEXT:    ldr z0, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    str z16, [x8, #6, mul vl]
-; CHECK-NEXT:    mov z8.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    str z1, [x8, #5, mul vl]
-; CHECK-NEXT:    mov z29.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    mov z27.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    str z11, [x8, #4, mul vl]
-; CHECK-NEXT:    str z8, [x8, #3, mul vl]
-; CHECK-NEXT:    mov z0.d, p2/m, z26.d
-; CHECK-NEXT:    str z29, [x8, #2, mul vl]
-; CHECK-NEXT:    str z27, [x8, #1, mul vl]
+; CHECK-NEXT:    mov z17.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p5.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z5.h, z5.h
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z4.h, z4.h
+; CHECK-NEXT:    str z17, [x8, #6, mul vl]
+; CHECK-NEXT:    mov z31.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z30.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p3/m, z27.d
+; CHECK-NEXT:    mov z29.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    str z2, [x8, #5, mul vl]
+; CHECK-NEXT:    str z31, [x8, #4, mul vl]
+; CHECK-NEXT:    mov z28.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    str z30, [x8, #3, mul vl]
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    str z29, [x8, #2, mul vl]
+; CHECK-NEXT:    str z28, [x8, #1, mul vl]
 ; CHECK-NEXT:    str z0, [x8]
-; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -593,6 +512,8 @@ define <vscale x 32 x iXLen> @lrint_v32f16(<vscale x 32 x half> %x) {
 ; CHECK-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
@@ -612,20 +533,17 @@ define <vscale x 1 x iXLen> @lrint_v1f32(<vscale x 1 x float> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
-; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.s
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z3.s
-; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.s
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 1 x iXLen> @llvm.lrint.nxv1iXLen.nxv1f32(<vscale x 1 x float> %x)
@@ -638,20 +556,17 @@ define <vscale x 2 x iXLen> @lrint_v2f32(<vscale x 2 x float> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
-; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.s
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z3.s
-; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.s
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x iXLen> @llvm.lrint.nxv2iXLen.nxv2f32(<vscale x 2 x float> %x)
@@ -662,43 +577,30 @@ declare <vscale x 2 x iXLen> @llvm.lrint.nxv2iXLen.nxv2f32(<vscale x 2 x float>)
 define <vscale x 4 x iXLen> @lrint_v4f32(<vscale x 4 x float> %x) {
 ; CHECK-LABEL: lrint_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
 ; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
-; CHECK-NEXT:    mov z3.s, w8
-; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
+; CHECK-NEXT:    mov z4.d, #0x8000000000000000
+; CHECK-NEXT:    mov z5.s, w8
 ; CHECK-NEXT:    frintx z1.s, p0/m, z1.s
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z1.s, z2.s
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z2.s
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
-; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z1.s
-; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.s
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z1.s, z3.s
-; CHECK-NEXT:    fcmgt p4.s, p0/z, z0.s, z3.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    sel z3.d, p1, z2.d, z4.d
-; CHECK-NEXT:    fcmuo p1.s, p0/z, z1.s, z1.s
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmuo p3.s, p0/z, z1.s, z1.s
+; CHECK-NEXT:    fcvtzs z3.d, p1/m, z1.s
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z1.s, z5.s
+; CHECK-NEXT:    fcvtzs z4.d, p2/m, z0.s
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z5.s
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    sel z2.d, p2, z2.d, z5.d
-; CHECK-NEXT:    sel z0.d, p3, z6.d, z3.d
-; CHECK-NEXT:    sel z1.d, p4, z6.d, z2.d
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z3.d
+; CHECK-NEXT:    sel z1.d, p2, z2.d, z4.d
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x iXLen> @llvm.lrint.nxv4iXLen.nxv4f32(<vscale x 4 x float> %x)
   ret <vscale x 4 x iXLen> %a
@@ -710,7 +612,6 @@ define <vscale x 8 x iXLen> @lrint_v8f32(<vscale x 8 x float> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
@@ -719,57 +620,47 @@ define <vscale x 8 x iXLen> @lrint_v8f32(<vscale x 8 x float> %x) {
 ; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
 ; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
-; CHECK-NEXT:    uunpklo z3.d, z1.s
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z3.d, z1.s
 ; CHECK-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEXT:    mov z4.s, w8
 ; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
 ; CHECK-NEXT:    mov z5.d, #0x8000000000000000
-; CHECK-NEXT:    mov z6.s, w8
-; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z6.d, #0x8000000000000000
+; CHECK-NEXT:    mov z25.s, w8
+; CHECK-NEXT:    mov z7.d, #0x8000000000000000
 ; CHECK-NEXT:    frintx z2.s, p0/m, z2.s
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-NEXT:    mov z24.d, #0x8000000000000000
 ; CHECK-NEXT:    frintx z3.s, p0/m, z3.s
 ; CHECK-NEXT:    frintx z1.s, p0/m, z1.s
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z2.s, z4.s
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z4.s
-; CHECK-NEXT:    movprfx z7, z0
-; CHECK-NEXT:    fcvtzs z7.d, p0/m, z0.s
+; CHECK-NEXT:    fcmuo p6.s, p0/z, z0.s, z0.s
 ; CHECK-NEXT:    fcmge p3.s, p0/z, z3.s, z4.s
 ; CHECK-NEXT:    fcmge p4.s, p0/z, z1.s, z4.s
-; CHECK-NEXT:    movprfx z4, z2
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z2.s
-; CHECK-NEXT:    movprfx z24, z3
-; CHECK-NEXT:    fcvtzs z24.d, p0/m, z3.s
-; CHECK-NEXT:    movprfx z25, z1
-; CHECK-NEXT:    fcvtzs z25.d, p0/m, z1.s
-; CHECK-NEXT:    fcmgt p7.s, p0/z, z3.s, z6.s
-; CHECK-NEXT:    fcmgt p5.s, p0/z, z2.s, z6.s
-; CHECK-NEXT:    fcmgt p6.s, p0/z, z0.s, z6.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    mov z4.d, p1/m, z5.d
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z1.s, z6.s
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    sel z6.d, p2, z5.d, z7.d
-; CHECK-NEXT:    fcmuo p2.s, p0/z, z2.s, z2.s
-; CHECK-NEXT:    sel z7.d, p3, z5.d, z24.d
-; CHECK-NEXT:    fcmuo p3.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    sel z5.d, p4, z5.d, z25.d
-; CHECK-NEXT:    fcmuo p4.s, p0/z, z3.s, z3.s
+; CHECK-NEXT:    mov z4.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z1.s, z25.s
+; CHECK-NEXT:    fcvtzs z5.d, p1/m, z2.s
+; CHECK-NEXT:    fcvtzs z6.d, p2/m, z0.s
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z2.s, z25.s
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z25.s
+; CHECK-NEXT:    fcvtzs z7.d, p3/m, z3.s
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z3.s, z25.s
+; CHECK-NEXT:    fcvtzs z24.d, p4/m, z1.s
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z2.s, z2.s
+; CHECK-NEXT:    sel z0.d, p1, z4.d, z5.d
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z3.s, z3.s
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z1.s, z1.s
-; CHECK-NEXT:    sel z0.d, p5, z26.d, z4.d
-; CHECK-NEXT:    sel z1.d, p6, z26.d, z6.d
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z2.d, p7, z26.d, z7.d
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z3.d, p1, z26.d, z5.d
+; CHECK-NEXT:    sel z1.d, p2, z4.d, z6.d
+; CHECK-NEXT:    sel z2.d, p3, z4.d, z7.d
+; CHECK-NEXT:    sel z3.d, p5, z4.d, z24.d
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p4/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -783,7 +674,7 @@ define <vscale x 16 x iXLen> @lrint_v16f32(<vscale x 16 x float> %x) {
 ; CHECK-LABEL: lrint_v16f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
@@ -791,119 +682,106 @@ define <vscale x 16 x iXLen> @lrint_v16f32(<vscale x 16 x float> %x) {
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
 ; CHECK-NEXT:    uunpklo z4.d, z0.s
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
 ; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpklo z7.d, z1.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    uunpklo z24.d, z2.s
+; CHECK-NEXT:    uunpklo z6.d, z2.s
 ; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    uunpklo z25.d, z3.s
-; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
-; CHECK-NEXT:    movprfx z5, z4
-; CHECK-NEXT:    frintx z5.s, p0/m, z4.s
-; CHECK-NEXT:    movprfx z6, z0
-; CHECK-NEXT:    frintx z6.s, p0/m, z0.s
-; CHECK-NEXT:    mov z4.s, w8
-; CHECK-NEXT:    frintx z7.s, p0/m, z7.s
-; CHECK-NEXT:    movprfx z28, z1
-; CHECK-NEXT:    frintx z28.s, p0/m, z1.s
+; CHECK-NEXT:    uunpklo z5.d, z1.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpklo z7.d, z3.s
+; CHECK-NEXT:    mov z24.s, w8
 ; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
-; CHECK-NEXT:    mov z0.d, #0x8000000000000000
-; CHECK-NEXT:    frintx z24.s, p0/m, z24.s
-; CHECK-NEXT:    movprfx z29, z2
-; CHECK-NEXT:    frintx z29.s, p0/m, z2.s
-; CHECK-NEXT:    frintx z25.s, p0/m, z25.s
-; CHECK-NEXT:    movprfx z30, z3
-; CHECK-NEXT:    frintx z30.s, p0/m, z3.s
-; CHECK-NEXT:    mov z27.s, w8
-; CHECK-NEXT:    fcmge p1.s, p0/z, z5.s, z4.s
-; CHECK-NEXT:    fcmge p2.s, p0/z, z6.s, z4.s
-; CHECK-NEXT:    movprfx z1, z5
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z5.s
-; CHECK-NEXT:    movprfx z2, z6
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z6.s
-; CHECK-NEXT:    fcmge p5.s, p0/z, z7.s, z4.s
-; CHECK-NEXT:    fcmge p6.s, p0/z, z28.s, z4.s
-; CHECK-NEXT:    movprfx z3, z7
-; CHECK-NEXT:    fcvtzs z3.d, p0/m, z7.s
-; CHECK-NEXT:    fcmge p8.s, p0/z, z29.s, z4.s
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z5.s, z27.s
-; CHECK-NEXT:    fcmgt p7.s, p0/z, z6.s, z27.s
-; CHECK-NEXT:    fcmge p9.s, p0/z, z25.s, z4.s
-; CHECK-NEXT:    movprfx z31, z25
-; CHECK-NEXT:    fcvtzs z31.d, p0/m, z25.s
-; CHECK-NEXT:    not p4.b, p0/z, p1.b
-; CHECK-NEXT:    fcmuo p1.s, p0/z, z5.s, z5.s
-; CHECK-NEXT:    movprfx z5, z28
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z28.s
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    fcmge p10.s, p0/z, z30.s, z4.s
-; CHECK-NEXT:    movprfx z8, z30
-; CHECK-NEXT:    fcvtzs z8.d, p0/m, z30.s
-; CHECK-NEXT:    mov z1.d, p4/m, z0.d
-; CHECK-NEXT:    fcmge p4.s, p0/z, z24.s, z4.s
-; CHECK-NEXT:    movprfx z4, z29
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z29.s
-; CHECK-NEXT:    mov z2.d, p2/m, z0.d
-; CHECK-NEXT:    fcmuo p2.s, p0/z, z6.s, z6.s
-; CHECK-NEXT:    movprfx z6, z24
-; CHECK-NEXT:    fcvtzs z6.d, p0/m, z24.s
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    not p6.b, p0/z, p6.b
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    mov z3.d, p5/m, z0.d
-; CHECK-NEXT:    not p5.b, p0/z, p8.b
-; CHECK-NEXT:    mov z5.d, p6/m, z0.d
-; CHECK-NEXT:    fcmgt p8.s, p0/z, z7.s, z27.s
-; CHECK-NEXT:    not p6.b, p0/z, p9.b
-; CHECK-NEXT:    mov z6.d, p4/m, z0.d
-; CHECK-NEXT:    fcmuo p9.s, p0/z, z7.s, z7.s
-; CHECK-NEXT:    not p4.b, p0/z, p10.b
-; CHECK-NEXT:    fcmgt p10.s, p0/z, z28.s, z27.s
-; CHECK-NEXT:    sel z7.d, p5, z0.d, z4.d
-; CHECK-NEXT:    fcmgt p5.s, p0/z, z24.s, z27.s
-; CHECK-NEXT:    mov z31.d, p6/m, z0.d
-; CHECK-NEXT:    fcmgt p6.s, p0/z, z30.s, z27.s
-; CHECK-NEXT:    mov z8.d, p4/m, z0.d
-; CHECK-NEXT:    sel z0.d, p3, z26.d, z1.d
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z29.s, z27.s
-; CHECK-NEXT:    fcmgt p4.s, p0/z, z25.s, z27.s
-; CHECK-NEXT:    sel z1.d, p7, z26.d, z2.d
-; CHECK-NEXT:    fcmuo p7.s, p0/z, z28.s, z28.s
-; CHECK-NEXT:    sel z2.d, p8, z26.d, z3.d
-; CHECK-NEXT:    sel z3.d, p10, z26.d, z5.d
-; CHECK-NEXT:    fcmuo p8.s, p0/z, z29.s, z29.s
-; CHECK-NEXT:    sel z4.d, p5, z26.d, z6.d
-; CHECK-NEXT:    fcmuo p5.s, p0/z, z24.s, z24.s
-; CHECK-NEXT:    fcmuo p10.s, p0/z, z25.s, z25.s
-; CHECK-NEXT:    sel z5.d, p3, z26.d, z7.d
-; CHECK-NEXT:    fcmuo p0.s, p0/z, z30.s, z30.s
-; CHECK-NEXT:    sel z7.d, p6, z26.d, z8.d
-; CHECK-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z26.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z4.s, p0/m, z4.s
+; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-NEXT:    mov z30.s, w8
+; CHECK-NEXT:    movprfx z27, z2
+; CHECK-NEXT:    frintx z27.s, p0/m, z2.s
+; CHECK-NEXT:    uunpkhi z2.d, z3.s
+; CHECK-NEXT:    frintx z6.s, p0/m, z6.s
+; CHECK-NEXT:    movprfx z25, z1
+; CHECK-NEXT:    frintx z25.s, p0/m, z1.s
+; CHECK-NEXT:    frintx z5.s, p0/m, z5.s
+; CHECK-NEXT:    frintx z7.s, p0/m, z7.s
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
+; CHECK-NEXT:    mov z28.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p1.s, p0/z, z4.s, z24.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z24.s
+; CHECK-NEXT:    mov z29.d, #0x8000000000000000
+; CHECK-NEXT:    movprfx z9, z2
+; CHECK-NEXT:    frintx z9.s, p0/m, z2.s
+; CHECK-NEXT:    fcmge p5.s, p0/z, z6.s, z24.s
+; CHECK-NEXT:    mov z8.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p3.s, p0/z, z5.s, z24.s
+; CHECK-NEXT:    fcmge p4.s, p0/z, z25.s, z24.s
+; CHECK-NEXT:    fcmge p7.s, p0/z, z7.s, z24.s
+; CHECK-NEXT:    fcmge p6.s, p0/z, z27.s, z24.s
+; CHECK-NEXT:    mov z31.d, #0x8000000000000000
+; CHECK-NEXT:    mov z10.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z4.s
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z4.s, z30.s
+; CHECK-NEXT:    fcvtzs z26.d, p2/m, z0.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z9.s, z24.s
+; CHECK-NEXT:    mov z24.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z29.d, p5/m, z6.s
+; CHECK-NEXT:    fcvtzs z3.d, p3/m, z5.s
+; CHECK-NEXT:    fcvtzs z28.d, p4/m, z25.s
+; CHECK-NEXT:    fcvtzs z8.d, p7/m, z7.s
+; CHECK-NEXT:    fcmgt p4.s, p0/z, z0.s, z30.s
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z5.s, z30.s
+; CHECK-NEXT:    fcmgt p7.s, p0/z, z25.s, z30.s
+; CHECK-NEXT:    fcmgt p8.s, p0/z, z6.s, z30.s
+; CHECK-NEXT:    fcvtzs z31.d, p6/m, z27.s
+; CHECK-NEXT:    fcmuo p6.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    sel z0.d, p1, z10.d, z1.d
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z27.s, z30.s
+; CHECK-NEXT:    fcmgt p10.s, p0/z, z7.s, z30.s
+; CHECK-NEXT:    fcvtzs z24.d, p2/m, z9.s
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z9.s, z30.s
+; CHECK-NEXT:    fcmuo p3.s, p0/z, z4.s, z4.s
+; CHECK-NEXT:    fcmuo p9.s, p0/z, z5.s, z5.s
+; CHECK-NEXT:    sel z1.d, p4, z10.d, z26.d
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z25.s, z25.s
+; CHECK-NEXT:    sel z2.d, p5, z10.d, z3.d
+; CHECK-NEXT:    sel z3.d, p7, z10.d, z28.d
+; CHECK-NEXT:    sel z4.d, p8, z10.d, z29.d
+; CHECK-NEXT:    fcmuo p5.s, p0/z, z6.s, z6.s
+; CHECK-NEXT:    fcmuo p7.s, p0/z, z27.s, z27.s
+; CHECK-NEXT:    fcmuo p8.s, p0/z, z7.s, z7.s
+; CHECK-NEXT:    sel z5.d, p1, z10.d, z31.d
+; CHECK-NEXT:    sel z6.d, p10, z10.d, z8.d
+; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z9.s, z9.s
+; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    sel z7.d, p2, z10.d, z24.d
+; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p6/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z6.d, p4, z26.d, z31.d
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z2.d, p9/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    mov z3.d, p4/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z4.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    mov z5.d, p8/m, #0 // =0x0
+; CHECK-NEXT:    mov z5.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z6.d, p8/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z6.d, p10/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z7.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %a = call <vscale x 16 x iXLen> @llvm.lrint.nxv16iXLen.nxv16f32(<vscale x 16 x float> %x)
@@ -916,6 +794,8 @@ define <vscale x 32 x iXLen> @lrint_v32f32(<vscale x 32 x float> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-17
+; CHECK-NEXT:    str p11, [sp] // 2-byte Folded Spill
+; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
@@ -938,8 +818,8 @@ define <vscale x 32 x iXLen> @lrint_v32f32(<vscale x 32 x float> %x) {
 ; CHECK-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-3
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
@@ -950,224 +830,185 @@ define <vscale x 32 x iXLen> @lrint_v32f32(<vscale x 32 x float> %x) {
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
 ; CHECK-NEXT:    uunpklo z24.d, z0.s
-; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpkhi z25.d, z0.s
 ; CHECK-NEXT:    mov w9, #-553648128 // =0xdf000000
 ; CHECK-NEXT:    uunpklo z26.d, z1.s
-; CHECK-NEXT:    uunpkhi z25.d, z0.s
-; CHECK-NEXT:    uunpkhi z28.d, z1.s
-; CHECK-NEXT:    mov z29.s, w9
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z27.d, z2.s
+; CHECK-NEXT:    uunpkhi z9.d, z2.s
+; CHECK-NEXT:    uunpklo z11.d, z3.s
+; CHECK-NEXT:    uunpkhi z12.d, z3.s
+; CHECK-NEXT:    mov z10.s, w9
 ; CHECK-NEXT:    mov w9, #1593835519 // =0x5effffff
-; CHECK-NEXT:    mov z17.d, z5.d
-; CHECK-NEXT:    mov z27.d, #0x8000000000000000
-; CHECK-NEXT:    uunpkhi z30.d, z2.s
-; CHECK-NEXT:    uunpklo z8.d, z3.s
 ; CHECK-NEXT:    movprfx z0, z24
 ; CHECK-NEXT:    frintx z0.s, p0/m, z24.s
-; CHECK-NEXT:    uunpkhi z9.d, z3.s
+; CHECK-NEXT:    movprfx z24, z25
+; CHECK-NEXT:    frintx z24.s, p0/m, z25.s
+; CHECK-NEXT:    uunpklo z13.d, z4.s
+; CHECK-NEXT:    movprfx z25, z26
+; CHECK-NEXT:    frintx z25.s, p0/m, z26.s
+; CHECK-NEXT:    movprfx z26, z1
+; CHECK-NEXT:    frintx z26.s, p0/m, z1.s
 ; CHECK-NEXT:    uunpkhi z14.d, z4.s
-; CHECK-NEXT:    movprfx z24, z26
-; CHECK-NEXT:    frintx z24.s, p0/m, z26.s
-; CHECK-NEXT:    movprfx z1, z25
-; CHECK-NEXT:    frintx z1.s, p0/m, z25.s
-; CHECK-NEXT:    movprfx z5, z28
-; CHECK-NEXT:    frintx z5.s, p0/m, z28.s
-; CHECK-NEXT:    uunpklo z26.d, z2.s
-; CHECK-NEXT:    uunpklo z16.d, z17.s
-; CHECK-NEXT:    mov z25.s, w9
-; CHECK-NEXT:    movprfx z28, z30
-; CHECK-NEXT:    frintx z28.s, p0/m, z30.s
-; CHECK-NEXT:    movprfx z30, z8
-; CHECK-NEXT:    frintx z30.s, p0/m, z8.s
-; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z29.s
-; CHECK-NEXT:    movprfx z31, z0
-; CHECK-NEXT:    fcvtzs z31.d, p0/m, z0.s
-; CHECK-NEXT:    str z0, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    fcmge p2.s, p0/z, z1.s, z29.s
-; CHECK-NEXT:    fcmge p3.s, p0/z, z24.s, z29.s
-; CHECK-NEXT:    fcmge p5.s, p0/z, z5.s, z29.s
-; CHECK-NEXT:    frintx z26.s, p0/m, z26.s
-; CHECK-NEXT:    movprfx z10, z1
-; CHECK-NEXT:    fcvtzs z10.d, p0/m, z1.s
-; CHECK-NEXT:    movprfx z11, z24
-; CHECK-NEXT:    fcvtzs z11.d, p0/m, z24.s
-; CHECK-NEXT:    movprfx z12, z5
-; CHECK-NEXT:    fcvtzs z12.d, p0/m, z5.s
-; CHECK-NEXT:    movprfx z15, z28
-; CHECK-NEXT:    fcvtzs z15.d, p0/m, z28.s
-; CHECK-NEXT:    str z1, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    not p4.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z1.s, z25.s
-; CHECK-NEXT:    fcmgt p9.s, p0/z, z5.s, z25.s
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    sel z0.d, p4, z27.d, z31.d
-; CHECK-NEXT:    fcmge p4.s, p0/z, z26.s, z29.s
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    movprfx z13, z26
-; CHECK-NEXT:    fcvtzs z13.d, p0/m, z26.s
-; CHECK-NEXT:    sel z31.d, p2, z27.d, z10.d
-; CHECK-NEXT:    uunpklo z10.d, z4.s
-; CHECK-NEXT:    sel z8.d, p3, z27.d, z11.d
-; CHECK-NEXT:    fcmge p3.s, p0/z, z28.s, z29.s
-; CHECK-NEXT:    sel z11.d, p5, z27.d, z12.d
-; CHECK-NEXT:    movprfx z4, z9
-; CHECK-NEXT:    frintx z4.s, p0/m, z9.s
-; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    not p5.b, p0/z, p4.b
-; CHECK-NEXT:    fcmge p4.s, p0/z, z30.s, z29.s
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z24.s, z25.s
-; CHECK-NEXT:    sel z12.d, p5, z27.d, z13.d
-; CHECK-NEXT:    uunpkhi z13.d, z17.s
-; CHECK-NEXT:    movprfx z9, z10
-; CHECK-NEXT:    frintx z9.s, p0/m, z10.s
-; CHECK-NEXT:    movprfx z10, z14
-; CHECK-NEXT:    frintx z10.s, p0/m, z14.s
-; CHECK-NEXT:    uunpkhi z17.d, z6.s
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    uunpklo z14.d, z6.s
-; CHECK-NEXT:    movprfx z6, z16
-; CHECK-NEXT:    frintx z6.s, p0/m, z16.s
-; CHECK-NEXT:    uunpklo z16.d, z7.s
+; CHECK-NEXT:    movprfx z2, z27
+; CHECK-NEXT:    frintx z2.s, p0/m, z27.s
+; CHECK-NEXT:    mov z31.d, #0x8000000000000000
+; CHECK-NEXT:    movprfx z27, z9
+; CHECK-NEXT:    frintx z27.s, p0/m, z9.s
+; CHECK-NEXT:    movprfx z9, z11
+; CHECK-NEXT:    frintx z9.s, p0/m, z11.s
+; CHECK-NEXT:    movprfx z11, z12
+; CHECK-NEXT:    frintx z11.s, p0/m, z12.s
+; CHECK-NEXT:    uunpklo z15.d, z7.s
 ; CHECK-NEXT:    uunpkhi z7.d, z7.s
-; CHECK-NEXT:    sel z3.d, p3, z27.d, z15.d
-; CHECK-NEXT:    fcmge p3.s, p0/z, z4.s, z29.s
-; CHECK-NEXT:    frintx z13.s, p0/m, z13.s
-; CHECK-NEXT:    movprfx z15, z30
-; CHECK-NEXT:    fcvtzs z15.d, p0/m, z30.s
-; CHECK-NEXT:    fcmge p5.s, p0/z, z9.s, z29.s
-; CHECK-NEXT:    fcmge p6.s, p0/z, z10.s, z29.s
-; CHECK-NEXT:    frintx z17.s, p0/m, z17.s
-; CHECK-NEXT:    movprfx z18, z4
-; CHECK-NEXT:    fcvtzs z18.d, p0/m, z4.s
-; CHECK-NEXT:    movprfx z20, z10
-; CHECK-NEXT:    fcvtzs z20.d, p0/m, z10.s
-; CHECK-NEXT:    frintx z16.s, p0/m, z16.s
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    movprfx z19, z14
-; CHECK-NEXT:    frintx z19.s, p0/m, z14.s
-; CHECK-NEXT:    movprfx z14, z9
-; CHECK-NEXT:    fcvtzs z14.d, p0/m, z9.s
-; CHECK-NEXT:    fcmge p7.s, p0/z, z6.s, z29.s
-; CHECK-NEXT:    fcmge p8.s, p0/z, z13.s, z29.s
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    mov z29.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p3.s, p0/z, z26.s, z10.s
+; CHECK-NEXT:    mov z30.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p5.s, p0/z, z0.s, z10.s
+; CHECK-NEXT:    mov z8.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p1.s, p0/z, z24.s, z10.s
+; CHECK-NEXT:    movprfx z12, z13
+; CHECK-NEXT:    frintx z12.s, p0/m, z13.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z25.s, z10.s
+; CHECK-NEXT:    fcmge p4.s, p0/z, z2.s, z10.s
+; CHECK-NEXT:    movprfx z13, z14
+; CHECK-NEXT:    frintx z13.s, p0/m, z14.s
+; CHECK-NEXT:    uunpklo z17.d, z5.s
+; CHECK-NEXT:    uunpkhi z18.d, z5.s
 ; CHECK-NEXT:    movprfx z21, z7
 ; CHECK-NEXT:    frintx z21.s, p0/m, z7.s
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    not p6.b, p0/z, p6.b
-; CHECK-NEXT:    mov z15.d, p4/m, z27.d
-; CHECK-NEXT:    fcmge p4.s, p0/z, z17.s, z29.s
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    sel z7.d, p3, z27.d, z18.d
-; CHECK-NEXT:    movprfx z0, z17
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z17.s
-; CHECK-NEXT:    sel z18.d, p6, z27.d, z20.d
-; CHECK-NEXT:    movprfx z20, z6
-; CHECK-NEXT:    fcvtzs z20.d, p0/m, z6.s
-; CHECK-NEXT:    fcmge p6.s, p0/z, z16.s, z29.s
-; CHECK-NEXT:    fcmge p3.s, p0/z, z19.s, z29.s
-; CHECK-NEXT:    mov z14.d, p5/m, z27.d
-; CHECK-NEXT:    not p5.b, p0/z, p7.b
-; CHECK-NEXT:    not p7.b, p0/z, p8.b
-; CHECK-NEXT:    fcmge p8.s, p0/z, z21.s, z29.s
-; CHECK-NEXT:    movprfx z1, z16
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z16.s
-; CHECK-NEXT:    movprfx z22, z13
-; CHECK-NEXT:    fcvtzs z22.d, p0/m, z13.s
-; CHECK-NEXT:    movprfx z23, z19
-; CHECK-NEXT:    fcvtzs z23.d, p0/m, z19.s
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    movprfx z2, z21
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z21.s
-; CHECK-NEXT:    mov z29.d, #0x7fffffffffffffff
-; CHECK-NEXT:    mov z20.d, p5/m, z27.d
-; CHECK-NEXT:    not p5.b, p0/z, p6.b
-; CHECK-NEXT:    mov z0.d, p4/m, z27.d
-; CHECK-NEXT:    fcmgt p4.s, p0/z, z16.s, z25.s
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    not p6.b, p0/z, p8.b
-; CHECK-NEXT:    mov z1.d, p5/m, z27.d
-; CHECK-NEXT:    mov z22.d, p7/m, z27.d
-; CHECK-NEXT:    mov z23.d, p3/m, z27.d
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z21.s, z25.s
-; CHECK-NEXT:    fcmuo p5.s, p0/z, z16.s, z16.s
-; CHECK-NEXT:    mov z2.d, p6/m, z27.d
-; CHECK-NEXT:    sel z27.d, p1, z29.d, z31.d
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z17.s, z25.s
-; CHECK-NEXT:    mov z1.d, p4/m, z29.d
-; CHECK-NEXT:    fcmgt p6.s, p0/z, z26.s, z25.s
-; CHECK-NEXT:    fcmgt p7.s, p0/z, z30.s, z25.s
-; CHECK-NEXT:    sel z31.d, p2, z29.d, z8.d
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z13.s, z25.s
-; CHECK-NEXT:    fcmuo p8.s, p0/z, z21.s, z21.s
-; CHECK-NEXT:    mov z2.d, p3/m, z29.d
-; CHECK-NEXT:    fcmuo p4.s, p0/z, z17.s, z17.s
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z19.s, z25.s
-; CHECK-NEXT:    mov z0.d, p1/m, z29.d
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z6.s, z25.s
-; CHECK-NEXT:    mov z1.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    sel z8.d, p9, z29.d, z11.d
-; CHECK-NEXT:    sel z11.d, p6, z29.d, z12.d
-; CHECK-NEXT:    sel z12.d, p7, z29.d, z15.d
-; CHECK-NEXT:    fcmgt p5.s, p0/z, z10.s, z25.s
-; CHECK-NEXT:    sel z15.d, p2, z29.d, z22.d
-; CHECK-NEXT:    fcmuo p2.s, p0/z, z13.s, z13.s
-; CHECK-NEXT:    str z1, [x8, #14, mul vl]
-; CHECK-NEXT:    mov z2.d, p8/m, #0 // =0x0
+; CHECK-NEXT:    uunpklo z19.d, z6.s
+; CHECK-NEXT:    uunpkhi z20.d, z6.s
+; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    fcvtzs z31.d, p3/m, z26.s
+; CHECK-NEXT:    fcmge p3.s, p0/z, z11.s, z10.s
+; CHECK-NEXT:    mov z5.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z15.s, p0/m, z15.s
+; CHECK-NEXT:    fcvtzs z1.d, p5/m, z0.s
+; CHECK-NEXT:    fcvtzs z29.d, p1/m, z24.s
+; CHECK-NEXT:    fcvtzs z30.d, p2/m, z25.s
+; CHECK-NEXT:    fcvtzs z8.d, p4/m, z2.s
+; CHECK-NEXT:    fcmge p1.s, p0/z, z27.s, z10.s
+; CHECK-NEXT:    mov z4.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p2.s, p0/z, z9.s, z10.s
+; CHECK-NEXT:    mov z16.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p4.s, p0/z, z12.s, z10.s
+; CHECK-NEXT:    mov z6.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p5.s, p0/z, z13.s, z10.s
+; CHECK-NEXT:    mov z14.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z17.s, p0/m, z17.s
+; CHECK-NEXT:    frintx z18.s, p0/m, z18.s
+; CHECK-NEXT:    frintx z19.s, p0/m, z19.s
+; CHECK-NEXT:    frintx z20.s, p0/m, z20.s
+; CHECK-NEXT:    mov z28.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z5.d, p3/m, z11.s
+; CHECK-NEXT:    fcmge p3.s, p0/z, z21.s, z10.s
+; CHECK-NEXT:    mov z3.s, w9
+; CHECK-NEXT:    fcmge p6.s, p0/z, z15.s, z10.s
+; CHECK-NEXT:    mov z22.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z4.d, p1/m, z27.s
+; CHECK-NEXT:    fcvtzs z16.d, p2/m, z9.s
+; CHECK-NEXT:    fcvtzs z6.d, p4/m, z12.s
+; CHECK-NEXT:    fcvtzs z14.d, p5/m, z13.s
+; CHECK-NEXT:    fcmge p1.s, p0/z, z17.s, z10.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z18.s, z10.s
+; CHECK-NEXT:    mov z23.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p4.s, p0/z, z19.s, z10.s
+; CHECK-NEXT:    fcmge p5.s, p0/z, z20.s, z10.s
+; CHECK-NEXT:    mov z10.d, #0x8000000000000000
+; CHECK-NEXT:    mov z0.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z28.d, p3/m, z21.s
+; CHECK-NEXT:    mov z7.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p11.s, p0/z, z21.s, z3.s
+; CHECK-NEXT:    fcvtzs z22.d, p6/m, z15.s
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z15.s, z3.s
+; CHECK-NEXT:    fcmuo p6.s, p0/z, z15.s, z15.s
+; CHECK-NEXT:    mov z15.d, #0x8000000000000000
+; CHECK-NEXT:    fcmgt p7.s, p0/z, z24.s, z3.s
+; CHECK-NEXT:    fcvtzs z23.d, p2/m, z18.s
+; CHECK-NEXT:    fcvtzs z10.d, p5/m, z20.s
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z9.s, z3.s
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z20.s, z3.s
+; CHECK-NEXT:    fcvtzs z0.d, p4/m, z19.s
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z21.s, z21.s
+; CHECK-NEXT:    mov z28.d, p11/m, z7.d
+; CHECK-NEXT:    sel z21.d, p3, z7.d, z22.d
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z19.s, z3.s
+; CHECK-NEXT:    fcvtzs z15.d, p1/m, z17.s
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z20.s, z20.s
+; CHECK-NEXT:    mov z29.d, p7/m, z7.d
+; CHECK-NEXT:    fcmgt p7.s, p0/z, z18.s, z3.s
+; CHECK-NEXT:    mov z16.d, p2/m, z7.d
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z17.s, z3.s
+; CHECK-NEXT:    mov z10.d, p5/m, z7.d
+; CHECK-NEXT:    mov z28.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z19.s, z19.s
+; CHECK-NEXT:    mov z0.d, p3/m, z7.d
+; CHECK-NEXT:    fcmuo p3.s, p0/z, z18.s, z18.s
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z13.s, z3.s
+; CHECK-NEXT:    mov z21.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p10.s, p0/z, z2.s, z3.s
+; CHECK-NEXT:    fcmgt p8.s, p0/z, z25.s, z3.s
+; CHECK-NEXT:    str z28, [x8, #15, mul vl]
+; CHECK-NEXT:    mov z10.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z17.s, z17.s
+; CHECK-NEXT:    sel z19.d, p7, z7.d, z23.d
+; CHECK-NEXT:    sel z28.d, p2, z7.d, z15.d
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z12.s, z3.s
+; CHECK-NEXT:    str z21, [x8, #14, mul vl]
 ; CHECK-NEXT:    mov z0.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    sel z1.d, p1, z29.d, z20.d
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z9.s, z25.s
-; CHECK-NEXT:    fcmuo p6.s, p0/z, z19.s, z19.s
-; CHECK-NEXT:    sel z16.d, p3, z29.d, z23.d
-; CHECK-NEXT:    fcmuo p3.s, p0/z, z6.s, z6.s
-; CHECK-NEXT:    fcmgt p4.s, p0/z, z4.s, z25.s
-; CHECK-NEXT:    str z2, [x8, #15, mul vl]
-; CHECK-NEXT:    sel z2.d, p5, z29.d, z18.d
-; CHECK-NEXT:    fcmuo p5.s, p0/z, z10.s, z10.s
-; CHECK-NEXT:    str z0, [x8, #13, mul vl]
-; CHECK-NEXT:    mov z15.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p2.s, p0/z, z9.s, z9.s
-; CHECK-NEXT:    sel z0.d, p1, z29.d, z14.d
-; CHECK-NEXT:    mov z16.d, p6/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p1.s, p0/z, z4.s, z4.s
-; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p3.s, p0/z, z28.s, z25.s
-; CHECK-NEXT:    sel z4.d, p4, z29.d, z7.d
-; CHECK-NEXT:    str z15, [x8, #11, mul vl]
-; CHECK-NEXT:    mov z2.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p5.s, p0/z, z28.s, z28.s
-; CHECK-NEXT:    str z16, [x8, #12, mul vl]
-; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p4.s, p0/z, z30.s, z30.s
-; CHECK-NEXT:    str z1, [x8, #10, mul vl]
-; CHECK-NEXT:    mov z4.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p1.s, p0/z, z5.s, z5.s
-; CHECK-NEXT:    sel z1.d, p3, z29.d, z3.d
-; CHECK-NEXT:    ldr z3, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    str z2, [x8, #9, mul vl]
+; CHECK-NEXT:    mov z14.d, p5/m, z7.d
+; CHECK-NEXT:    str z10, [x8, #13, mul vl]
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z11.s, z3.s
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z13.s, z13.s
+; CHECK-NEXT:    mov z19.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z28.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z27.s, z3.s
+; CHECK-NEXT:    str z0, [x8, #12, mul vl]
+; CHECK-NEXT:    fcmuo p3.s, p0/z, z12.s, z12.s
+; CHECK-NEXT:    sel z0.d, p2, z7.d, z6.d
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z11.s, z11.s
+; CHECK-NEXT:    fcmgt p9.s, p0/z, z26.s, z3.s
+; CHECK-NEXT:    mov z30.d, p8/m, z7.d
+; CHECK-NEXT:    str z19, [x8, #11, mul vl]
+; CHECK-NEXT:    mov z5.d, p5/m, z7.d
+; CHECK-NEXT:    fcmuo p5.s, p0/z, z9.s, z9.s
+; CHECK-NEXT:    str z28, [x8, #10, mul vl]
+; CHECK-NEXT:    mov z4.d, p1/m, z7.d
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z2.s, z2.s
+; CHECK-NEXT:    ldr z2, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z14.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z27.s, z27.s
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z5.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z26.s, z26.s
+; CHECK-NEXT:    mov z16.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p5.s, p0/z, z25.s, z25.s
+; CHECK-NEXT:    mov z31.d, p9/m, z7.d
+; CHECK-NEXT:    str z14, [x8, #9, mul vl]
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z2.s, z3.s
+; CHECK-NEXT:    mov z8.d, p10/m, z7.d
 ; CHECK-NEXT:    str z0, [x8, #8, mul vl]
-; CHECK-NEXT:    fcmuo p3.s, p0/z, z26.s, z26.s
-; CHECK-NEXT:    ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    str z4, [x8, #7, mul vl]
-; CHECK-NEXT:    mov z12.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z3.s, z25.s
-; CHECK-NEXT:    mov z1.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    mov z4.d, p4/m, #0 // =0x0
 ; CHECK-NEXT:    fcmuo p4.s, p0/z, z24.s, z24.s
+; CHECK-NEXT:    str z5, [x8, #7, mul vl]
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z2.s, z2.s
+; CHECK-NEXT:    mov z31.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    str z16, [x8, #6, mul vl]
 ; CHECK-NEXT:    mov z8.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p5.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    str z12, [x8, #6, mul vl]
-; CHECK-NEXT:    str z1, [x8, #5, mul vl]
-; CHECK-NEXT:    fcmuo p0.s, p0/z, z3.s, z3.s
-; CHECK-NEXT:    mov z11.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    str z8, [x8, #3, mul vl]
-; CHECK-NEXT:    mov z31.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p2/m, z29.d
-; CHECK-NEXT:    str z11, [x8, #4, mul vl]
-; CHECK-NEXT:    mov z27.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    str z31, [x8, #2, mul vl]
+; CHECK-NEXT:    mov z30.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    str z4, [x8, #5, mul vl]
+; CHECK-NEXT:    sel z0.d, p3, z7.d, z1.d
+; CHECK-NEXT:    str z31, [x8, #3, mul vl]
+; CHECK-NEXT:    mov z29.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    str z8, [x8, #4, mul vl]
+; CHECK-NEXT:    str z30, [x8, #2, mul vl]
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    str z27, [x8, #1, mul vl]
+; CHECK-NEXT:    str z29, [x8, #1, mul vl]
 ; CHECK-NEXT:    str z0, [x8]
-; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1184,6 +1025,8 @@ define <vscale x 32 x iXLen> @lrint_v32f32(<vscale x 32 x float> %x) {
 ; CHECK-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
@@ -1203,20 +1046,17 @@ define <vscale x 1 x iXLen> @lrint_v1f64(<vscale x 1 x double> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
 ; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
-; CHECK-NEXT:    mov z3.d, x8
+; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z3.d
-; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z2.d
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 1 x iXLen> @llvm.lrint.nxv1iXLen.nxv1f64(<vscale x 1 x double> %x)
@@ -1229,20 +1069,17 @@ define <vscale x 2 x iXLen> @lrint_v2f64(<vscale x 2 x double> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
 ; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
-; CHECK-NEXT:    mov z3.d, x8
+; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z3.d
-; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z2.d
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x iXLen> @llvm.lrint.nxv2iXLen.nxv2f64(<vscale x 2 x double> %x)
@@ -1253,41 +1090,28 @@ declare <vscale x 2 x iXLen> @llvm.lrint.nxv2iXLen.nxv2f64(<vscale x 2 x double>
 define <vscale x 4 x iXLen> @lrint_v4f64(<vscale x 4 x double> %x) {
 ; CHECK-LABEL: lrint_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
-; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    mov z4.d, #0x8000000000000000
 ; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
 ; CHECK-NEXT:    frintx z1.d, p0/m, z1.d
-; CHECK-NEXT:    mov z3.d, x8
+; CHECK-NEXT:    mov z5.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z2.d
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, z2.d
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z0.d
-; CHECK-NEXT:    movprfx z5, z1
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z1.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z0.d, z3.d
-; CHECK-NEXT:    fcmgt p4.d, p0/z, z1.d, z3.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    sel z3.d, p1, z2.d, z4.d
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    fcvtzs z3.d, p1/m, z0.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z5.d
+; CHECK-NEXT:    fcvtzs z4.d, p2/m, z1.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z1.d, z5.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z1.d, z1.d
-; CHECK-NEXT:    sel z2.d, p2, z2.d, z5.d
-; CHECK-NEXT:    sel z0.d, p3, z6.d, z3.d
-; CHECK-NEXT:    sel z1.d, p4, z6.d, z2.d
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p1, z2.d, z3.d
+; CHECK-NEXT:    sel z1.d, p2, z2.d, z4.d
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x iXLen> @llvm.lrint.nxv4iXLen.nxv4f64(<vscale x 4 x double> %x)
   ret <vscale x 4 x iXLen> %a
@@ -1299,7 +1123,6 @@ define <vscale x 8 x iXLen> @lrint_v8f64(<vscale x 8 x double> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
@@ -1309,52 +1132,42 @@ define <vscale x 8 x iXLen> @lrint_v8f64(<vscale x 8 x double> %x) {
 ; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
 ; CHECK-NEXT:    mov z5.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    mov z6.d, #0x8000000000000000
 ; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
-; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
 ; CHECK-NEXT:    frintx z1.d, p0/m, z1.d
 ; CHECK-NEXT:    frintx z2.d, p0/m, z2.d
 ; CHECK-NEXT:    frintx z3.d, p0/m, z3.d
-; CHECK-NEXT:    mov z6.d, x8
+; CHECK-NEXT:    mov z25.d, x8
+; CHECK-NEXT:    mov z7.d, #0x8000000000000000
+; CHECK-NEXT:    mov z24.d, #0x8000000000000000
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, z4.d
 ; CHECK-NEXT:    fcmge p3.d, p0/z, z2.d, z4.d
 ; CHECK-NEXT:    fcmge p4.d, p0/z, z3.d, z4.d
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z0.d
-; CHECK-NEXT:    movprfx z7, z1
-; CHECK-NEXT:    fcvtzs z7.d, p0/m, z1.d
-; CHECK-NEXT:    movprfx z24, z2
-; CHECK-NEXT:    fcvtzs z24.d, p0/m, z2.d
-; CHECK-NEXT:    movprfx z25, z3
-; CHECK-NEXT:    fcvtzs z25.d, p0/m, z3.d
-; CHECK-NEXT:    fcmgt p7.d, p0/z, z2.d, z6.d
-; CHECK-NEXT:    fcmgt p5.d, p0/z, z0.d, z6.d
-; CHECK-NEXT:    fcmgt p6.d, p0/z, z1.d, z6.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    mov z4.d, p1/m, z5.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z3.d, z6.d
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    sel z6.d, p2, z5.d, z7.d
-; CHECK-NEXT:    fcmuo p2.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    sel z7.d, p3, z5.d, z24.d
-; CHECK-NEXT:    fcmuo p3.d, p0/z, z1.d, z1.d
-; CHECK-NEXT:    sel z5.d, p4, z5.d, z25.d
-; CHECK-NEXT:    fcmuo p4.d, p0/z, z2.d, z2.d
+; CHECK-NEXT:    mov z4.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z3.d, z25.d
+; CHECK-NEXT:    fcmuo p6.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    fcvtzs z5.d, p1/m, z0.d
+; CHECK-NEXT:    fcvtzs z6.d, p2/m, z1.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z25.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z1.d, z25.d
+; CHECK-NEXT:    fcvtzs z7.d, p3/m, z2.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z2.d, z25.d
+; CHECK-NEXT:    fcvtzs z24.d, p4/m, z3.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    sel z0.d, p1, z4.d, z5.d
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z2.d, z2.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z3.d, z3.d
-; CHECK-NEXT:    sel z0.d, p5, z26.d, z4.d
-; CHECK-NEXT:    sel z1.d, p6, z26.d, z6.d
-; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z2.d, p7, z26.d, z7.d
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z3.d, p1, z26.d, z5.d
+; CHECK-NEXT:    sel z1.d, p2, z4.d, z6.d
+; CHECK-NEXT:    sel z2.d, p3, z4.d, z7.d
+; CHECK-NEXT:    sel z3.d, p5, z4.d, z24.d
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p4/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1368,7 +1181,7 @@ define <vscale x 16 x iXLen> @lrint_v16f64(<vscale x 16 x double> %x) {
 ; CHECK-LABEL: lrint_v16f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    addvl sp, sp, #-3
 ; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
@@ -1376,109 +1189,93 @@ define <vscale x 16 x iXLen> @lrint_v16f64(<vscale x 16 x double> %x) {
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
-; CHECK-NEXT:    mov z24.d, #0x7fffffffffffffff
-; CHECK-NEXT:    mov z25.d, x8
+; CHECK-NEXT:    mov z26.d, #0x8000000000000000
+; CHECK-NEXT:    mov z24.d, x8
 ; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
-; CHECK-NEXT:    movprfx z26, z0
-; CHECK-NEXT:    frintx z26.d, p0/m, z0.d
-; CHECK-NEXT:    movprfx z27, z1
-; CHECK-NEXT:    frintx z27.d, p0/m, z1.d
+; CHECK-NEXT:    mov z27.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    frintx z1.d, p0/m, z1.d
+; CHECK-NEXT:    movprfx z25, z4
+; CHECK-NEXT:    frintx z25.d, p0/m, z4.d
 ; CHECK-NEXT:    frintx z2.d, p0/m, z2.d
-; CHECK-NEXT:    mov z0.d, #0x8000000000000000
-; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    frintx z3.d, p0/m, z3.d
-; CHECK-NEXT:    movprfx z28, z4
-; CHECK-NEXT:    frintx z28.d, p0/m, z4.d
 ; CHECK-NEXT:    frintx z5.d, p0/m, z5.d
 ; CHECK-NEXT:    frintx z6.d, p0/m, z6.d
+; CHECK-NEXT:    mov z30.d, x8
+; CHECK-NEXT:    mov z4.d, #0x8000000000000000
 ; CHECK-NEXT:    frintx z7.d, p0/m, z7.d
-; CHECK-NEXT:    fcmge p1.d, p0/z, z26.d, z25.d
-; CHECK-NEXT:    fcmge p2.d, p0/z, z27.d, z25.d
-; CHECK-NEXT:    movprfx z4, z26
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z26.d
-; CHECK-NEXT:    fcmge p5.d, p0/z, z2.d, z25.d
-; CHECK-NEXT:    movprfx z29, z27
-; CHECK-NEXT:    fcvtzs z29.d, p0/m, z27.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z26.d, z1.d
-; CHECK-NEXT:    fcmge p6.d, p0/z, z3.d, z25.d
-; CHECK-NEXT:    fcmge p8.d, p0/z, z5.d, z25.d
-; CHECK-NEXT:    fcmgt p7.d, p0/z, z27.d, z1.d
-; CHECK-NEXT:    fcmge p9.d, p0/z, z6.d, z25.d
-; CHECK-NEXT:    movprfx z30, z28
-; CHECK-NEXT:    fcvtzs z30.d, p0/m, z28.d
-; CHECK-NEXT:    fcmge p10.d, p0/z, z7.d, z25.d
-; CHECK-NEXT:    not p4.b, p0/z, p1.b
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z26.d, z26.d
-; CHECK-NEXT:    movprfx z26, z2
-; CHECK-NEXT:    fcvtzs z26.d, p0/m, z2.d
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    movprfx z31, z6
-; CHECK-NEXT:    fcvtzs z31.d, p0/m, z6.d
-; CHECK-NEXT:    movprfx z8, z7
-; CHECK-NEXT:    fcvtzs z8.d, p0/m, z7.d
-; CHECK-NEXT:    mov z4.d, p4/m, z0.d
-; CHECK-NEXT:    fcmge p4.d, p0/z, z28.d, z25.d
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    mov z29.d, p2/m, z0.d
-; CHECK-NEXT:    fcmuo p2.d, p0/z, z27.d, z27.d
-; CHECK-NEXT:    movprfx z27, z3
-; CHECK-NEXT:    fcvtzs z27.d, p0/m, z3.d
-; CHECK-NEXT:    sel z25.d, p5, z0.d, z26.d
-; CHECK-NEXT:    movprfx z26, z5
-; CHECK-NEXT:    fcvtzs z26.d, p0/m, z5.d
-; CHECK-NEXT:    not p6.b, p0/z, p6.b
-; CHECK-NEXT:    not p5.b, p0/z, p8.b
-; CHECK-NEXT:    fcmgt p8.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    mov z27.d, p6/m, z0.d
-; CHECK-NEXT:    not p6.b, p0/z, p9.b
+; CHECK-NEXT:    mov z28.d, #0x8000000000000000
+; CHECK-NEXT:    mov z29.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z24.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, z24.d
+; CHECK-NEXT:    fcmge p5.d, p0/z, z25.d, z24.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z2.d, z24.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z3.d, z24.d
+; CHECK-NEXT:    fcmge p7.d, p0/z, z5.d, z24.d
+; CHECK-NEXT:    mov z31.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p6.d, p0/z, z6.d, z24.d
+; CHECK-NEXT:    mov z8.d, #0x8000000000000000
+; CHECK-NEXT:    mov z9.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p8.d, p0/z, z25.d, z30.d
+; CHECK-NEXT:    fcmgt p10.d, p0/z, z6.d, z30.d
+; CHECK-NEXT:    fcvtzs z26.d, p1/m, z0.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z30.d
+; CHECK-NEXT:    fcvtzs z4.d, p2/m, z1.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z7.d, z24.d
+; CHECK-NEXT:    mov z24.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z27.d, p3/m, z2.d
+; CHECK-NEXT:    fcvtzs z28.d, p4/m, z3.d
+; CHECK-NEXT:    fcvtzs z29.d, p5/m, z25.d
+; CHECK-NEXT:    fcvtzs z31.d, p7/m, z5.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z1.d, z30.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z2.d, z30.d
+; CHECK-NEXT:    fcmgt p7.d, p0/z, z3.d, z30.d
+; CHECK-NEXT:    fcvtzs z8.d, p6/m, z6.d
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    sel z0.d, p1, z9.d, z26.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z5.d, z30.d
+; CHECK-NEXT:    fcvtzs z24.d, p2/m, z7.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z7.d, z30.d
+; CHECK-NEXT:    fcmuo p6.d, p0/z, z1.d, z1.d
 ; CHECK-NEXT:    fcmuo p9.d, p0/z, z2.d, z2.d
-; CHECK-NEXT:    mov z30.d, p4/m, z0.d
-; CHECK-NEXT:    not p4.b, p0/z, p10.b
-; CHECK-NEXT:    fcmgt p10.d, p0/z, z3.d, z1.d
-; CHECK-NEXT:    mov z26.d, p5/m, z0.d
-; CHECK-NEXT:    fcmgt p5.d, p0/z, z28.d, z1.d
-; CHECK-NEXT:    mov z31.d, p6/m, z0.d
-; CHECK-NEXT:    mov z8.d, p4/m, z0.d
-; CHECK-NEXT:    sel z0.d, p3, z24.d, z4.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z5.d, z1.d
-; CHECK-NEXT:    fcmgt p4.d, p0/z, z6.d, z1.d
-; CHECK-NEXT:    fcmgt p6.d, p0/z, z7.d, z1.d
-; CHECK-NEXT:    sel z1.d, p7, z24.d, z29.d
-; CHECK-NEXT:    fcmuo p7.d, p0/z, z3.d, z3.d
-; CHECK-NEXT:    sel z2.d, p8, z24.d, z25.d
-; CHECK-NEXT:    sel z3.d, p10, z24.d, z27.d
-; CHECK-NEXT:    sel z4.d, p5, z24.d, z30.d
-; CHECK-NEXT:    fcmuo p5.d, p0/z, z28.d, z28.d
-; CHECK-NEXT:    fcmuo p8.d, p0/z, z5.d, z5.d
-; CHECK-NEXT:    fcmuo p10.d, p0/z, z6.d, z6.d
-; CHECK-NEXT:    sel z5.d, p3, z24.d, z26.d
+; CHECK-NEXT:    sel z1.d, p4, z9.d, z4.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z3.d, z3.d
+; CHECK-NEXT:    sel z2.d, p5, z9.d, z27.d
+; CHECK-NEXT:    sel z3.d, p7, z9.d, z28.d
+; CHECK-NEXT:    sel z4.d, p8, z9.d, z29.d
+; CHECK-NEXT:    fcmuo p5.d, p0/z, z25.d, z25.d
+; CHECK-NEXT:    fcmuo p7.d, p0/z, z5.d, z5.d
+; CHECK-NEXT:    fcmuo p8.d, p0/z, z6.d, z6.d
+; CHECK-NEXT:    sel z5.d, p1, z9.d, z31.d
+; CHECK-NEXT:    sel z6.d, p10, z9.d, z8.d
+; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z7.d, z7.d
-; CHECK-NEXT:    sel z6.d, p4, z24.d, z31.d
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    sel z7.d, p6, z24.d, z8.d
-; CHECK-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    sel z7.d, p2, z9.d, z24.d
+; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p6/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z2.d, p9/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z3.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    mov z3.d, p4/m, #0 // =0x0
 ; CHECK-NEXT:    mov z4.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z5.d, p8/m, #0 // =0x0
-; CHECK-NEXT:    mov z6.d, p10/m, #0 // =0x0
-; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z5.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    mov z6.d, p8/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    mov z7.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #3
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %a = call <vscale x 16 x iXLen> @llvm.lrint.nxv16iXLen.nxv16f64(<vscale x 16 x double> %x)
@@ -1491,6 +1288,8 @@ define <vscale x 32 x iXLen> @lrint_v32f64(<vscale x 32 x double> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-17
+; CHECK-NEXT:    str p11, [sp] // 2-byte Folded Spill
+; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
@@ -1513,8 +1312,8 @@ define <vscale x 32 x iXLen> @lrint_v32f64(<vscale x 32 x double> %x) {
 ; CHECK-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-3
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
@@ -1527,219 +1326,176 @@ define <vscale x 32 x iXLen> @lrint_v32f64(<vscale x 32 x double> %x) {
 ; CHECK-NEXT:    ldr z0, [x0]
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    ldr z2, [x0, #2, mul vl]
-; CHECK-NEXT:    mov x9, #-4332462841530417152 // =0xc3e0000000000000
-; CHECK-NEXT:    ldr z24, [x0, #6, mul vl]
 ; CHECK-NEXT:    ldr z1, [x0, #1, mul vl]
-; CHECK-NEXT:    mov z7.d, x9
-; CHECK-NEXT:    mov z26.d, #0x8000000000000000
-; CHECK-NEXT:    ldr z3, [x0, #3, mul vl]
+; CHECK-NEXT:    ldr z6, [x0, #4, mul vl]
+; CHECK-NEXT:    mov x9, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT:    ldr z5, [x0, #3, mul vl]
+; CHECK-NEXT:    mov z25.d, x9
+; CHECK-NEXT:    mov z28.d, #0x8000000000000000
 ; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
-; CHECK-NEXT:    movprfx z30, z2
-; CHECK-NEXT:    frintx z30.d, p0/m, z2.d
-; CHECK-NEXT:    ldr z6, [x0, #5, mul vl]
-; CHECK-NEXT:    movprfx z25, z24
-; CHECK-NEXT:    frintx z25.d, p0/m, z24.d
-; CHECK-NEXT:    movprfx z12, z1
-; CHECK-NEXT:    frintx z12.d, p0/m, z1.d
-; CHECK-NEXT:    ldr z5, [x0, #4, mul vl]
-; CHECK-NEXT:    frintx z3.d, p0/m, z3.d
-; CHECK-NEXT:    mov x9, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    movprfx z4, z2
+; CHECK-NEXT:    frintx z4.d, p0/m, z2.d
+; CHECK-NEXT:    mov z27.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z1.d, p0/m, z1.d
 ; CHECK-NEXT:    frintx z6.d, p0/m, z6.d
-; CHECK-NEXT:    mov z4.d, x9
-; CHECK-NEXT:    fcmge p3.d, p0/z, z0.d, z7.d
-; CHECK-NEXT:    movprfx z24, z0
-; CHECK-NEXT:    fcvtzs z24.d, p0/m, z0.d
-; CHECK-NEXT:    fcmge p5.d, p0/z, z30.d, z7.d
-; CHECK-NEXT:    movprfx z28, z30
-; CHECK-NEXT:    fcvtzs z28.d, p0/m, z30.d
-; CHECK-NEXT:    str z0, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z30.d, #0x8000000000000000
 ; CHECK-NEXT:    frintx z5.d, p0/m, z5.d
-; CHECK-NEXT:    fcmge p4.d, p0/z, z12.d, z7.d
-; CHECK-NEXT:    ldr z8, [x0, #7, mul vl]
-; CHECK-NEXT:    ldr z9, [x0, #15, mul vl]
-; CHECK-NEXT:    movprfx z27, z12
-; CHECK-NEXT:    fcvtzs z27.d, p0/m, z12.d
-; CHECK-NEXT:    fcmge p6.d, p0/z, z3.d, z7.d
-; CHECK-NEXT:    fcmge p9.d, p0/z, z6.d, z7.d
-; CHECK-NEXT:    not p7.b, p0/z, p3.b
-; CHECK-NEXT:    movprfx z31, z3
-; CHECK-NEXT:    fcvtzs z31.d, p0/m, z3.d
-; CHECK-NEXT:    movprfx z15, z6
-; CHECK-NEXT:    fcvtzs z15.d, p0/m, z6.d
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    fcmge p8.d, p0/z, z5.d, z7.d
-; CHECK-NEXT:    movprfx z13, z5
-; CHECK-NEXT:    fcvtzs z13.d, p0/m, z5.d
-; CHECK-NEXT:    sel z0.d, p7, z26.d, z24.d
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    movprfx z17, z25
-; CHECK-NEXT:    fcvtzs z17.d, p0/m, z25.d
-; CHECK-NEXT:    not p3.b, p0/z, p6.b
-; CHECK-NEXT:    fcmge p6.d, p0/z, z25.d, z7.d
-; CHECK-NEXT:    movprfx z22, z9
-; CHECK-NEXT:    frintx z22.d, p0/m, z9.d
-; CHECK-NEXT:    sel z29.d, p4, z26.d, z27.d
-; CHECK-NEXT:    movprfx z27, z8
-; CHECK-NEXT:    frintx z27.d, p0/m, z8.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z12.d, z4.d
-; CHECK-NEXT:    str z0, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    sel z0.d, p5, z26.d, z28.d
-; CHECK-NEXT:    not p4.b, p0/z, p8.b
-; CHECK-NEXT:    ldr z10, [x0, #8, mul vl]
-; CHECK-NEXT:    not p5.b, p0/z, p9.b
-; CHECK-NEXT:    sel z24.d, p3, z26.d, z31.d
-; CHECK-NEXT:    not p3.b, p0/z, p6.b
-; CHECK-NEXT:    movprfx z2, z22
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z22.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z30.d, z4.d
+; CHECK-NEXT:    mov z26.d, #0x8000000000000000
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    mov z13.d, #0x8000000000000000
+; CHECK-NEXT:    mov z12.d, #0x8000000000000000
+; CHECK-NEXT:    mov x10, #4890909195324358655 // =0x43dfffffffffffff
 ; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    fcmge p7.d, p0/z, z27.d, z7.d
-; CHECK-NEXT:    sel z31.d, p5, z26.d, z15.d
-; CHECK-NEXT:    ldr z11, [x0, #9, mul vl]
-; CHECK-NEXT:    movprfx z28, z10
-; CHECK-NEXT:    frintx z28.d, p0/m, z10.d
-; CHECK-NEXT:    ldr z10, [x0, #10, mul vl]
-; CHECK-NEXT:    ldr z18, [x0, #11, mul vl]
-; CHECK-NEXT:    ldr z16, [x0, #13, mul vl]
-; CHECK-NEXT:    ldr z14, [x0, #14, mul vl]
-; CHECK-NEXT:    ldr z19, [x0, #12, mul vl]
-; CHECK-NEXT:    mov z17.d, p3/m, z26.d
-; CHECK-NEXT:    fcmgt p9.d, p0/z, z3.d, z4.d
-; CHECK-NEXT:    movprfx z8, z11
-; CHECK-NEXT:    frintx z8.d, p0/m, z11.d
-; CHECK-NEXT:    sel z11.d, p4, z26.d, z13.d
-; CHECK-NEXT:    frintx z10.d, p0/m, z10.d
-; CHECK-NEXT:    movprfx z13, z18
-; CHECK-NEXT:    frintx z13.d, p0/m, z18.d
-; CHECK-NEXT:    fcmge p5.d, p0/z, z28.d, z7.d
-; CHECK-NEXT:    movprfx z18, z27
-; CHECK-NEXT:    fcvtzs z18.d, p0/m, z27.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z4.d, z25.d
+; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z25.d
+; CHECK-NEXT:    ldr z29, [x0, #7, mul vl]
+; CHECK-NEXT:    ldr z24, [x0, #6, mul vl]
+; CHECK-NEXT:    ldr z10, [x0, #9, mul vl]
+; CHECK-NEXT:    ldr z8, [x0, #8, mul vl]
+; CHECK-NEXT:    ldr z7, [x0, #5, mul vl]
+; CHECK-NEXT:    ldr z14, [x0, #15, mul vl]
+; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, z25.d
+; CHECK-NEXT:    fcmge p5.d, p0/z, z6.d, z25.d
+; CHECK-NEXT:    ldr z15, [x0, #14, mul vl]
+; CHECK-NEXT:    frintx z29.d, p0/m, z29.d
+; CHECK-NEXT:    frintx z24.d, p0/m, z24.d
+; CHECK-NEXT:    movprfx z11, z10
+; CHECK-NEXT:    frintx z11.d, p0/m, z10.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z5.d, z25.d
+; CHECK-NEXT:    movprfx z9, z8
+; CHECK-NEXT:    frintx z9.d, p0/m, z8.d
+; CHECK-NEXT:    ldr z16, [x0, #11, mul vl]
+; CHECK-NEXT:    ldr z20, [x0, #13, mul vl]
+; CHECK-NEXT:    frintx z7.d, p0/m, z7.d
+; CHECK-NEXT:    fcvtzs z28.d, p3/m, z4.d
+; CHECK-NEXT:    mov z10.d, #0x8000000000000000
+; CHECK-NEXT:    ldr z18, [x0, #12, mul vl]
+; CHECK-NEXT:    movprfx z19, z14
+; CHECK-NEXT:    frintx z19.d, p0/m, z14.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z29.d, z25.d
+; CHECK-NEXT:    ldr z17, [x0, #10, mul vl]
+; CHECK-NEXT:    frintx z15.d, p0/m, z15.d
+; CHECK-NEXT:    fcvtzs z27.d, p2/m, z1.d
+; CHECK-NEXT:    fcvtzs z30.d, p5/m, z6.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z24.d, z25.d
+; CHECK-NEXT:    fcmge p5.d, p0/z, z11.d, z25.d
+; CHECK-NEXT:    mov z14.d, #0x8000000000000000
 ; CHECK-NEXT:    frintx z16.d, p0/m, z16.d
-; CHECK-NEXT:    movprfx z15, z19
-; CHECK-NEXT:    frintx z15.d, p0/m, z19.d
-; CHECK-NEXT:    movprfx z19, z28
-; CHECK-NEXT:    fcvtzs z19.d, p0/m, z28.d
-; CHECK-NEXT:    movprfx z21, z14
-; CHECK-NEXT:    frintx z21.d, p0/m, z14.d
-; CHECK-NEXT:    not p4.b, p0/z, p7.b
-; CHECK-NEXT:    fcmge p6.d, p0/z, z8.d, z7.d
-; CHECK-NEXT:    movprfx z20, z8
-; CHECK-NEXT:    fcvtzs z20.d, p0/m, z8.d
-; CHECK-NEXT:    fcmge p7.d, p0/z, z10.d, z7.d
-; CHECK-NEXT:    fcmge p8.d, p0/z, z13.d, z7.d
-; CHECK-NEXT:    not p5.b, p0/z, p5.b
-; CHECK-NEXT:    sel z9.d, p4, z26.d, z18.d
-; CHECK-NEXT:    fcmge p4.d, p0/z, z16.d, z7.d
-; CHECK-NEXT:    fcmge p3.d, p0/z, z15.d, z7.d
-; CHECK-NEXT:    movprfx z0, z16
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z16.d
-; CHECK-NEXT:    sel z14.d, p5, z26.d, z19.d
-; CHECK-NEXT:    movprfx z19, z10
-; CHECK-NEXT:    fcvtzs z19.d, p0/m, z10.d
-; CHECK-NEXT:    movprfx z1, z21
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z21.d
-; CHECK-NEXT:    not p6.b, p0/z, p6.b
-; CHECK-NEXT:    movprfx z23, z15
-; CHECK-NEXT:    fcvtzs z23.d, p0/m, z15.d
-; CHECK-NEXT:    not p5.b, p0/z, p7.b
-; CHECK-NEXT:    sel z18.d, p6, z26.d, z20.d
-; CHECK-NEXT:    fcmge p6.d, p0/z, z21.d, z7.d
-; CHECK-NEXT:    not p7.b, p0/z, p8.b
-; CHECK-NEXT:    fcmge p8.d, p0/z, z22.d, z7.d
-; CHECK-NEXT:    movprfx z20, z13
-; CHECK-NEXT:    fcvtzs z20.d, p0/m, z13.d
-; CHECK-NEXT:    not p4.b, p0/z, p4.b
-; CHECK-NEXT:    mov z7.d, #0x7fffffffffffffff
-; CHECK-NEXT:    mov z19.d, p5/m, z26.d
-; CHECK-NEXT:    not p3.b, p0/z, p3.b
-; CHECK-NEXT:    mov z0.d, p4/m, z26.d
-; CHECK-NEXT:    fcmgt p4.d, p0/z, z21.d, z4.d
-; CHECK-NEXT:    not p5.b, p0/z, p6.b
-; CHECK-NEXT:    mov z23.d, p3/m, z26.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z22.d, z4.d
-; CHECK-NEXT:    not p6.b, p0/z, p8.b
-; CHECK-NEXT:    mov z20.d, p7/m, z26.d
-; CHECK-NEXT:    fcmuo p8.d, p0/z, z22.d, z22.d
-; CHECK-NEXT:    mov z1.d, p5/m, z26.d
-; CHECK-NEXT:    fcmuo p5.d, p0/z, z21.d, z21.d
-; CHECK-NEXT:    fcmgt p7.d, p0/z, z25.d, z4.d
-; CHECK-NEXT:    mov z2.d, p6/m, z26.d
-; CHECK-NEXT:    sel z26.d, p1, z7.d, z29.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z16.d, z4.d
-; CHECK-NEXT:    ldr z29, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    fcmgt p6.d, p0/z, z5.d, z4.d
-; CHECK-NEXT:    mov z24.d, p9/m, z7.d
-; CHECK-NEXT:    mov z1.d, p4/m, z7.d
-; CHECK-NEXT:    fcmuo p4.d, p0/z, z16.d, z16.d
-; CHECK-NEXT:    mov z2.d, p3/m, z7.d
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z15.d, z4.d
-; CHECK-NEXT:    mov z17.d, p7/m, z7.d
-; CHECK-NEXT:    mov z29.d, p2/m, z7.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z13.d, z4.d
-; CHECK-NEXT:    mov z0.d, p1/m, z7.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z10.d, z4.d
-; CHECK-NEXT:    mov z1.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    mov z11.d, p6/m, z7.d
+; CHECK-NEXT:    frintx z20.d, p0/m, z20.d
+; CHECK-NEXT:    fcvtzs z26.d, p4/m, z5.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z9.d, z25.d
+; CHECK-NEXT:    frintx z18.d, p0/m, z18.d
+; CHECK-NEXT:    mov z31.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z2.d, p1/m, z0.d
+; CHECK-NEXT:    fcmge p1.d, p0/z, z7.d, z25.d
+; CHECK-NEXT:    mov z8.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z17.d, p0/m, z17.d
+; CHECK-NEXT:    fcvtzs z10.d, p3/m, z29.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z19.d, z25.d
+; CHECK-NEXT:    mov z3.d, x10
+; CHECK-NEXT:    fcmge p6.d, p0/z, z15.d, z25.d
+; CHECK-NEXT:    mov z21.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z13.d, p2/m, z24.d
+; CHECK-NEXT:    fcvtzs z14.d, p5/m, z11.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z16.d, z25.d
+; CHECK-NEXT:    mov z22.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p5.d, p0/z, z20.d, z25.d
+; CHECK-NEXT:    mov z0.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z12.d, p4/m, z9.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z18.d, z25.d
+; CHECK-NEXT:    mov z23.d, #0x8000000000000000
+; CHECK-NEXT:    fcvtzs z8.d, p1/m, z7.d
+; CHECK-NEXT:    fcmge p1.d, p0/z, z17.d, z25.d
+; CHECK-NEXT:    fcvtzs z31.d, p3/m, z19.d
+; CHECK-NEXT:    mov z25.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p11.d, p0/z, z19.d, z3.d
+; CHECK-NEXT:    fcvtzs z21.d, p6/m, z15.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z15.d, z3.d
 ; CHECK-NEXT:    fcmuo p6.d, p0/z, z15.d, z15.d
-; CHECK-NEXT:    fcmgt p5.d, p0/z, z8.d, z4.d
-; CHECK-NEXT:    mov z2.d, p8/m, #0 // =0x0
-; CHECK-NEXT:    sel z16.d, p3, z7.d, z23.d
-; CHECK-NEXT:    fcmuo p3.d, p0/z, z10.d, z10.d
-; CHECK-NEXT:    mov z0.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    sel z15.d, p2, z7.d, z20.d
-; CHECK-NEXT:    fcmuo p2.d, p0/z, z13.d, z13.d
-; CHECK-NEXT:    str z1, [x8, #14, mul vl]
-; CHECK-NEXT:    sel z1.d, p1, z7.d, z19.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z28.d, z4.d
-; CHECK-NEXT:    fcmgt p4.d, p0/z, z27.d, z4.d
-; CHECK-NEXT:    str z2, [x8, #15, mul vl]
-; CHECK-NEXT:    sel z2.d, p5, z7.d, z18.d
-; CHECK-NEXT:    mov z16.d, p6/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p5.d, p0/z, z8.d, z8.d
+; CHECK-NEXT:    mov z15.d, #0x8000000000000000
+; CHECK-NEXT:    fcmgt p7.d, p0/z, z1.d, z3.d
+; CHECK-NEXT:    fcvtzs z22.d, p2/m, z16.d
+; CHECK-NEXT:    fcvtzs z0.d, p5/m, z20.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z24.d, z3.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z20.d, z3.d
+; CHECK-NEXT:    fcvtzs z23.d, p4/m, z18.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z19.d, z19.d
+; CHECK-NEXT:    mov z31.d, p11/m, z25.d
+; CHECK-NEXT:    sel z19.d, p3, z25.d, z21.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z18.d, z3.d
+; CHECK-NEXT:    fcvtzs z15.d, p1/m, z17.d
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z20.d, z20.d
+; CHECK-NEXT:    mov z27.d, p7/m, z25.d
+; CHECK-NEXT:    fcmgt p7.d, p0/z, z16.d, z3.d
+; CHECK-NEXT:    mov z13.d, p2/m, z25.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z17.d, z3.d
+; CHECK-NEXT:    mov z0.d, p5/m, z25.d
+; CHECK-NEXT:    mov z31.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z18.d, z18.d
+; CHECK-NEXT:    sel z20.d, p3, z25.d, z23.d
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z16.d, z16.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z11.d, z3.d
+; CHECK-NEXT:    mov z19.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p10.d, p0/z, z6.d, z3.d
+; CHECK-NEXT:    fcmgt p8.d, p0/z, z4.d, z3.d
+; CHECK-NEXT:    str z31, [x8, #15, mul vl]
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z17.d, z17.d
+; CHECK-NEXT:    sel z18.d, p7, z25.d, z22.d
+; CHECK-NEXT:    sel z31.d, p2, z25.d, z15.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z9.d, z3.d
+; CHECK-NEXT:    str z19, [x8, #14, mul vl]
+; CHECK-NEXT:    mov z20.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z11.d, z11.d
 ; CHECK-NEXT:    str z0, [x8, #13, mul vl]
-; CHECK-NEXT:    mov z15.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p2.d, p0/z, z28.d, z28.d
-; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p3.d, p0/z, z6.d, z4.d
-; CHECK-NEXT:    sel z0.d, p1, z7.d, z14.d
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z27.d, z27.d
-; CHECK-NEXT:    sel z27.d, p4, z7.d, z9.d
-; CHECK-NEXT:    str z16, [x8, #12, mul vl]
-; CHECK-NEXT:    fcmuo p4.d, p0/z, z25.d, z25.d
-; CHECK-NEXT:    str z15, [x8, #11, mul vl]
-; CHECK-NEXT:    mov z2.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p5.d, p0/z, z6.d, z6.d
-; CHECK-NEXT:    str z1, [x8, #10, mul vl]
-; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    sel z1.d, p3, z7.d, z31.d
-; CHECK-NEXT:    fcmuo p3.d, p0/z, z5.d, z5.d
-; CHECK-NEXT:    ldr z5, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    mov z27.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    str z2, [x8, #9, mul vl]
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z3.d, z3.d
+; CHECK-NEXT:    mov z14.d, p5/m, z25.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z29.d, z3.d
+; CHECK-NEXT:    mov z18.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z31.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z7.d, z3.d
+; CHECK-NEXT:    str z20, [x8, #12, mul vl]
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z9.d, z9.d
+; CHECK-NEXT:    sel z0.d, p2, z25.d, z12.d
+; CHECK-NEXT:    mov z14.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z7.d, z7.d
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z29.d, z29.d
+; CHECK-NEXT:    str z18, [x8, #11, mul vl]
+; CHECK-NEXT:    sel z29.d, p5, z25.d, z10.d
+; CHECK-NEXT:    fcmuo p5.d, p0/z, z24.d, z24.d
+; CHECK-NEXT:    str z31, [x8, #10, mul vl]
+; CHECK-NEXT:    sel z7.d, p1, z25.d, z8.d
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z6.d, z6.d
+; CHECK-NEXT:    ldr z6, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    str z14, [x8, #9, mul vl]
+; CHECK-NEXT:    fcmgt p9.d, p0/z, z5.d, z3.d
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z29.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z5.d, z5.d
+; CHECK-NEXT:    mov z13.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p5.d, p0/z, z4.d, z4.d
+; CHECK-NEXT:    mov z7.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z6.d, z3.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    fcmuo p0.d, p0/z, z6.d, z6.d
 ; CHECK-NEXT:    str z0, [x8, #8, mul vl]
-; CHECK-NEXT:    mov z17.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p4.d, p0/z, z30.d, z30.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z5.d, z4.d
-; CHECK-NEXT:    mov z1.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p5.d, p0/z, z12.d, z12.d
-; CHECK-NEXT:    str z27, [x8, #7, mul vl]
-; CHECK-NEXT:    fcmuo p0.d, p0/z, z5.d, z5.d
-; CHECK-NEXT:    mov z11.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT:    mov z24.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    str z17, [x8, #6, mul vl]
-; CHECK-NEXT:    mov z29.d, p4/m, #0 // =0x0
-; CHECK-NEXT:    str z1, [x8, #5, mul vl]
-; CHECK-NEXT:    mov z26.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    str z11, [x8, #4, mul vl]
-; CHECK-NEXT:    mov z0.d, p2/m, z7.d
-; CHECK-NEXT:    str z24, [x8, #3, mul vl]
-; CHECK-NEXT:    str z29, [x8, #2, mul vl]
-; CHECK-NEXT:    str z26, [x8, #1, mul vl]
+; CHECK-NEXT:    mov z28.d, p8/m, z25.d
+; CHECK-NEXT:    mov z26.d, p9/m, z25.d
+; CHECK-NEXT:    str z29, [x8, #7, mul vl]
+; CHECK-NEXT:    mov z30.d, p10/m, z25.d
+; CHECK-NEXT:    str z13, [x8, #6, mul vl]
+; CHECK-NEXT:    str z7, [x8, #5, mul vl]
+; CHECK-NEXT:    sel z0.d, p3, z25.d, z2.d
+; CHECK-NEXT:    mov z26.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z30.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z28.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    mov z27.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    str z26, [x8, #3, mul vl]
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    str z30, [x8, #4, mul vl]
+; CHECK-NEXT:    str z28, [x8, #2, mul vl]
+; CHECK-NEXT:    str z27, [x8, #1, mul vl]
 ; CHECK-NEXT:    str z0, [x8]
-; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1756,6 +1512,8 @@ define <vscale x 32 x iXLen> @lrint_v32f64(<vscale x 32 x double> %x) {
 ; CHECK-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll
index bbc94f568dd0a..0c0762da5bba2 100644
--- a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll
@@ -989,9 +989,9 @@ define <vscale x 4 x float> @fadd_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fadd_nxv4f32_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fadd z1.s, z0.s, z1.s
+; CHECK-NEXT:    fcmle p0.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1004,9 +1004,9 @@ define <vscale x 8 x half> @fadd_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-LABEL: fadd_nxv8f16_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    fadd z1.h, z0.h, z1.h
+; CHECK-NEXT:    fcmle p0.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1019,9 +1019,9 @@ define <vscale x 2 x double> @fadd_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fadd_nxv2f64_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    fadd z1.d, z0.d, z1.d
+; CHECK-NEXT:    fcmle p0.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1034,9 +1034,9 @@ define <vscale x 4 x float> @fsub_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fsub_nxv4f32_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fsub z1.s, z0.s, z1.s
+; CHECK-NEXT:    fcmle p0.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1049,9 +1049,9 @@ define <vscale x 8 x half> @fsub_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-LABEL: fsub_nxv8f16_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    fsub z1.h, z0.h, z1.h
+; CHECK-NEXT:    fcmle p0.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1064,9 +1064,9 @@ define <vscale x 2 x double> @fsub_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fsub_nxv2f64_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    fsub z1.d, z0.d, z1.d
+; CHECK-NEXT:    fcmle p0.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1079,9 +1079,9 @@ define <vscale x 4 x float> @fmul_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fmul_nxv4f32_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fmul z1.s, z0.s, z1.s
+; CHECK-NEXT:    fcmle p0.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1094,9 +1094,9 @@ define <vscale x 8 x half> @fmul_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-LABEL: fmul_nxv8f16_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    fmul z1.h, z0.h, z1.h
+; CHECK-NEXT:    fcmle p0.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1109,9 +1109,9 @@ define <vscale x 2 x double> @fmul_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fmul_nxv2f64_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    fmul z1.d, z0.d, z1.d
+; CHECK-NEXT:    fcmle p0.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1125,9 +1125,8 @@ define <vscale x 4 x float> @fdiv_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fdivr z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    mov z0.s, p0/m, z1.s
+; CHECK-NEXT:    fcmle p0.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1141,9 +1140,8 @@ define <vscale x 8 x half> @fdiv_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fdivr z1.h, p0/m, z1.h, z0.h
-; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
+; CHECK-NEXT:    fcmle p0.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1157,9 +1155,8 @@ define <vscale x 2 x double> @fdiv_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fdivr z1.d, p0/m, z1.d, z0.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    mov z0.d, p0/m, z1.d
+; CHECK-NEXT:    fcmle p0.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1173,8 +1170,8 @@ define <vscale x 4 x float> @minnum_nxv4f32_x(<vscale x 4 x float> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z0.s
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1188,8 +1185,8 @@ define <vscale x 8 x half> @minnum_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z0.h
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1203,8 +1200,8 @@ define <vscale x 2 x double> @minnum_nxv2f64_x(<vscale x 2 x double> %x, <vscale
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1218,8 +1215,8 @@ define <vscale x 4 x float> @maxnum_nxv4f32_x(<vscale x 4 x float> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fmaxnm z1.s, p0/m, z1.s, z0.s
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1233,8 +1230,8 @@ define <vscale x 8 x half> @maxnum_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    fmaxnm z1.h, p0/m, z1.h, z0.h
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1248,8 +1245,8 @@ define <vscale x 2 x double> @maxnum_nxv2f64_x(<vscale x 2 x double> %x, <vscale
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    fmaxnm z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1263,8 +1260,8 @@ define <vscale x 4 x float> @minimum_nxv4f32_x(<vscale x 4 x float> %x, <vscale
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fmin z1.s, p0/m, z1.s, z0.s
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1278,8 +1275,8 @@ define <vscale x 8 x half> @minimum_nxv8f16_x(<vscale x 8 x half> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    fmin z1.h, p0/m, z1.h, z0.h
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1293,8 +1290,8 @@ define <vscale x 2 x double> @minimum_nxv2f64_x(<vscale x 2 x double> %x, <vscal
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    fmin z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1308,8 +1305,8 @@ define <vscale x 4 x float> @maximum_nxv4f32_x(<vscale x 4 x float> %x, <vscale
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fmax z1.s, p0/m, z1.s, z0.s
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1323,8 +1320,8 @@ define <vscale x 8 x half> @maximum_nxv8f16_x(<vscale x 8 x half> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    fmax z1.h, p0/m, z1.h, z0.h
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1338,8 +1335,8 @@ define <vscale x 2 x double> @maximum_nxv2f64_x(<vscale x 2 x double> %x, <vscal
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    fmax z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1353,8 +1350,8 @@ define <vscale x 4 x float> @fmai_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z3.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    fmad z1.s, p0/m, z2.s, z0.s
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1368,8 +1365,8 @@ define <vscale x 8 x half> @fmai_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z3.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    fmad z1.h, p0/m, z2.h, z0.h
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1383,8 +1380,8 @@ define <vscale x 2 x double> @fmai_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z3.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    fmad z1.d, p0/m, z2.d, z0.d
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1398,8 +1395,8 @@ define <vscale x 4 x float> @fma_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z3.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    fmad z1.s, p0/m, z2.s, z0.s
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1414,8 +1411,8 @@ define <vscale x 8 x half> @fma_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z3.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    fmad z1.h, p0/m, z2.h, z0.h
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1430,8 +1427,8 @@ define <vscale x 2 x double> @fma_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z3.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    fmad z1.d, p0/m, z2.d, z0.d
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -2470,9 +2467,8 @@ define <vscale x 4 x float> @fadd_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fadd z0.s, z0.s, z1.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    fcmle p0.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -2486,9 +2482,8 @@ define <vscale x 8 x half> @fadd_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fadd z0.h, z0.h, z1.h
-; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    fcmle p0.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -2502,9 +2497,8 @@ define <vscale x 2 x double> @fadd_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fadd z0.d, z0.d, z1.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    fcmle p0.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -2517,10 +2511,9 @@ define <vscale x 4 x float> @fsub_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fsub_nxv4f32_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fsubr z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fsub z0.s, z0.s, z1.s
+; CHECK-NEXT:    fcmle p0.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -2533,10 +2526,9 @@ define <vscale x 8 x half> @fsub_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-LABEL: fsub_nxv8f16_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fsubr z1.h, p0/m, z1.h, z0.h
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fsub z0.h, z0.h, z1.h
+; CHECK-NEXT:    fcmle p0.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -2549,10 +2541,9 @@ define <vscale x 2 x double> @fsub_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fsub_nxv2f64_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fsubr z1.d, p0/m, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fsub z0.d, z0.d, z1.d
+; CHECK-NEXT:    fcmle p0.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -2566,9 +2557,8 @@ define <vscale x 4 x float> @fmul_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fmul z0.s, z0.s, z1.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    fcmle p0.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -2582,9 +2572,8 @@ define <vscale x 8 x half> @fmul_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fmul z0.h, z0.h, z1.h
-; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    fcmle p0.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -2598,9 +2587,8 @@ define <vscale x 2 x double> @fmul_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fmul z0.d, z0.d, z1.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    fcmle p0.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -2614,9 +2602,8 @@ define <vscale x 4 x float> @fdiv_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    fcmle p0.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -2630,9 +2617,8 @@ define <vscale x 8 x half> @fdiv_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    fcmle p0.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -2646,9 +2632,8 @@ define <vscale x 2 x double> @fdiv_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    fcmle p0.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -2662,9 +2647,8 @@ define <vscale x 4 x float> @minnum_nxv4f32_y(<vscale x 4 x float> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -2678,9 +2662,8 @@ define <vscale x 8 x half> @minnum_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z0.h
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -2694,9 +2677,8 @@ define <vscale x 2 x double> @minnum_nxv2f64_y(<vscale x 2 x double> %x, <vscale
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -2710,9 +2692,8 @@ define <vscale x 4 x float> @maxnum_nxv4f32_y(<vscale x 4 x float> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmaxnm z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -2726,9 +2707,8 @@ define <vscale x 8 x half> @maxnum_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmaxnm z1.h, p0/m, z1.h, z0.h
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -2742,9 +2722,8 @@ define <vscale x 2 x double> @maxnum_nxv2f64_y(<vscale x 2 x double> %x, <vscale
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmaxnm z1.d, p0/m, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -2758,9 +2737,8 @@ define <vscale x 4 x float> @minimum_nxv4f32_y(<vscale x 4 x float> %x, <vscale
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmin z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -2774,9 +2752,8 @@ define <vscale x 8 x half> @minimum_nxv8f16_y(<vscale x 8 x half> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmin z1.h, p0/m, z1.h, z0.h
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -2790,9 +2767,8 @@ define <vscale x 2 x double> @minimum_nxv2f64_y(<vscale x 2 x double> %x, <vscal
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmin z1.d, p0/m, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fmin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -2806,9 +2782,8 @@ define <vscale x 4 x float> @maximum_nxv4f32_y(<vscale x 4 x float> %x, <vscale
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmax z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -2822,9 +2797,8 @@ define <vscale x 8 x half> @maximum_nxv8f16_y(<vscale x 8 x half> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmax z1.h, p0/m, z1.h, z0.h
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -2838,9 +2812,8 @@ define <vscale x 2 x double> @maximum_nxv2f64_y(<vscale x 2 x double> %x, <vscal
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmax z1.d, p0/m, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fmax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -2855,8 +2828,7 @@ define <vscale x 4 x float> @fmai_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z3.s, #0.0
 ; CHECK-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -2871,8 +2843,7 @@ define <vscale x 8 x half> @fmai_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z3.h, #0.0
 ; CHECK-NEXT:    fmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -2887,8 +2858,7 @@ define <vscale x 2 x double> @fmai_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z3.d, #0.0
 ; CHECK-NEXT:    fmla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -2903,8 +2873,7 @@ define <vscale x 4 x float> @fma_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z3.s, #0.0
 ; CHECK-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -2920,8 +2889,7 @@ define <vscale x 8 x half> @fma_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z3.h, #0.0
 ; CHECK-NEXT:    fmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -2937,8 +2905,7 @@ define <vscale x 2 x double> @fma_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z3.d, #0.0
 ; CHECK-NEXT:    fmla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll
index 66dece82a0ac5..58d6149b94d3a 100644
--- a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll
@@ -641,9 +641,9 @@ define <vscale x 4 x float> @fadd_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fadd_nxv4f32_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fadd z1.s, z0.s, z1.s
+; CHECK-NEXT:    fcmle p0.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -656,9 +656,9 @@ define <vscale x 8 x half> @fadd_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-LABEL: fadd_nxv8f16_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    fadd z1.h, z0.h, z1.h
+; CHECK-NEXT:    fcmle p0.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -671,9 +671,9 @@ define <vscale x 2 x double> @fadd_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fadd_nxv2f64_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    fadd z1.d, z0.d, z1.d
+; CHECK-NEXT:    fcmle p0.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -686,9 +686,9 @@ define <vscale x 4 x float> @fsub_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fsub_nxv4f32_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fsub z1.s, z0.s, z1.s
+; CHECK-NEXT:    fcmle p0.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -701,9 +701,9 @@ define <vscale x 8 x half> @fsub_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-LABEL: fsub_nxv8f16_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    fsub z1.h, z0.h, z1.h
+; CHECK-NEXT:    fcmle p0.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -716,9 +716,9 @@ define <vscale x 2 x double> @fsub_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fsub_nxv2f64_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    fsub z1.d, z0.d, z1.d
+; CHECK-NEXT:    fcmle p0.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -731,9 +731,9 @@ define <vscale x 4 x float> @fmul_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fmul_nxv4f32_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fmul z1.s, z0.s, z1.s
+; CHECK-NEXT:    fcmle p0.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -746,9 +746,9 @@ define <vscale x 8 x half> @fmul_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-LABEL: fmul_nxv8f16_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    fmul z1.h, z0.h, z1.h
+; CHECK-NEXT:    fcmle p0.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -761,9 +761,9 @@ define <vscale x 2 x double> @fmul_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fmul_nxv2f64_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    fmul z1.d, z0.d, z1.d
+; CHECK-NEXT:    fcmle p0.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -777,9 +777,8 @@ define <vscale x 4 x float> @fdiv_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fdivr z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    mov z0.s, p0/m, z1.s
+; CHECK-NEXT:    fcmle p0.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -793,9 +792,8 @@ define <vscale x 8 x half> @fdiv_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fdivr z1.h, p0/m, z1.h, z0.h
-; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
+; CHECK-NEXT:    fcmle p0.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -809,9 +807,8 @@ define <vscale x 2 x double> @fdiv_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fdivr z1.d, p0/m, z1.d, z0.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    mov z0.d, p0/m, z1.d
+; CHECK-NEXT:    fcmle p0.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -825,8 +822,8 @@ define <vscale x 4 x float> @fma_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z3.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    fmad z1.s, p0/m, z2.s, z0.s
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -841,8 +838,8 @@ define <vscale x 8 x half> @fma_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z3.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    fmad z1.h, p0/m, z2.h, z0.h
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -857,8 +854,8 @@ define <vscale x 2 x double> @fma_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z3.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    fmad z1.d, p0/m, z2.d, z0.d
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1540,10 +1537,9 @@ define <vscale x 4 x float> @fadd_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fadd_nxv4f32_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fadd z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fadd z0.s, z1.s, z0.s
+; CHECK-NEXT:    fcmle p0.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1556,10 +1552,9 @@ define <vscale x 8 x half> @fadd_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-LABEL: fadd_nxv8f16_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fadd z1.h, p0/m, z1.h, z0.h
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fadd z0.h, z1.h, z0.h
+; CHECK-NEXT:    fcmle p0.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1572,10 +1567,9 @@ define <vscale x 2 x double> @fadd_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fadd_nxv2f64_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fadd z1.d, p0/m, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fadd z0.d, z1.d, z0.d
+; CHECK-NEXT:    fcmle p0.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1588,10 +1582,9 @@ define <vscale x 4 x float> @fsub_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fsub_nxv4f32_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fsubr z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fsub z0.s, z0.s, z1.s
+; CHECK-NEXT:    fcmle p0.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1604,10 +1597,9 @@ define <vscale x 8 x half> @fsub_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-LABEL: fsub_nxv8f16_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fsubr z1.h, p0/m, z1.h, z0.h
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fsub z0.h, z0.h, z1.h
+; CHECK-NEXT:    fcmle p0.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1620,10 +1612,9 @@ define <vscale x 2 x double> @fsub_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fsub_nxv2f64_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fsubr z1.d, p0/m, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fsub z0.d, z0.d, z1.d
+; CHECK-NEXT:    fcmle p0.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1636,10 +1627,9 @@ define <vscale x 4 x float> @fmul_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fmul_nxv4f32_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fmul z0.s, z1.s, z0.s
+; CHECK-NEXT:    fcmle p0.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1652,10 +1642,9 @@ define <vscale x 8 x half> @fmul_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-LABEL: fmul_nxv8f16_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z0.h
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fmul z0.h, z1.h, z0.h
+; CHECK-NEXT:    fcmle p0.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1668,10 +1657,9 @@ define <vscale x 2 x double> @fmul_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fmul_nxv2f64_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    fmul z1.d, p0/m, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fmul z0.d, z1.d, z0.d
+; CHECK-NEXT:    fcmle p0.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1685,9 +1673,8 @@ define <vscale x 4 x float> @fdiv_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    fcmle p0.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1701,9 +1688,8 @@ define <vscale x 8 x half> @fdiv_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    fcmle p0.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1717,9 +1703,8 @@ define <vscale x 2 x double> @fdiv_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    fcmle p0.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1734,8 +1719,7 @@ define <vscale x 4 x float> @fmai_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z3.s, #0.0
 ; CHECK-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1750,8 +1734,7 @@ define <vscale x 8 x half> @fmai_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z3.h, #0.0
 ; CHECK-NEXT:    fmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1766,8 +1749,7 @@ define <vscale x 2 x double> @fmai_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z3.d, #0.0
 ; CHECK-NEXT:    fmla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1782,8 +1764,7 @@ define <vscale x 4 x float> @fma_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z3.s, #0.0
 ; CHECK-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1799,8 +1780,7 @@ define <vscale x 8 x half> @fma_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z3.h, #0.0
 ; CHECK-NEXT:    fmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1816,8 +1796,7 @@ define <vscale x 2 x double> @fma_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z3.d, #0.0
 ; CHECK-NEXT:    fmla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, p1/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll
index 22956f8fe3551..9d3fe3a90b463 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll
@@ -47,9 +47,9 @@ define <4 x i32> @select_addsub_v4i32(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b)
 define <4 x i32> @select_addsub_v4i32_select_swapped(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: select_addsub_v4i32_select_swapped:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT:    vmnot.m v0, v0
-; CHECK-NEXT:    vrsub.vi v9, v9, 0, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
+; CHECK-NEXT:    vmerge.vvm v9, v10, v9, v0
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %sub = sub <4 x i32> %a, %b
@@ -74,9 +74,9 @@ define <4 x i32> @select_addsub_v4i32_add_swapped(<4 x i1> %cc, <4 x i32> %a, <4
 define <4 x i32> @select_addsub_v4i32_both_swapped(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: select_addsub_v4i32_both_swapped:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT:    vmnot.m v0, v0
-; CHECK-NEXT:    vrsub.vi v9, v9, 0, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
+; CHECK-NEXT:    vmerge.vvm v9, v10, v9, v0
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %sub = sub <4 x i32> %a, %b
diff --git a/llvm/test/CodeGen/X86/pr78897.ll b/llvm/test/CodeGen/X86/pr78897.ll
index 0caa569107c0c..4613c2bcdcaf4 100644
--- a/llvm/test/CodeGen/X86/pr78897.ll
+++ b/llvm/test/CodeGen/X86/pr78897.ll
@@ -225,9 +225,9 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X86-AVX512-NEXT:    pushl %esi
 ; X86-AVX512-NEXT:    vpbroadcastb {{[0-9]+}}(%esp), %xmm0
 ; X86-AVX512-NEXT:    vmovd %xmm0, %eax
-; X86-AVX512-NEXT:    kmovd %eax, %k0
-; X86-AVX512-NEXT:    knotw %k0, %k1
-; X86-AVX512-NEXT:    vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u]
+; X86-AVX512-NEXT:    kmovd %eax, %k1
+; X86-AVX512-NEXT:    knotw %k1, %k2
+; X86-AVX512-NEXT:    vmovdqu8 {{.*#+}} xmm0 {%k2} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u]
 ; X86-AVX512-NEXT:    vpextrd $1, %xmm0, %eax
 ; X86-AVX512-NEXT:    vmovd %xmm0, %edx
 ; X86-AVX512-NEXT:    movl $286331152, %ecx # imm = 0x11111110
@@ -247,9 +247,9 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X86-AVX512-NEXT:    addl %edx, %eax
 ; X86-AVX512-NEXT:    vmovd %esi, %xmm1
 ; X86-AVX512-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X86-AVX512-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
-; X86-AVX512-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; X86-AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X86-AVX512-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
+; X86-AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; X86-AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; X86-AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
 ; X86-AVX512-NEXT:    popl %esi
 ; X86-AVX512-NEXT:    popl %edi
@@ -258,9 +258,9 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ;
 ; X64-AVX512-LABEL: produceShuffleVectorForByte:
 ; X64-AVX512:       # %bb.0: # %entry
-; X64-AVX512-NEXT:    kmovd %edi, %k0
-; X64-AVX512-NEXT:    knotw %k0, %k1
-; X64-AVX512-NEXT:    vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u]
+; X64-AVX512-NEXT:    kmovd %edi, %k1
+; X64-AVX512-NEXT:    knotw %k1, %k2
+; X64-AVX512-NEXT:    vmovdqu8 {{.*#+}} xmm0 {%k2} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u]
 ; X64-AVX512-NEXT:    vmovq %xmm0, %rax
 ; X64-AVX512-NEXT:    movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110
 ; X64-AVX512-NEXT:    movabsq $76861433640456465, %rdx # imm = 0x111111111111111
@@ -269,9 +269,9 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X64-AVX512-NEXT:    vmovq %rax, %xmm0
 ; X64-AVX512-NEXT:    imulq %rcx, %rdx
 ; X64-AVX512-NEXT:    vmovq %rdx, %xmm1
-; X64-AVX512-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
-; X64-AVX512-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; X64-AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X64-AVX512-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
+; X64-AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; X64-AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    retq
 entry:


