[llvm] [AArch64] Improve lowering of truncating build vectors (PR #81960)

Usman Nadeem via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 15 18:10:58 PST 2024


https://github.com/UsmanNadeem created https://github.com/llvm/llvm-project/pull/81960

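1. Look through AssertZext/AssertSext nodes that feed the extract_vector_elt
   operands of a BUILD_VECTOR.
2. Generalize `ReconstructTruncateFromBuildVector` so that it also handles
   v8i16, v8i8 and v4i16 truncating build vectors, not just v16i8.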

Change-Id: I717a7471986ea4961c71df62912f8dd6f1723118


>From 9cf3d45e33e7457115c03aec2f810cb5ee71c5c3 Mon Sep 17 00:00:00 2001
From: "Nadeem, Usman" <mnadeem at quicinc.com>
Date: Wed, 14 Feb 2024 17:24:51 -0800
Subject: [PATCH] [AArch64] Improve lowering of truncating build vectors

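1. Look through AssertZext/AssertSext nodes that feed the extract_vector_elt
   operands of a BUILD_VECTOR.
2. Generalize `ReconstructTruncateFromBuildVector` so that it also handles
   v8i16, v8i8 and v4i16 truncating build vectors, not just v16i8.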

Change-Id: I717a7471986ea4961c71df62912f8dd6f1723118
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 164 ++++--
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |   1 +
 .../CodeGen/AArch64/arm64-convert-v4f64.ll    |  21 +-
 .../CodeGen/AArch64/fp-conversion-to-tbl.ll   |   5 +-
 llvm/test/CodeGen/AArch64/fptoi.ll            | 482 +++++----------
 .../test/CodeGen/AArch64/fptosi-sat-vector.ll | 553 +++++++++---------
 .../test/CodeGen/AArch64/fptoui-sat-vector.ll | 427 +++++++-------
 .../CodeGen/AArch64/neon-extracttruncate.ll   |  16 +-
 llvm/test/CodeGen/AArch64/shuffle-tbl34.ll    |  52 +-
 llvm/test/CodeGen/AArch64/trunc-v1i64.ll      |   2 +-
 llvm/test/CodeGen/AArch64/vcvt-oversize.ll    |   5 +-
 11 files changed, 774 insertions(+), 954 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8c5a4cdae11634..353509a1c1efa9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11369,54 +11369,105 @@ static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
   return true;
 }
 
-// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
-// v4i32s. This is really a truncate, which we can construct out of (legal)
-// concats and truncate nodes.
-static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
-  if (V.getValueType() != MVT::v16i8)
-    return SDValue();
-  assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
-
-  for (unsigned X = 0; X < 4; X++) {
-    // Check the first item in each group is an extract from lane 0 of a v4i32
-    // or v4i16.
-    SDValue BaseExt = V.getOperand(X * 4);
-    if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-        (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
-         BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
-        !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
-        BaseExt.getConstantOperandVal(1) != 0)
+// Detect truncating BVs like a0,a1,a2,a3,b0,b1,b2,b3,... and rebuild them from
+// (legal) truncate/concat nodes. NOTE(review): lanes > 0 are not checked to
+// come from the same vector as lane 0 of their group -- confirm the caller.
+static SDValue ReconstructTruncateFromBuildVector(SDValue V,
+                                                  SelectionDAG &DAG) {
+  EVT BVTy = V.getValueType();
+  if (BVTy != MVT::v16i8 && BVTy != MVT::v8i16 && BVTy != MVT::v8i8 &&
+      BVTy != MVT::v4i16)
+    return SDValue();
+
+  // Only handle truncating BVs: operand 0 must not be exactly lane-sized.
+  if (V.getOperand(0).getValueType().getSizeInBits() ==
+      BVTy.getScalarSizeInBits())
+    return SDValue();
+
+  SmallVector<SDValue, 4> Sources;
+  uint64_t LastIdx = 0;
+  uint64_t MaxIdx = 0;
+  // Lane indices must run 0, 1, 2, ... per source and then restart at 0.
+  for (SDValue Extr : V->ops()) {
+    SDValue SourceVec = Extr.getOperand(0);
+    EVT SourceVecTy = SourceVec.getValueType();
+
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(SourceVecTy))
       return SDValue();
-    SDValue Base = BaseExt.getOperand(0);
-    // And check the other items are extracts from the same vector.
-    for (unsigned Y = 1; Y < 4; Y++) {
-      SDValue Ext = V.getOperand(X * 4 + Y);
-      if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-          Ext.getOperand(0) != Base ||
-          !isa<ConstantSDNode>(Ext.getOperand(1)) ||
-          Ext.getConstantOperandVal(1) != Y)
+    if (!isa<ConstantSDNode>(Extr.getOperand(1)))
+      return SDValue();
+
+    uint64_t CurIdx = Extr.getConstantOperandVal(1);
+    // Lane 0 starts a new source vector; the same source may repeat.
+    if (CurIdx == 0) {
+      // The previous source must have had every lane consumed by the BV.
+      if (Sources.size() && Sources[Sources.size() - 1]
+                                    .getValueType()
+                                    .getVectorMinNumElements() != LastIdx + 1)
         return SDValue();
-    }
+      Sources.push_back(SourceVec);
+    } else if (CurIdx != LastIdx + 1)
+      return SDValue();
+
+    LastIdx = CurIdx;
+    MaxIdx = std::max(MaxIdx, CurIdx);
   }
 
-  // Turn the buildvector into a series of truncates and concates, which will
-  // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
-  // concat together to produce 2 v8i16. These are both truncated and concat
-  // together.
+  // Last source must be fully used. NOTE(review): Sources may be empty here.
+  if (Sources[Sources.size() - 1].getValueType().getVectorMinNumElements() !=
+      LastIdx + 1)
+    return SDValue();
+  if (Sources.size() % 2 != 0)
+    return SDValue();
+
+  // Every operand matched, so pairwise truncate (128-bit sources only) and
+  // concatenate the sources until a single vector remains.
   SDLoc DL(V);
-  SDValue Trunc[4] = {
-      V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
-      V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
-  for (SDValue &V : Trunc)
-    if (V.getValueType() == MVT::v4i32)
-      V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
-  SDValue Concat0 =
-      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
-  SDValue Concat1 =
-      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
-  SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
-  SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
-  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
+  LLVMContext &Ctx = *DAG.getContext();
+  while (Sources.size() > 1) {
+    for (unsigned i = 0; i < Sources.size(); i += 2) {
+      SDValue V1 = Sources[i];
+      SDValue V2 = Sources[i + 1];
+      EVT VT1 = V1.getValueType();
+      EVT VT2 = V2.getValueType();
+
+      if (VT1.is128BitVector()) {
+        VT1 = VT1.changeVectorElementType(
+            VT1.getVectorElementType().getHalfSizedIntegerVT(Ctx));
+        V1 = DAG.getNode(ISD::TRUNCATE, DL, VT1, V1);
+      }
+      if (VT2.is128BitVector()) {
+        VT2 = VT2.changeVectorElementType(
+            VT2.getVectorElementType().getHalfSizedIntegerVT(Ctx));
+        V2 = DAG.getNode(ISD::TRUNCATE, DL, VT2, V2);
+      }
+
+      assert(VT1 == VT2 && "Mismatched types.");
+      Sources[i / 2] =
+          DAG.getNode(ISD::CONCAT_VECTORS, DL,
+                      VT1.getDoubleNumVectorElementsVT(Ctx), V1, V2);
+    }
+    Sources.resize(Sources.size() / 2);
+  }
+
+  // The result may still differ from BVTy, e.g. <4i32, 4i32> -> 8i8. Cast the
+  // two concat operands to BVTy and shuffle out the even lanes instead.
+  if (Sources[0].getValueType() != BVTy) {
+    SDValue V1 = Sources[0].getOperand(0);
+    SDValue V2 = Sources[0].getOperand(1);
+    V1 = DAG.getNode(DAG.getDataLayout().isLittleEndian() ? ISD::BITCAST
+                                                          : AArch64ISD::NVCAST,
+                     DL, BVTy, V1);
+    V2 = DAG.getNode(DAG.getDataLayout().isLittleEndian() ? ISD::BITCAST
+                                                          : AArch64ISD::NVCAST,
+                     DL, BVTy, V2);
+
+    SmallVector<int, 8> MaskVec;
+    for (unsigned i = 0; i < BVTy.getVectorNumElements() * 2; i += 2)
+      MaskVec.push_back(i);
+    return DAG.getVectorShuffle(BVTy, DL, V1, V2, MaskVec);
+  }
+  return Sources[0];
 }
 
 /// Check if a vector shuffle corresponds to a DUP instructions with a larger
@@ -13305,8 +13356,9 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
   // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
   // v4i32s. This is really a truncate, which we can construct out of (legal)
   // concats and truncate nodes.
-  if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
-    return M;
+  if (AllLanesExtractElt)
+    if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
+      return M;
 
   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
   if (NumElts >= 4) {
@@ -19096,6 +19148,28 @@ static SDValue performBuildVectorCombine(SDNode *N,
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
+  //    BUILD_VECTOR (extract_elt(Assert[S|Z]ext(x)))
+  // => BUILD_VECTOR (extract_elt(x))
+  SmallVector<SDValue, 8> Ops;
+  bool ExtractExtended = false;
+  for (SDValue Extr : N->ops()) {
+    if (Extr.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+      ExtractExtended = false;
+      break;
+    }
+    SDValue ExtractBase = Extr.getOperand(0);
+    if (ExtractBase.getOpcode() == ISD::AssertSext ||
+        ExtractBase.getOpcode() == ISD::AssertZext) {
+      ExtractExtended = true;
+      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                                Extr.getValueType(), ExtractBase.getOperand(0),
+                                Extr.getOperand(1)));
+    } else
+      Ops.push_back(Extr);
+  }
+  if (ExtractExtended)
+    return DAG.getBuildVector(VT, DL, Ops);
+
   // A build vector of two extracted elements is equivalent to an
   // extract subvector where the inner vector is any-extended to the
   // extract_vector_elt VT.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 8c2a852850320f..331eaa6fb24fda 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6114,6 +6114,7 @@ def : Pat<(v8i16 (concat_vectors (v4i16 (trunc (v4i32 V128:$Vn))),
 def : Pat<(v4i32 (concat_vectors (v2i32 (trunc (v2i64 V128:$Vn))),
                                  (v2i32 (trunc (v2i64 V128:$Vm))))),
           (UZP1v4i32 V128:$Vn, V128:$Vm)>;
+
 // These are the same as above, with an optional assertzext node that can be
 // generated from fptoi lowering.
 def : Pat<(v16i8 (concat_vectors (v8i8 (assertzext (trunc (v8i16 V128:$Vn)))),
diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index 9bf638f57a5120..193e3b0cfbc7bc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -8,9 +8,8 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(ptr %ptr) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %ptr
   %tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
@@ -26,13 +25,10 @@ define <8 x i8> @fptosi_v4f64_to_v4i8(ptr %ptr) {
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
 ; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEXT:    xtn v3.2s, v3.2d
-; CHECK-NEXT:    xtn v2.2s, v2.2d
-; CHECK-NEXT:    uzp1 v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    uzp1 v1.4h, v2.4h, v3.4h
-; CHECK-NEXT:    uzp1 v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    uzp1 v1.4s, v2.4s, v3.4s
+; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x double>, ptr %ptr
   %tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
@@ -72,9 +68,8 @@ define <4 x i16> @fptoui_v4f64_to_v4i16(ptr %ptr) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %ptr
   %tmp2 = fptoui <4 x double> %tmp1 to <4 x i16>
diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
index 1ea87bb6b04b51..0a3b9a070c2b32 100644
--- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
@@ -73,9 +73,8 @@ define void @fptoui_v8f32_to_v8i8_no_loop(ptr %A, ptr %dst) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs.4s v1, v1
 ; CHECK-NEXT:    fcvtzs.4s v0, v0
-; CHECK-NEXT:    xtn.4h v1, v1
-; CHECK-NEXT:    xtn.4h v0, v0
-; CHECK-NEXT:    uzp1.8b v0, v0, v1
+; CHECK-NEXT:    uzp1.8h v0, v0, v1
+; CHECK-NEXT:    xtn.8b v0, v0
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 251719c1e3b430..a099db47655558 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -1096,30 +1096,17 @@ entry:
 }
 
 define <3 x i16> @fptos_v3f64_v3i16(<3 x double> %a) {
-; CHECK-SD-LABEL: fptos_v3f64_v3i16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT:    fcvtzs v1.2d, v2.2d
-; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fptos_v3f64_v3i16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    fcvtzs v1.2d, v2.2d
-; CHECK-GI-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fptos_v3f64_v3i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    fcvtzs v1.2d, v2.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
 entry:
   %c = fptosi <3 x double> %a to <3 x i16>
   ret <3 x i16> %c
@@ -1134,9 +1121,8 @@ define <3 x i16> @fptou_v3f64_v3i16(<3 x double> %a) {
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v3f64_v3i16:
@@ -1160,9 +1146,8 @@ define <4 x i16> @fptos_v4f64_v4i16(<4 x double> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v4f64_v4i16:
@@ -1182,9 +1167,8 @@ define <4 x i16> @fptou_v4f64_v4i16(<4 x double> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v4f64_v4i16:
@@ -1204,15 +1188,11 @@ define <8 x i16> @fptos_v8f64_v8i16(<8 x double> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v3.2d, v3.2d
 ; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
-; CHECK-SD-NEXT:    adrp x8, .LCPI54_0
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v6.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v5.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v4.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v0.2d
-; CHECK-SD-NEXT:    ldr q0, [x8, :lo12:.LCPI54_0]
-; CHECK-SD-NEXT:    tbl v0.16b, { v3.16b, v4.16b, v5.16b, v6.16b }, v0.16b
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v8f64_v8i16:
@@ -1235,15 +1215,11 @@ define <8 x i16> @fptou_v8f64_v8i16(<8 x double> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v3.2d, v3.2d
 ; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
-; CHECK-SD-NEXT:    adrp x8, .LCPI55_0
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v6.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v5.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v4.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v0.2d
-; CHECK-SD-NEXT:    ldr q0, [x8, :lo12:.LCPI55_0]
-; CHECK-SD-NEXT:    tbl v0.16b, { v3.16b, v4.16b, v5.16b, v6.16b }, v0.16b
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v8f64_v8i16:
@@ -1265,25 +1241,19 @@ define <16 x i16> @fptos_v16f64_v16i16(<16 x double> %a) {
 ; CHECK-SD-LABEL: fptos_v16f64_v16i16:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v3.2d, v3.2d
-; CHECK-SD-NEXT:    fcvtzs v7.2d, v7.2d
-; CHECK-SD-NEXT:    adrp x8, .LCPI56_0
 ; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
-; CHECK-SD-NEXT:    fcvtzs v6.2d, v6.2d
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
-; CHECK-SD-NEXT:    fcvtzs v5.2d, v5.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-SD-NEXT:    fcvtzs v7.2d, v7.2d
+; CHECK-SD-NEXT:    fcvtzs v6.2d, v6.2d
+; CHECK-SD-NEXT:    fcvtzs v5.2d, v5.2d
 ; CHECK-SD-NEXT:    fcvtzs v4.2d, v4.2d
-; CHECK-SD-NEXT:    xtn v19.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v23.2s, v7.2d
-; CHECK-SD-NEXT:    xtn v18.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v22.2s, v6.2d
-; CHECK-SD-NEXT:    xtn v17.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v21.2s, v5.2d
-; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI56_0]
-; CHECK-SD-NEXT:    xtn v16.2s, v0.2d
-; CHECK-SD-NEXT:    xtn v20.2s, v4.2d
-; CHECK-SD-NEXT:    tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b
-; CHECK-SD-NEXT:    tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v1.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT:    uzp1 v3.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v16f64_v16i16:
@@ -1312,25 +1282,19 @@ define <16 x i16> @fptou_v16f64_v16i16(<16 x double> %a) {
 ; CHECK-SD-LABEL: fptou_v16f64_v16i16:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v3.2d, v3.2d
-; CHECK-SD-NEXT:    fcvtzs v7.2d, v7.2d
-; CHECK-SD-NEXT:    adrp x8, .LCPI57_0
 ; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
-; CHECK-SD-NEXT:    fcvtzs v6.2d, v6.2d
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
-; CHECK-SD-NEXT:    fcvtzs v5.2d, v5.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-SD-NEXT:    fcvtzs v7.2d, v7.2d
+; CHECK-SD-NEXT:    fcvtzs v6.2d, v6.2d
+; CHECK-SD-NEXT:    fcvtzs v5.2d, v5.2d
 ; CHECK-SD-NEXT:    fcvtzs v4.2d, v4.2d
-; CHECK-SD-NEXT:    xtn v19.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v23.2s, v7.2d
-; CHECK-SD-NEXT:    xtn v18.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v22.2s, v6.2d
-; CHECK-SD-NEXT:    xtn v17.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v21.2s, v5.2d
-; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI57_0]
-; CHECK-SD-NEXT:    xtn v16.2s, v0.2d
-; CHECK-SD-NEXT:    xtn v20.2s, v4.2d
-; CHECK-SD-NEXT:    tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b
-; CHECK-SD-NEXT:    tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v1.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT:    uzp1 v3.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v16f64_v16i16:
@@ -1358,65 +1322,38 @@ entry:
 define <32 x i16> @fptos_v32f64_v32i16(<32 x double> %a) {
 ; CHECK-SD-LABEL: fptos_v32f64_v32i16:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-SD-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-SD-NEXT:    .cfi_offset b8, -8
-; CHECK-SD-NEXT:    .cfi_offset b9, -16
-; CHECK-SD-NEXT:    .cfi_offset b10, -24
-; CHECK-SD-NEXT:    .cfi_offset b11, -32
-; CHECK-SD-NEXT:    .cfi_offset b12, -40
-; CHECK-SD-NEXT:    .cfi_offset b13, -48
-; CHECK-SD-NEXT:    .cfi_offset b14, -56
-; CHECK-SD-NEXT:    .cfi_offset b15, -64
+; CHECK-SD-NEXT:    ldp q16, q17, [sp, #64]
 ; CHECK-SD-NEXT:    fcvtzs v3.2d, v3.2d
-; CHECK-SD-NEXT:    fcvtzs v18.2d, v2.2d
-; CHECK-SD-NEXT:    adrp x8, .LCPI58_0
-; CHECK-SD-NEXT:    fcvtzs v19.2d, v1.2d
-; CHECK-SD-NEXT:    ldp q20, q21, [sp, #160]
-; CHECK-SD-NEXT:    fcvtzs v22.2d, v0.2d
-; CHECK-SD-NEXT:    ldp q23, q24, [sp, #96]
+; CHECK-SD-NEXT:    ldp q18, q19, [sp, #96]
+; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-SD-NEXT:    ldp q20, q21, [sp]
+; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-SD-NEXT:    ldp q22, q23, [sp, #32]
+; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-SD-NEXT:    fcvtzs v7.2d, v7.2d
-; CHECK-SD-NEXT:    ldp q16, q17, [sp, #128]
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
+; CHECK-SD-NEXT:    fcvtzs v6.2d, v6.2d
+; CHECK-SD-NEXT:    fcvtzs v5.2d, v5.2d
+; CHECK-SD-NEXT:    fcvtzs v4.2d, v4.2d
 ; CHECK-SD-NEXT:    fcvtzs v21.2d, v21.2d
 ; CHECK-SD-NEXT:    fcvtzs v20.2d, v20.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v18.2d
-; CHECK-SD-NEXT:    ldp q18, q25, [sp, #64]
-; CHECK-SD-NEXT:    xtn v1.2s, v19.2d
-; CHECK-SD-NEXT:    fcvtzs v19.2d, v24.2d
-; CHECK-SD-NEXT:    fcvtzs v17.2d, v17.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v22.2d
-; CHECK-SD-NEXT:    fcvtzs v22.2d, v23.2d
-; CHECK-SD-NEXT:    xtn v29.2s, v7.2d
-; CHECK-SD-NEXT:    fcvtzs v7.2d, v25.2d
-; CHECK-SD-NEXT:    fcvtzs v6.2d, v6.2d
+; CHECK-SD-NEXT:    fcvtzs v23.2d, v23.2d
+; CHECK-SD-NEXT:    fcvtzs v22.2d, v22.2d
+; CHECK-SD-NEXT:    fcvtzs v19.2d, v19.2d
 ; CHECK-SD-NEXT:    fcvtzs v18.2d, v18.2d
+; CHECK-SD-NEXT:    fcvtzs v17.2d, v17.2d
 ; CHECK-SD-NEXT:    fcvtzs v16.2d, v16.2d
-; CHECK-SD-NEXT:    fcvtzs v5.2d, v5.2d
-; CHECK-SD-NEXT:    xtn v15.2s, v21.2d
-; CHECK-SD-NEXT:    xtn v11.2s, v19.2d
-; CHECK-SD-NEXT:    fcvtzs v4.2d, v4.2d
-; CHECK-SD-NEXT:    xtn v14.2s, v20.2d
-; CHECK-SD-NEXT:    xtn v10.2s, v22.2d
-; CHECK-SD-NEXT:    xtn v13.2s, v17.2d
-; CHECK-SD-NEXT:    xtn v9.2s, v7.2d
-; CHECK-SD-NEXT:    xtn v28.2s, v6.2d
-; CHECK-SD-NEXT:    xtn v8.2s, v18.2d
-; CHECK-SD-NEXT:    xtn v12.2s, v16.2d
-; CHECK-SD-NEXT:    xtn v27.2s, v5.2d
-; CHECK-SD-NEXT:    xtn v26.2s, v4.2d
-; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI58_0]
-; CHECK-SD-NEXT:    tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
-; CHECK-SD-NEXT:    tbl v2.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v4.16b
-; CHECK-SD-NEXT:    tbl v3.16b, { v12.16b, v13.16b, v14.16b, v15.16b }, v4.16b
-; CHECK-SD-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    tbl v1.16b, { v26.16b, v27.16b, v28.16b, v29.16b }, v4.16b
-; CHECK-SD-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v1.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT:    uzp1 v3.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT:    uzp1 v5.4s, v20.4s, v21.4s
+; CHECK-SD-NEXT:    uzp1 v4.4s, v22.4s, v23.4s
+; CHECK-SD-NEXT:    uzp1 v6.4s, v18.4s, v19.4s
+; CHECK-SD-NEXT:    uzp1 v7.4s, v16.4s, v17.4s
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
+; CHECK-SD-NEXT:    uzp1 v2.8h, v5.8h, v4.8h
+; CHECK-SD-NEXT:    uzp1 v3.8h, v7.8h, v6.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v32f64_v32i16:
@@ -1462,65 +1399,38 @@ entry:
 define <32 x i16> @fptou_v32f64_v32i16(<32 x double> %a) {
 ; CHECK-SD-LABEL: fptou_v32f64_v32i16:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-SD-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-SD-NEXT:    .cfi_offset b8, -8
-; CHECK-SD-NEXT:    .cfi_offset b9, -16
-; CHECK-SD-NEXT:    .cfi_offset b10, -24
-; CHECK-SD-NEXT:    .cfi_offset b11, -32
-; CHECK-SD-NEXT:    .cfi_offset b12, -40
-; CHECK-SD-NEXT:    .cfi_offset b13, -48
-; CHECK-SD-NEXT:    .cfi_offset b14, -56
-; CHECK-SD-NEXT:    .cfi_offset b15, -64
+; CHECK-SD-NEXT:    ldp q16, q17, [sp, #64]
 ; CHECK-SD-NEXT:    fcvtzs v3.2d, v3.2d
-; CHECK-SD-NEXT:    fcvtzs v18.2d, v2.2d
-; CHECK-SD-NEXT:    adrp x8, .LCPI59_0
-; CHECK-SD-NEXT:    fcvtzs v19.2d, v1.2d
-; CHECK-SD-NEXT:    ldp q20, q21, [sp, #160]
-; CHECK-SD-NEXT:    fcvtzs v22.2d, v0.2d
-; CHECK-SD-NEXT:    ldp q23, q24, [sp, #96]
+; CHECK-SD-NEXT:    ldp q18, q19, [sp, #96]
+; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-SD-NEXT:    ldp q20, q21, [sp]
+; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-SD-NEXT:    ldp q22, q23, [sp, #32]
+; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-SD-NEXT:    fcvtzs v7.2d, v7.2d
-; CHECK-SD-NEXT:    ldp q16, q17, [sp, #128]
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
+; CHECK-SD-NEXT:    fcvtzs v6.2d, v6.2d
+; CHECK-SD-NEXT:    fcvtzs v5.2d, v5.2d
+; CHECK-SD-NEXT:    fcvtzs v4.2d, v4.2d
 ; CHECK-SD-NEXT:    fcvtzs v21.2d, v21.2d
 ; CHECK-SD-NEXT:    fcvtzs v20.2d, v20.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v18.2d
-; CHECK-SD-NEXT:    ldp q18, q25, [sp, #64]
-; CHECK-SD-NEXT:    xtn v1.2s, v19.2d
-; CHECK-SD-NEXT:    fcvtzs v19.2d, v24.2d
-; CHECK-SD-NEXT:    fcvtzs v17.2d, v17.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v22.2d
-; CHECK-SD-NEXT:    fcvtzs v22.2d, v23.2d
-; CHECK-SD-NEXT:    xtn v29.2s, v7.2d
-; CHECK-SD-NEXT:    fcvtzs v7.2d, v25.2d
-; CHECK-SD-NEXT:    fcvtzs v6.2d, v6.2d
+; CHECK-SD-NEXT:    fcvtzs v23.2d, v23.2d
+; CHECK-SD-NEXT:    fcvtzs v22.2d, v22.2d
+; CHECK-SD-NEXT:    fcvtzs v19.2d, v19.2d
 ; CHECK-SD-NEXT:    fcvtzs v18.2d, v18.2d
+; CHECK-SD-NEXT:    fcvtzs v17.2d, v17.2d
 ; CHECK-SD-NEXT:    fcvtzs v16.2d, v16.2d
-; CHECK-SD-NEXT:    fcvtzs v5.2d, v5.2d
-; CHECK-SD-NEXT:    xtn v15.2s, v21.2d
-; CHECK-SD-NEXT:    xtn v11.2s, v19.2d
-; CHECK-SD-NEXT:    fcvtzs v4.2d, v4.2d
-; CHECK-SD-NEXT:    xtn v14.2s, v20.2d
-; CHECK-SD-NEXT:    xtn v10.2s, v22.2d
-; CHECK-SD-NEXT:    xtn v13.2s, v17.2d
-; CHECK-SD-NEXT:    xtn v9.2s, v7.2d
-; CHECK-SD-NEXT:    xtn v28.2s, v6.2d
-; CHECK-SD-NEXT:    xtn v8.2s, v18.2d
-; CHECK-SD-NEXT:    xtn v12.2s, v16.2d
-; CHECK-SD-NEXT:    xtn v27.2s, v5.2d
-; CHECK-SD-NEXT:    xtn v26.2s, v4.2d
-; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI59_0]
-; CHECK-SD-NEXT:    tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
-; CHECK-SD-NEXT:    tbl v2.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v4.16b
-; CHECK-SD-NEXT:    tbl v3.16b, { v12.16b, v13.16b, v14.16b, v15.16b }, v4.16b
-; CHECK-SD-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    tbl v1.16b, { v26.16b, v27.16b, v28.16b, v29.16b }, v4.16b
-; CHECK-SD-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-SD-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v1.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT:    uzp1 v3.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT:    uzp1 v5.4s, v20.4s, v21.4s
+; CHECK-SD-NEXT:    uzp1 v4.4s, v22.4s, v23.4s
+; CHECK-SD-NEXT:    uzp1 v6.4s, v18.4s, v19.4s
+; CHECK-SD-NEXT:    uzp1 v7.4s, v16.4s, v17.4s
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
+; CHECK-SD-NEXT:    uzp1 v2.8h, v5.8h, v4.8h
+; CHECK-SD-NEXT:    uzp1 v3.8h, v7.8h, v6.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v32f64_v32i16:
@@ -1600,9 +1510,8 @@ define <3 x i8> @fptos_v3f64_v3i8(<3 x double> %a) {
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    umov w0, v0.h[0]
 ; CHECK-SD-NEXT:    umov w1, v0.h[1]
 ; CHECK-SD-NEXT:    umov w2, v0.h[2]
@@ -1638,9 +1547,8 @@ define <3 x i8> @fptou_v3f64_v3i8(<3 x double> %a) {
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    umov w0, v0.h[0]
 ; CHECK-SD-NEXT:    umov w1, v0.h[1]
 ; CHECK-SD-NEXT:    umov w2, v0.h[2]
@@ -1672,9 +1580,8 @@ define <4 x i8> @fptos_v4f64_v4i8(<4 x double> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v4f64_v4i8:
@@ -1694,9 +1601,8 @@ define <4 x i8> @fptou_v4f64_v4i8(<4 x double> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v4f64_v4i8:
@@ -1718,13 +1624,10 @@ define <8 x i8> @fptos_v8f64_v8i8(<8 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v8f64_v8i8:
@@ -1750,13 +1653,10 @@ define <8 x i8> @fptou_v8f64_v8i8(<8 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v8f64_v8i8:
@@ -1786,21 +1686,13 @@ define <16 x i8> @fptos_v16f64_v16i8(<16 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v7.2s, v7.2d
-; CHECK-SD-NEXT:    xtn v6.2s, v6.2d
-; CHECK-SD-NEXT:    xtn v5.2s, v5.2d
-; CHECK-SD-NEXT:    xtn v4.2s, v4.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT:    uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT:    uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT:    uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v1.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v16f64_v16i8:
@@ -1837,21 +1729,13 @@ define <16 x i8> @fptou_v16f64_v16i8(<16 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v7.2s, v7.2d
-; CHECK-SD-NEXT:    xtn v6.2s, v6.2d
-; CHECK-SD-NEXT:    xtn v5.2s, v5.2d
-; CHECK-SD-NEXT:    xtn v4.2s, v4.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT:    uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT:    uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT:    uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v1.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v16f64_v16i8:
@@ -1900,36 +1784,20 @@ define <32 x i8> @fptos_v32f64_v32i8(<32 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v18.2d, v18.2d
 ; CHECK-SD-NEXT:    fcvtzs v17.2d, v17.2d
 ; CHECK-SD-NEXT:    fcvtzs v16.2d, v16.2d
-; CHECK-SD-NEXT:    xtn v7.2s, v7.2d
-; CHECK-SD-NEXT:    xtn v6.2s, v6.2d
-; CHECK-SD-NEXT:    xtn v5.2s, v5.2d
-; CHECK-SD-NEXT:    xtn v4.2s, v4.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    xtn v23.2s, v23.2d
-; CHECK-SD-NEXT:    xtn v22.2s, v22.2d
-; CHECK-SD-NEXT:    xtn v21.2s, v21.2d
-; CHECK-SD-NEXT:    xtn v20.2s, v20.2d
-; CHECK-SD-NEXT:    xtn v19.2s, v19.2d
-; CHECK-SD-NEXT:    xtn v18.2s, v18.2d
-; CHECK-SD-NEXT:    xtn v17.2s, v17.2d
-; CHECK-SD-NEXT:    xtn v16.2s, v16.2d
-; CHECK-SD-NEXT:    uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT:    uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    uzp1 v1.4h, v22.4h, v23.4h
-; CHECK-SD-NEXT:    uzp1 v3.4h, v20.4h, v21.4h
-; CHECK-SD-NEXT:    uzp1 v5.4h, v18.4h, v19.4h
-; CHECK-SD-NEXT:    uzp1 v7.4h, v16.4h, v17.4h
-; CHECK-SD-NEXT:    mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT:    mov v3.d[1], v1.d[0]
-; CHECK-SD-NEXT:    mov v7.d[1], v5.d[0]
+; CHECK-SD-NEXT:    uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT:    uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v3.4s, v20.4s, v21.4s
+; CHECK-SD-NEXT:    uzp1 v1.4s, v22.4s, v23.4s
+; CHECK-SD-NEXT:    uzp1 v5.4s, v18.4s, v19.4s
+; CHECK-SD-NEXT:    uzp1 v7.4s, v16.4s, v17.4s
+; CHECK-SD-NEXT:    uzp1 v4.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
+; CHECK-SD-NEXT:    uzp1 v2.8h, v7.8h, v5.8h
 ; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v4.16b
-; CHECK-SD-NEXT:    uzp1 v1.16b, v7.16b, v3.16b
+; CHECK-SD-NEXT:    uzp1 v1.16b, v2.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v32f64_v32i8:
@@ -1997,36 +1865,20 @@ define <32 x i8> @fptou_v32f64_v32i8(<32 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v18.2d, v18.2d
 ; CHECK-SD-NEXT:    fcvtzs v17.2d, v17.2d
 ; CHECK-SD-NEXT:    fcvtzs v16.2d, v16.2d
-; CHECK-SD-NEXT:    xtn v7.2s, v7.2d
-; CHECK-SD-NEXT:    xtn v6.2s, v6.2d
-; CHECK-SD-NEXT:    xtn v5.2s, v5.2d
-; CHECK-SD-NEXT:    xtn v4.2s, v4.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    xtn v23.2s, v23.2d
-; CHECK-SD-NEXT:    xtn v22.2s, v22.2d
-; CHECK-SD-NEXT:    xtn v21.2s, v21.2d
-; CHECK-SD-NEXT:    xtn v20.2s, v20.2d
-; CHECK-SD-NEXT:    xtn v19.2s, v19.2d
-; CHECK-SD-NEXT:    xtn v18.2s, v18.2d
-; CHECK-SD-NEXT:    xtn v17.2s, v17.2d
-; CHECK-SD-NEXT:    xtn v16.2s, v16.2d
-; CHECK-SD-NEXT:    uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT:    uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    uzp1 v1.4h, v22.4h, v23.4h
-; CHECK-SD-NEXT:    uzp1 v3.4h, v20.4h, v21.4h
-; CHECK-SD-NEXT:    uzp1 v5.4h, v18.4h, v19.4h
-; CHECK-SD-NEXT:    uzp1 v7.4h, v16.4h, v17.4h
-; CHECK-SD-NEXT:    mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT:    mov v3.d[1], v1.d[0]
-; CHECK-SD-NEXT:    mov v7.d[1], v5.d[0]
+; CHECK-SD-NEXT:    uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT:    uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v3.4s, v20.4s, v21.4s
+; CHECK-SD-NEXT:    uzp1 v1.4s, v22.4s, v23.4s
+; CHECK-SD-NEXT:    uzp1 v5.4s, v18.4s, v19.4s
+; CHECK-SD-NEXT:    uzp1 v7.4s, v16.4s, v17.4s
+; CHECK-SD-NEXT:    uzp1 v4.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
+; CHECK-SD-NEXT:    uzp1 v2.8h, v7.8h, v5.8h
 ; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v4.16b
-; CHECK-SD-NEXT:    uzp1 v1.16b, v7.16b, v3.16b
+; CHECK-SD-NEXT:    uzp1 v1.16b, v2.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v32f64_v32i8:
@@ -3028,9 +2880,8 @@ define <8 x i8> @fptos_v8f32_v8i8(<8 x float> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-SD-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
-; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
-; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v8f32_v8i8:
@@ -3050,9 +2901,8 @@ define <8 x i8> @fptou_v8f32_v8i8(<8 x float> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-SD-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
-; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
-; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v8f32_v8i8:
@@ -3074,12 +2924,8 @@ define <16 x i8> @fptos_v16f32_v16i8(<16 x float> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v2.4s, v2.4s
 ; CHECK-SD-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-SD-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT:    xtn v3.4h, v3.4s
-; CHECK-SD-NEXT:    xtn v2.4h, v2.4s
-; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
-; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
-; CHECK-SD-NEXT:    mov v2.d[1], v3.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
 ; CHECK-SD-NEXT:    ret
 ;
@@ -3136,20 +2982,12 @@ define <32 x i8> @fptos_v32f32_v32i8(<32 x float> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v6.4s, v6.4s
 ; CHECK-SD-NEXT:    fcvtzs v5.4s, v5.4s
 ; CHECK-SD-NEXT:    fcvtzs v4.4s, v4.4s
-; CHECK-SD-NEXT:    xtn v3.4h, v3.4s
-; CHECK-SD-NEXT:    xtn v2.4h, v2.4s
-; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
-; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
-; CHECK-SD-NEXT:    xtn v7.4h, v7.4s
-; CHECK-SD-NEXT:    xtn v6.4h, v6.4s
-; CHECK-SD-NEXT:    xtn v5.4h, v5.4s
-; CHECK-SD-NEXT:    xtn v4.4h, v4.4s
-; CHECK-SD-NEXT:    mov v2.d[1], v3.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
-; CHECK-SD-NEXT:    mov v4.d[1], v5.d[0]
+; CHECK-SD-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    uzp1 v1.8h, v6.8h, v7.8h
+; CHECK-SD-NEXT:    uzp1 v3.8h, v4.8h, v5.8h
 ; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
-; CHECK-SD-NEXT:    uzp1 v1.16b, v4.16b, v6.16b
+; CHECK-SD-NEXT:    uzp1 v1.16b, v3.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v32f32_v32i8:
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index 92fd3183393ea7..ff7df77aef1163 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -3288,63 +3288,62 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) {
 define <8 x i8> @test_signed_v8f64_v8i8(<8 x double> %f) {
 ; CHECK-LABEL: test_signed_v8f64_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d4, v3.d[1]
-; CHECK-NEXT:    fcvtzs w11, d3
-; CHECK-NEXT:    mov w9, #127 // =0x7f
-; CHECK-NEXT:    mov d3, v1.d[1]
-; CHECK-NEXT:    fcvtzs w13, d2
-; CHECK-NEXT:    fcvtzs w15, d1
-; CHECK-NEXT:    fcvtzs w17, d0
-; CHECK-NEXT:    fcvtzs w8, d4
 ; CHECK-NEXT:    mov d4, v2.d[1]
-; CHECK-NEXT:    mov d2, v0.d[1]
+; CHECK-NEXT:    fcvtzs w10, d2
+; CHECK-NEXT:    mov w8, #127 // =0x7f
+; CHECK-NEXT:    mov d2, v3.d[1]
+; CHECK-NEXT:    fcvtzs w12, d3
+; CHECK-NEXT:    mov d3, v0.d[1]
+; CHECK-NEXT:    fcvtzs w15, d0
+; CHECK-NEXT:    fcvtzs w16, d1
+; CHECK-NEXT:    mov d0, v1.d[1]
+; CHECK-NEXT:    fcvtzs w9, d4
+; CHECK-NEXT:    fcvtzs w13, d2
 ; CHECK-NEXT:    fcvtzs w14, d3
-; CHECK-NEXT:    cmp w8, #127
-; CHECK-NEXT:    fcvtzs w12, d4
-; CHECK-NEXT:    fcvtzs w16, d2
-; CHECK-NEXT:    csel w10, w8, w9, lt
-; CHECK-NEXT:    mov w8, #-128 // =0xffffff80
-; CHECK-NEXT:    cmn w10, #128
-; CHECK-NEXT:    csel w10, w10, w8, gt
-; CHECK-NEXT:    cmp w11, #127
-; CHECK-NEXT:    csel w11, w11, w9, lt
+; CHECK-NEXT:    cmp w9, #127
+; CHECK-NEXT:    csel w11, w9, w8, lt
+; CHECK-NEXT:    mov w9, #-128 // =0xffffff80
 ; CHECK-NEXT:    cmn w11, #128
-; CHECK-NEXT:    csel w11, w11, w8, gt
+; CHECK-NEXT:    csel w11, w11, w9, gt
+; CHECK-NEXT:    cmp w10, #127
+; CHECK-NEXT:    csel w10, w10, w8, lt
+; CHECK-NEXT:    cmn w10, #128
+; CHECK-NEXT:    csel w10, w10, w9, gt
 ; CHECK-NEXT:    cmp w12, #127
-; CHECK-NEXT:    csel w12, w12, w9, lt
-; CHECK-NEXT:    fmov s3, w11
+; CHECK-NEXT:    csel w12, w12, w8, lt
+; CHECK-NEXT:    fmov s1, w10
 ; CHECK-NEXT:    cmn w12, #128
-; CHECK-NEXT:    csel w12, w12, w8, gt
+; CHECK-NEXT:    csel w12, w12, w9, gt
 ; CHECK-NEXT:    cmp w13, #127
-; CHECK-NEXT:    csel w13, w13, w9, lt
-; CHECK-NEXT:    mov v3.s[1], w10
+; CHECK-NEXT:    csel w13, w13, w8, lt
+; CHECK-NEXT:    mov v1.s[1], w11
 ; CHECK-NEXT:    cmn w13, #128
-; CHECK-NEXT:    csel w13, w13, w8, gt
+; CHECK-NEXT:    csel w13, w13, w9, gt
 ; CHECK-NEXT:    cmp w14, #127
-; CHECK-NEXT:    csel w14, w14, w9, lt
-; CHECK-NEXT:    fmov s2, w13
+; CHECK-NEXT:    csel w14, w14, w8, lt
 ; CHECK-NEXT:    cmn w14, #128
-; CHECK-NEXT:    csel w14, w14, w8, gt
+; CHECK-NEXT:    mov v1.s[2], w12
+; CHECK-NEXT:    csel w14, w14, w9, gt
 ; CHECK-NEXT:    cmp w15, #127
-; CHECK-NEXT:    csel w15, w15, w9, lt
-; CHECK-NEXT:    mov v2.s[1], w12
+; CHECK-NEXT:    csel w15, w15, w8, lt
 ; CHECK-NEXT:    cmn w15, #128
-; CHECK-NEXT:    csel w15, w15, w8, gt
+; CHECK-NEXT:    csel w10, w15, w9, gt
 ; CHECK-NEXT:    cmp w16, #127
-; CHECK-NEXT:    csel w11, w16, w9, lt
-; CHECK-NEXT:    fmov s1, w15
+; CHECK-NEXT:    mov v1.s[3], w13
+; CHECK-NEXT:    fmov s2, w10
+; CHECK-NEXT:    fcvtzs w10, d0
+; CHECK-NEXT:    csel w11, w16, w8, lt
 ; CHECK-NEXT:    cmn w11, #128
-; CHECK-NEXT:    csel w10, w11, w8, gt
-; CHECK-NEXT:    cmp w17, #127
-; CHECK-NEXT:    csel w9, w17, w9, lt
-; CHECK-NEXT:    mov v1.s[1], w14
-; CHECK-NEXT:    cmn w9, #128
-; CHECK-NEXT:    csel w8, w9, w8, gt
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    adrp x8, .LCPI82_0
-; CHECK-NEXT:    ldr d4, [x8, :lo12:.LCPI82_0]
-; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
+; CHECK-NEXT:    csel w11, w11, w9, gt
+; CHECK-NEXT:    mov v2.s[1], w14
+; CHECK-NEXT:    cmp w10, #127
+; CHECK-NEXT:    csel w8, w10, w8, lt
+; CHECK-NEXT:    cmn w8, #128
+; CHECK-NEXT:    mov v2.s[2], w11
+; CHECK-NEXT:    csel w8, w8, w9, gt
+; CHECK-NEXT:    mov v2.s[3], w8
+; CHECK-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
     %x = call <8 x i8> @llvm.fptosi.sat.v8f64.v8i8(<8 x double> %f)
     ret <8 x i8> %x
@@ -3353,135 +3352,115 @@ define <8 x i8> @test_signed_v8f64_v8i8(<8 x double> %f) {
 define <16 x i8> @test_signed_v16f64_v16i8(<16 x double> %f) {
 ; CHECK-LABEL: test_signed_v16f64_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d16, v0.d[1]
-; CHECK-NEXT:    fcvtzs w10, d0
+; CHECK-NEXT:    mov d16, v6.d[1]
+; CHECK-NEXT:    fcvtzs w11, d6
 ; CHECK-NEXT:    mov w8, #127 // =0x7f
-; CHECK-NEXT:    mov d0, v1.d[1]
-; CHECK-NEXT:    fcvtzs w13, d1
-; CHECK-NEXT:    mov d1, v2.d[1]
+; CHECK-NEXT:    mov d6, v7.d[1]
+; CHECK-NEXT:    fcvtzs w12, d7
+; CHECK-NEXT:    mov d7, v4.d[1]
+; CHECK-NEXT:    fcvtzs w16, d4
+; CHECK-NEXT:    mov d4, v5.d[1]
+; CHECK-NEXT:    fcvtzs w1, d3
+; CHECK-NEXT:    fcvtzs w4, d0
 ; CHECK-NEXT:    fcvtzs w9, d16
-; CHECK-NEXT:    fcvtzs w12, d0
+; CHECK-NEXT:    fcvtzs w14, d6
+; CHECK-NEXT:    fcvtzs w15, d7
+; CHECK-NEXT:    fcvtzs w18, d4
 ; CHECK-NEXT:    cmp w9, #127
-; CHECK-NEXT:    csel w11, w9, w8, lt
+; CHECK-NEXT:    csel w10, w9, w8, lt
 ; CHECK-NEXT:    mov w9, #-128 // =0xffffff80
-; CHECK-NEXT:    cmn w11, #128
-; CHECK-NEXT:    csel w11, w11, w9, gt
-; CHECK-NEXT:    cmp w10, #127
-; CHECK-NEXT:    csel w10, w10, w8, lt
 ; CHECK-NEXT:    cmn w10, #128
 ; CHECK-NEXT:    csel w10, w10, w9, gt
-; CHECK-NEXT:    cmp w12, #127
-; CHECK-NEXT:    fmov s0, w10
-; CHECK-NEXT:    csel w10, w12, w8, lt
-; CHECK-NEXT:    cmn w10, #128
-; CHECK-NEXT:    csel w10, w10, w9, gt
-; CHECK-NEXT:    cmp w13, #127
-; CHECK-NEXT:    csel w12, w13, w8, lt
-; CHECK-NEXT:    mov v0.s[1], w11
-; CHECK-NEXT:    fcvtzs w11, d1
-; CHECK-NEXT:    cmn w12, #128
-; CHECK-NEXT:    csel w12, w12, w9, gt
-; CHECK-NEXT:    fmov s1, w12
-; CHECK-NEXT:    fcvtzs w12, d2
-; CHECK-NEXT:    mov d2, v3.d[1]
-; CHECK-NEXT:    cmp w11, #127
-; CHECK-NEXT:    mov w13, v0.s[1]
-; CHECK-NEXT:    mov v1.s[1], w10
-; CHECK-NEXT:    csel w10, w11, w8, lt
-; CHECK-NEXT:    cmn w10, #128
-; CHECK-NEXT:    fcvtzs w11, d2
-; CHECK-NEXT:    csel w10, w10, w9, gt
-; CHECK-NEXT:    cmp w12, #127
-; CHECK-NEXT:    mov v0.b[1], w13
-; CHECK-NEXT:    csel w12, w12, w8, lt
-; CHECK-NEXT:    cmn w12, #128
-; CHECK-NEXT:    mov w13, v1.s[1]
-; CHECK-NEXT:    csel w12, w12, w9, gt
-; CHECK-NEXT:    cmp w11, #127
-; CHECK-NEXT:    fmov s2, w12
-; CHECK-NEXT:    fcvtzs w12, d3
-; CHECK-NEXT:    mov d3, v4.d[1]
-; CHECK-NEXT:    mov v0.b[2], v1.b[0]
-; CHECK-NEXT:    mov v2.s[1], w10
-; CHECK-NEXT:    csel w10, w11, w8, lt
-; CHECK-NEXT:    cmn w10, #128
-; CHECK-NEXT:    fcvtzs w11, d3
-; CHECK-NEXT:    csel w10, w10, w9, gt
-; CHECK-NEXT:    cmp w12, #127
-; CHECK-NEXT:    mov v0.b[3], w13
-; CHECK-NEXT:    csel w12, w12, w8, lt
-; CHECK-NEXT:    cmn w12, #128
-; CHECK-NEXT:    mov w13, v2.s[1]
-; CHECK-NEXT:    csel w12, w12, w9, gt
-; CHECK-NEXT:    cmp w11, #127
-; CHECK-NEXT:    fmov s3, w12
-; CHECK-NEXT:    fcvtzs w12, d4
-; CHECK-NEXT:    mov v0.b[4], v2.b[0]
-; CHECK-NEXT:    mov d4, v5.d[1]
-; CHECK-NEXT:    mov v3.s[1], w10
-; CHECK-NEXT:    csel w10, w11, w8, lt
-; CHECK-NEXT:    cmn w10, #128
-; CHECK-NEXT:    mov v0.b[5], w13
-; CHECK-NEXT:    csel w10, w10, w9, gt
-; CHECK-NEXT:    cmp w12, #127
-; CHECK-NEXT:    fcvtzs w11, d4
-; CHECK-NEXT:    csel w12, w12, w8, lt
-; CHECK-NEXT:    cmn w12, #128
-; CHECK-NEXT:    mov w13, v3.s[1]
-; CHECK-NEXT:    csel w12, w12, w9, gt
-; CHECK-NEXT:    mov v0.b[6], v3.b[0]
-; CHECK-NEXT:    fmov s4, w12
-; CHECK-NEXT:    fcvtzs w12, d5
 ; CHECK-NEXT:    cmp w11, #127
-; CHECK-NEXT:    mov d5, v6.d[1]
-; CHECK-NEXT:    mov v4.s[1], w10
-; CHECK-NEXT:    csel w10, w11, w8, lt
-; CHECK-NEXT:    mov v0.b[7], w13
-; CHECK-NEXT:    cmn w10, #128
-; CHECK-NEXT:    csel w10, w10, w9, gt
+; CHECK-NEXT:    csel w11, w11, w8, lt
+; CHECK-NEXT:    cmn w11, #128
+; CHECK-NEXT:    csel w13, w11, w9, gt
 ; CHECK-NEXT:    cmp w12, #127
-; CHECK-NEXT:    fcvtzs w13, d5
 ; CHECK-NEXT:    csel w11, w12, w8, lt
 ; CHECK-NEXT:    cmn w11, #128
-; CHECK-NEXT:    mov w12, v4.s[1]
-; CHECK-NEXT:    mov v0.b[8], v4.b[0]
-; CHECK-NEXT:    csel w11, w11, w9, gt
-; CHECK-NEXT:    fmov s5, w11
-; CHECK-NEXT:    fcvtzs w11, d6
-; CHECK-NEXT:    cmp w13, #127
-; CHECK-NEXT:    mov d6, v7.d[1]
-; CHECK-NEXT:    mov v0.b[9], w12
-; CHECK-NEXT:    mov v5.s[1], w10
-; CHECK-NEXT:    csel w10, w13, w8, lt
-; CHECK-NEXT:    cmn w10, #128
-; CHECK-NEXT:    csel w10, w10, w9, gt
-; CHECK-NEXT:    cmp w11, #127
-; CHECK-NEXT:    fcvtzs w13, d6
-; CHECK-NEXT:    csel w11, w11, w8, lt
-; CHECK-NEXT:    cmn w11, #128
-; CHECK-NEXT:    mov v0.b[10], v5.b[0]
-; CHECK-NEXT:    mov w12, v5.s[1]
 ; CHECK-NEXT:    csel w11, w11, w9, gt
-; CHECK-NEXT:    fmov s6, w11
-; CHECK-NEXT:    fcvtzs w11, d7
-; CHECK-NEXT:    cmp w13, #127
-; CHECK-NEXT:    mov v0.b[11], w12
-; CHECK-NEXT:    mov v6.s[1], w10
-; CHECK-NEXT:    csel w10, w13, w8, lt
-; CHECK-NEXT:    cmn w10, #128
-; CHECK-NEXT:    csel w10, w10, w9, gt
-; CHECK-NEXT:    cmp w11, #127
-; CHECK-NEXT:    csel w8, w11, w8, lt
+; CHECK-NEXT:    cmp w14, #127
+; CHECK-NEXT:    csel w12, w14, w8, lt
+; CHECK-NEXT:    cmn w12, #128
+; CHECK-NEXT:    csel w12, w12, w9, gt
+; CHECK-NEXT:    cmp w15, #127
+; CHECK-NEXT:    csel w14, w15, w8, lt
+; CHECK-NEXT:    fcvtzs w15, d5
+; CHECK-NEXT:    mov d5, v2.d[1]
+; CHECK-NEXT:    cmn w14, #128
+; CHECK-NEXT:    csel w14, w14, w9, gt
+; CHECK-NEXT:    cmp w16, #127
+; CHECK-NEXT:    csel w16, w16, w8, lt
+; CHECK-NEXT:    cmn w16, #128
+; CHECK-NEXT:    fcvtzs w0, d5
+; CHECK-NEXT:    csel w17, w16, w9, gt
+; CHECK-NEXT:    cmp w15, #127
+; CHECK-NEXT:    csel w15, w15, w8, lt
+; CHECK-NEXT:    cmn w15, #128
+; CHECK-NEXT:    csel w15, w15, w9, gt
+; CHECK-NEXT:    cmp w18, #127
+; CHECK-NEXT:    csel w16, w18, w8, lt
+; CHECK-NEXT:    fcvtzs w18, d2
+; CHECK-NEXT:    mov d2, v3.d[1]
+; CHECK-NEXT:    cmn w16, #128
+; CHECK-NEXT:    mov d3, v0.d[1]
+; CHECK-NEXT:    fmov s0, w13
+; CHECK-NEXT:    csel w16, w16, w9, gt
+; CHECK-NEXT:    cmp w0, #127
+; CHECK-NEXT:    csel w0, w0, w8, lt
+; CHECK-NEXT:    cmn w0, #128
+; CHECK-NEXT:    fcvtzs w2, d2
+; CHECK-NEXT:    mov v0.s[1], w10
+; CHECK-NEXT:    csel w0, w0, w9, gt
+; CHECK-NEXT:    cmp w18, #127
+; CHECK-NEXT:    fcvtzs w3, d3
+; CHECK-NEXT:    csel w18, w18, w8, lt
+; CHECK-NEXT:    fmov s2, w17
+; CHECK-NEXT:    cmn w18, #128
+; CHECK-NEXT:    csel w18, w18, w9, gt
+; CHECK-NEXT:    cmp w1, #127
+; CHECK-NEXT:    mov v0.s[2], w11
+; CHECK-NEXT:    csel w1, w1, w8, lt
+; CHECK-NEXT:    fmov s3, w18
+; CHECK-NEXT:    mov v2.s[1], w14
+; CHECK-NEXT:    cmn w1, #128
+; CHECK-NEXT:    csel w1, w1, w9, gt
+; CHECK-NEXT:    cmp w2, #127
+; CHECK-NEXT:    csel w2, w2, w8, lt
+; CHECK-NEXT:    mov v3.s[1], w0
+; CHECK-NEXT:    mov v0.s[3], w12
+; CHECK-NEXT:    cmn w2, #128
+; CHECK-NEXT:    mov v2.s[2], w15
+; CHECK-NEXT:    csel w2, w2, w9, gt
+; CHECK-NEXT:    cmp w3, #127
+; CHECK-NEXT:    csel w3, w3, w8, lt
+; CHECK-NEXT:    cmn w3, #128
+; CHECK-NEXT:    mov v3.s[2], w1
+; CHECK-NEXT:    csel w13, w3, w9, gt
+; CHECK-NEXT:    cmp w4, #127
+; CHECK-NEXT:    mov v2.s[3], w16
+; CHECK-NEXT:    csel w3, w4, w8, lt
+; CHECK-NEXT:    fcvtzs w4, d1
+; CHECK-NEXT:    mov d1, v1.d[1]
+; CHECK-NEXT:    cmn w3, #128
+; CHECK-NEXT:    csel w10, w3, w9, gt
+; CHECK-NEXT:    mov v3.s[3], w2
+; CHECK-NEXT:    fmov s4, w10
+; CHECK-NEXT:    uzp1 v0.8h, v2.8h, v0.8h
+; CHECK-NEXT:    cmp w4, #127
+; CHECK-NEXT:    fcvtzs w10, d1
+; CHECK-NEXT:    mov v4.s[1], w13
+; CHECK-NEXT:    csel w13, w4, w8, lt
+; CHECK-NEXT:    cmn w13, #128
+; CHECK-NEXT:    csel w11, w13, w9, gt
+; CHECK-NEXT:    cmp w10, #127
+; CHECK-NEXT:    csel w8, w10, w8, lt
+; CHECK-NEXT:    mov v4.s[2], w11
 ; CHECK-NEXT:    cmn w8, #128
-; CHECK-NEXT:    mov v0.b[12], v6.b[0]
-; CHECK-NEXT:    mov w11, v6.s[1]
 ; CHECK-NEXT:    csel w8, w8, w9, gt
-; CHECK-NEXT:    fmov s7, w8
-; CHECK-NEXT:    mov v0.b[13], w11
-; CHECK-NEXT:    mov v7.s[1], w10
-; CHECK-NEXT:    mov v0.b[14], v7.b[0]
-; CHECK-NEXT:    mov w8, v7.s[1]
-; CHECK-NEXT:    mov v0.b[15], w8
+; CHECK-NEXT:    mov v4.s[3], w8
+; CHECK-NEXT:    uzp1 v1.8h, v4.8h, v3.8h
+; CHECK-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    ret
     %x = call <16 x i8> @llvm.fptosi.sat.v16f64.v16i8(<16 x double> %f)
     ret <16 x i8> %x
@@ -3490,63 +3469,61 @@ define <16 x i8> @test_signed_v16f64_v16i8(<16 x double> %f) {
 define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) {
 ; CHECK-LABEL: test_signed_v8f64_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d4, v3.d[1]
+; CHECK-NEXT:    mov d4, v2.d[1]
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    fcvtzs w11, d3
-; CHECK-NEXT:    mov d3, v1.d[1]
-; CHECK-NEXT:    fcvtzs w13, d2
-; CHECK-NEXT:    fcvtzs w15, d1
-; CHECK-NEXT:    fcvtzs w17, d0
+; CHECK-NEXT:    fcvtzs w10, d2
+; CHECK-NEXT:    mov d2, v3.d[1]
+; CHECK-NEXT:    fcvtzs w12, d3
+; CHECK-NEXT:    mov d3, v0.d[1]
+; CHECK-NEXT:    fcvtzs w15, d0
+; CHECK-NEXT:    fcvtzs w16, d1
+; CHECK-NEXT:    mov d0, v1.d[1]
 ; CHECK-NEXT:    fcvtzs w9, d4
-; CHECK-NEXT:    mov d4, v2.d[1]
-; CHECK-NEXT:    mov d2, v0.d[1]
+; CHECK-NEXT:    fcvtzs w13, d2
 ; CHECK-NEXT:    fcvtzs w14, d3
 ; CHECK-NEXT:    cmp w9, w8
-; CHECK-NEXT:    fcvtzs w12, d4
-; CHECK-NEXT:    fcvtzs w16, d2
-; CHECK-NEXT:    csel w10, w9, w8, lt
+; CHECK-NEXT:    csel w11, w9, w8, lt
 ; CHECK-NEXT:    mov w9, #-32768 // =0xffff8000
-; CHECK-NEXT:    cmn w10, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w10, w10, w9, gt
-; CHECK-NEXT:    cmp w11, w8
-; CHECK-NEXT:    csel w11, w11, w8, lt
 ; CHECK-NEXT:    cmn w11, #8, lsl #12 // =32768
 ; CHECK-NEXT:    csel w11, w11, w9, gt
+; CHECK-NEXT:    cmp w10, w8
+; CHECK-NEXT:    csel w10, w10, w8, lt
+; CHECK-NEXT:    cmn w10, #8, lsl #12 // =32768
+; CHECK-NEXT:    csel w10, w10, w9, gt
 ; CHECK-NEXT:    cmp w12, w8
 ; CHECK-NEXT:    csel w12, w12, w8, lt
-; CHECK-NEXT:    fmov s3, w11
+; CHECK-NEXT:    fmov s1, w10
 ; CHECK-NEXT:    cmn w12, #8, lsl #12 // =32768
 ; CHECK-NEXT:    csel w12, w12, w9, gt
 ; CHECK-NEXT:    cmp w13, w8
 ; CHECK-NEXT:    csel w13, w13, w8, lt
-; CHECK-NEXT:    mov v3.s[1], w10
+; CHECK-NEXT:    mov v1.s[1], w11
 ; CHECK-NEXT:    cmn w13, #8, lsl #12 // =32768
 ; CHECK-NEXT:    csel w13, w13, w9, gt
 ; CHECK-NEXT:    cmp w14, w8
 ; CHECK-NEXT:    csel w14, w14, w8, lt
-; CHECK-NEXT:    fmov s2, w13
 ; CHECK-NEXT:    cmn w14, #8, lsl #12 // =32768
+; CHECK-NEXT:    mov v1.s[2], w12
 ; CHECK-NEXT:    csel w14, w14, w9, gt
 ; CHECK-NEXT:    cmp w15, w8
 ; CHECK-NEXT:    csel w15, w15, w8, lt
-; CHECK-NEXT:    mov v2.s[1], w12
 ; CHECK-NEXT:    cmn w15, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w15, w15, w9, gt
+; CHECK-NEXT:    csel w10, w15, w9, gt
 ; CHECK-NEXT:    cmp w16, w8
+; CHECK-NEXT:    mov v1.s[3], w13
+; CHECK-NEXT:    fmov s2, w10
+; CHECK-NEXT:    fcvtzs w10, d0
 ; CHECK-NEXT:    csel w11, w16, w8, lt
-; CHECK-NEXT:    fmov s1, w15
 ; CHECK-NEXT:    cmn w11, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w10, w11, w9, gt
-; CHECK-NEXT:    cmp w17, w8
-; CHECK-NEXT:    csel w8, w17, w8, lt
-; CHECK-NEXT:    mov v1.s[1], w14
+; CHECK-NEXT:    csel w11, w11, w9, gt
+; CHECK-NEXT:    mov v2.s[1], w14
+; CHECK-NEXT:    cmp w10, w8
+; CHECK-NEXT:    csel w8, w10, w8, lt
 ; CHECK-NEXT:    cmn w8, #8, lsl #12 // =32768
+; CHECK-NEXT:    mov v2.s[2], w11
 ; CHECK-NEXT:    csel w8, w8, w9, gt
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    adrp x8, .LCPI84_0
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI84_0]
-; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
+; CHECK-NEXT:    mov v2.s[3], w8
+; CHECK-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
 ; CHECK-NEXT:    ret
     %x = call <8 x i16> @llvm.fptosi.sat.v8f64.v8i16(<8 x double> %f)
     ret <8 x i16> %x
@@ -3555,116 +3532,114 @@ define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) {
 define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) {
 ; CHECK-LABEL: test_signed_v16f64_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d16, v3.d[1]
-; CHECK-NEXT:    mov w9, #32767 // =0x7fff
-; CHECK-NEXT:    fcvtzs w11, d3
-; CHECK-NEXT:    mov d3, v1.d[1]
+; CHECK-NEXT:    mov d16, v2.d[1]
+; CHECK-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-NEXT:    fcvtzs w11, d2
+; CHECK-NEXT:    mov d2, v3.d[1]
+; CHECK-NEXT:    fcvtzs w12, d3
+; CHECK-NEXT:    mov d3, v0.d[1]
+; CHECK-NEXT:    fcvtzs w16, d0
+; CHECK-NEXT:    mov d0, v1.d[1]
+; CHECK-NEXT:    fcvtzs w1, d7
+; CHECK-NEXT:    fcvtzs w4, d4
+; CHECK-NEXT:    fcvtzs w9, d16
 ; CHECK-NEXT:    fcvtzs w14, d2
-; CHECK-NEXT:    fcvtzs w15, d1
-; CHECK-NEXT:    mov d1, v7.d[1]
+; CHECK-NEXT:    fcvtzs w15, d3
 ; CHECK-NEXT:    fcvtzs w18, d0
-; CHECK-NEXT:    fcvtzs w1, d7
-; CHECK-NEXT:    fcvtzs w2, d6
-; CHECK-NEXT:    fcvtzs w4, d5
-; CHECK-NEXT:    fcvtzs w6, d4
-; CHECK-NEXT:    fcvtzs w8, d16
-; CHECK-NEXT:    mov d16, v2.d[1]
-; CHECK-NEXT:    mov d2, v0.d[1]
-; CHECK-NEXT:    mov d0, v6.d[1]
-; CHECK-NEXT:    fcvtzs w0, d1
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    fcvtzs w13, d16
-; CHECK-NEXT:    fcvtzs w17, d2
-; CHECK-NEXT:    csel w10, w8, w9, lt
-; CHECK-NEXT:    mov w8, #-32768 // =0xffff8000
+; CHECK-NEXT:    mov d0, v7.d[1]
+; CHECK-NEXT:    cmp w9, w8
+; CHECK-NEXT:    csel w10, w9, w8, lt
+; CHECK-NEXT:    mov w9, #-32768 // =0xffff8000
 ; CHECK-NEXT:    cmn w10, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w10, w10, w8, gt
-; CHECK-NEXT:    cmp w11, w9
-; CHECK-NEXT:    csel w11, w11, w9, lt
+; CHECK-NEXT:    fcvtzs w2, d0
+; CHECK-NEXT:    csel w10, w10, w9, gt
+; CHECK-NEXT:    cmp w11, w8
+; CHECK-NEXT:    csel w11, w11, w8, lt
 ; CHECK-NEXT:    cmn w11, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w12, w11, w8, gt
-; CHECK-NEXT:    cmp w13, w9
-; CHECK-NEXT:    csel w11, w13, w9, lt
-; CHECK-NEXT:    fcvtzs w13, d3
+; CHECK-NEXT:    csel w13, w11, w9, gt
+; CHECK-NEXT:    cmp w12, w8
+; CHECK-NEXT:    csel w11, w12, w8, lt
+; CHECK-NEXT:    fmov s0, w13
 ; CHECK-NEXT:    cmn w11, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w11, w11, w8, gt
-; CHECK-NEXT:    cmp w14, w9
-; CHECK-NEXT:    csel w14, w14, w9, lt
+; CHECK-NEXT:    csel w11, w11, w9, gt
+; CHECK-NEXT:    cmp w14, w8
+; CHECK-NEXT:    csel w12, w14, w8, lt
+; CHECK-NEXT:    mov v0.s[1], w10
+; CHECK-NEXT:    cmn w12, #8, lsl #12 // =32768
+; CHECK-NEXT:    csel w12, w12, w9, gt
+; CHECK-NEXT:    cmp w15, w8
+; CHECK-NEXT:    csel w14, w15, w8, lt
+; CHECK-NEXT:    fcvtzs w15, d1
+; CHECK-NEXT:    mov d1, v6.d[1]
 ; CHECK-NEXT:    cmn w14, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w14, w14, w8, gt
-; CHECK-NEXT:    cmp w13, w9
-; CHECK-NEXT:    csel w13, w13, w9, lt
-; CHECK-NEXT:    cmn w13, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w13, w13, w8, gt
-; CHECK-NEXT:    cmp w15, w9
-; CHECK-NEXT:    csel w15, w15, w9, lt
-; CHECK-NEXT:    cmn w15, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w16, w15, w8, gt
-; CHECK-NEXT:    cmp w17, w9
-; CHECK-NEXT:    csel w15, w17, w9, lt
+; CHECK-NEXT:    mov v0.s[2], w11
+; CHECK-NEXT:    csel w14, w14, w9, gt
+; CHECK-NEXT:    cmp w16, w8
+; CHECK-NEXT:    csel w16, w16, w8, lt
+; CHECK-NEXT:    cmn w16, #8, lsl #12 // =32768
+; CHECK-NEXT:    fcvtzs w0, d1
+; CHECK-NEXT:    mov d1, v4.d[1]
+; CHECK-NEXT:    csel w17, w16, w9, gt
+; CHECK-NEXT:    cmp w15, w8
+; CHECK-NEXT:    mov v0.s[3], w12
+; CHECK-NEXT:    csel w15, w15, w8, lt
+; CHECK-NEXT:    fmov s2, w17
 ; CHECK-NEXT:    cmn w15, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w15, w15, w8, gt
-; CHECK-NEXT:    cmp w18, w9
-; CHECK-NEXT:    csel w17, w18, w9, lt
-; CHECK-NEXT:    cmn w17, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w17, w17, w8, gt
-; CHECK-NEXT:    cmp w0, w9
-; CHECK-NEXT:    csel w18, w0, w9, lt
-; CHECK-NEXT:    fcvtzs w0, d0
-; CHECK-NEXT:    mov d0, v5.d[1]
+; CHECK-NEXT:    csel w15, w15, w9, gt
+; CHECK-NEXT:    cmp w18, w8
+; CHECK-NEXT:    fcvtzs w3, d1
+; CHECK-NEXT:    csel w16, w18, w8, lt
+; CHECK-NEXT:    fcvtzs w18, d6
+; CHECK-NEXT:    mov d1, v5.d[1]
+; CHECK-NEXT:    cmn w16, #8, lsl #12 // =32768
+; CHECK-NEXT:    mov v2.s[1], w14
+; CHECK-NEXT:    csel w16, w16, w9, gt
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    csel w0, w0, w8, lt
+; CHECK-NEXT:    cmn w0, #8, lsl #12 // =32768
+; CHECK-NEXT:    csel w0, w0, w9, gt
+; CHECK-NEXT:    cmp w18, w8
+; CHECK-NEXT:    mov v2.s[2], w15
+; CHECK-NEXT:    csel w18, w18, w8, lt
 ; CHECK-NEXT:    cmn w18, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w18, w18, w8, gt
-; CHECK-NEXT:    cmp w1, w9
-; CHECK-NEXT:    csel w1, w1, w9, lt
+; CHECK-NEXT:    csel w18, w18, w9, gt
+; CHECK-NEXT:    cmp w1, w8
+; CHECK-NEXT:    csel w1, w1, w8, lt
+; CHECK-NEXT:    fmov s3, w18
+; CHECK-NEXT:    mov v2.s[3], w16
 ; CHECK-NEXT:    cmn w1, #8, lsl #12 // =32768
-; CHECK-NEXT:    fcvtzs w3, d0
-; CHECK-NEXT:    mov d0, v4.d[1]
-; CHECK-NEXT:    csel w1, w1, w8, gt
-; CHECK-NEXT:    cmp w0, w9
-; CHECK-NEXT:    csel w0, w0, w9, lt
-; CHECK-NEXT:    fmov s7, w1
-; CHECK-NEXT:    cmn w0, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w0, w0, w8, gt
-; CHECK-NEXT:    cmp w2, w9
-; CHECK-NEXT:    fcvtzs w5, d0
-; CHECK-NEXT:    csel w2, w2, w9, lt
-; CHECK-NEXT:    fmov s3, w12
-; CHECK-NEXT:    mov v7.s[1], w18
+; CHECK-NEXT:    csel w1, w1, w9, gt
+; CHECK-NEXT:    cmp w2, w8
+; CHECK-NEXT:    csel w2, w2, w8, lt
+; CHECK-NEXT:    mov v3.s[1], w0
 ; CHECK-NEXT:    cmn w2, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w2, w2, w8, gt
-; CHECK-NEXT:    cmp w3, w9
-; CHECK-NEXT:    csel w3, w3, w9, lt
-; CHECK-NEXT:    mov v3.s[1], w10
-; CHECK-NEXT:    fmov s6, w2
+; CHECK-NEXT:    uzp1 v0.8h, v2.8h, v0.8h
+; CHECK-NEXT:    csel w2, w2, w9, gt
+; CHECK-NEXT:    cmp w3, w8
+; CHECK-NEXT:    csel w3, w3, w8, lt
 ; CHECK-NEXT:    cmn w3, #8, lsl #12 // =32768
-; CHECK-NEXT:    fmov s2, w14
-; CHECK-NEXT:    csel w3, w3, w8, gt
-; CHECK-NEXT:    cmp w4, w9
-; CHECK-NEXT:    csel w4, w4, w9, lt
-; CHECK-NEXT:    mov v6.s[1], w0
-; CHECK-NEXT:    cmn w4, #8, lsl #12 // =32768
-; CHECK-NEXT:    mov v2.s[1], w11
-; CHECK-NEXT:    csel w12, w4, w8, gt
-; CHECK-NEXT:    cmp w5, w9
-; CHECK-NEXT:    fmov s1, w16
-; CHECK-NEXT:    csel w10, w5, w9, lt
-; CHECK-NEXT:    fmov s5, w12
-; CHECK-NEXT:    cmn w10, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w10, w10, w8, gt
-; CHECK-NEXT:    cmp w6, w9
-; CHECK-NEXT:    mov v1.s[1], w13
-; CHECK-NEXT:    csel w9, w6, w9, lt
-; CHECK-NEXT:    mov v5.s[1], w3
-; CHECK-NEXT:    fmov s0, w17
-; CHECK-NEXT:    cmn w9, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w8, w9, w8, gt
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov v0.s[1], w15
-; CHECK-NEXT:    adrp x8, .LCPI85_0
-; CHECK-NEXT:    ldr q16, [x8, :lo12:.LCPI85_0]
-; CHECK-NEXT:    mov v4.s[1], w10
-; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v16.16b
-; CHECK-NEXT:    tbl v1.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v16.16b
+; CHECK-NEXT:    mov v3.s[2], w1
+; CHECK-NEXT:    csel w13, w3, w9, gt
+; CHECK-NEXT:    cmp w4, w8
+; CHECK-NEXT:    csel w3, w4, w8, lt
+; CHECK-NEXT:    fcvtzs w4, d5
+; CHECK-NEXT:    cmn w3, #8, lsl #12 // =32768
+; CHECK-NEXT:    csel w10, w3, w9, gt
+; CHECK-NEXT:    mov v3.s[3], w2
+; CHECK-NEXT:    fmov s4, w10
+; CHECK-NEXT:    fcvtzs w10, d1
+; CHECK-NEXT:    cmp w4, w8
+; CHECK-NEXT:    mov v4.s[1], w13
+; CHECK-NEXT:    csel w13, w4, w8, lt
+; CHECK-NEXT:    cmn w13, #8, lsl #12 // =32768
+; CHECK-NEXT:    csel w11, w13, w9, gt
+; CHECK-NEXT:    cmp w10, w8
+; CHECK-NEXT:    csel w8, w10, w8, lt
+; CHECK-NEXT:    mov v4.s[2], w11
+; CHECK-NEXT:    cmn w8, #8, lsl #12 // =32768
+; CHECK-NEXT:    csel w8, w8, w9, gt
+; CHECK-NEXT:    mov v4.s[3], w8
+; CHECK-NEXT:    uzp1 v1.8h, v4.8h, v3.8h
 ; CHECK-NEXT:    ret
     %x = call <16 x i16> @llvm.fptosi.sat.v16f64.v16i16(<16 x double> %f)
     ret <16 x i16> %x
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index c94db3484994c3..15a12c13ec25d2 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -2754,46 +2754,45 @@ define <16 x i16> @test_unsigned_v16f16_v16i16(<16 x half> %f) {
 define <8 x i8> @test_unsigned_v8f64_v8i8(<8 x double> %f) {
 ; CHECK-LABEL: test_unsigned_v8f64_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d4, v3.d[1]
-; CHECK-NEXT:    mov d5, v2.d[1]
-; CHECK-NEXT:    mov w11, #255 // =0xff
-; CHECK-NEXT:    fcvtzu w9, d3
-; CHECK-NEXT:    mov d3, v1.d[1]
-; CHECK-NEXT:    fcvtzu w12, d2
-; CHECK-NEXT:    fcvtzu w14, d1
+; CHECK-NEXT:    mov d4, v2.d[1]
+; CHECK-NEXT:    mov d5, v3.d[1]
+; CHECK-NEXT:    mov w12, #255 // =0xff
+; CHECK-NEXT:    fcvtzu w9, d2
+; CHECK-NEXT:    fcvtzu w10, d3
+; CHECK-NEXT:    fcvtzu w14, d0
+; CHECK-NEXT:    fcvtzu w15, d1
 ; CHECK-NEXT:    fcvtzu w8, d4
 ; CHECK-NEXT:    mov d4, v0.d[1]
-; CHECK-NEXT:    fcvtzu w10, d5
-; CHECK-NEXT:    fcvtzu w13, d3
+; CHECK-NEXT:    fcvtzu w11, d5
+; CHECK-NEXT:    mov d0, v1.d[1]
 ; CHECK-NEXT:    cmp w8, #255
-; CHECK-NEXT:    fcvtzu w15, d4
-; CHECK-NEXT:    csel w8, w8, w11, lo
+; CHECK-NEXT:    fcvtzu w13, d4
+; CHECK-NEXT:    csel w8, w8, w12, lo
 ; CHECK-NEXT:    cmp w9, #255
-; CHECK-NEXT:    csel w9, w9, w11, lo
+; CHECK-NEXT:    csel w9, w9, w12, lo
 ; CHECK-NEXT:    cmp w10, #255
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    csel w9, w10, w11, lo
-; CHECK-NEXT:    cmp w12, #255
-; CHECK-NEXT:    fcvtzu w10, d0
-; CHECK-NEXT:    mov v4.s[1], w8
-; CHECK-NEXT:    csel w8, w12, w11, lo
+; CHECK-NEXT:    csel w10, w10, w12, lo
+; CHECK-NEXT:    cmp w11, #255
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    csel w11, w11, w12, lo
 ; CHECK-NEXT:    cmp w13, #255
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    csel w8, w13, w11, lo
+; CHECK-NEXT:    csel w13, w13, w12, lo
 ; CHECK-NEXT:    cmp w14, #255
-; CHECK-NEXT:    mov v3.s[1], w9
-; CHECK-NEXT:    csel w9, w14, w11, lo
+; CHECK-NEXT:    csel w14, w14, w12, lo
+; CHECK-NEXT:    mov v1.s[1], w8
 ; CHECK-NEXT:    cmp w15, #255
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    csel w9, w15, w11, lo
-; CHECK-NEXT:    cmp w10, #255
-; CHECK-NEXT:    mov v2.s[1], w8
-; CHECK-NEXT:    csel w8, w10, w11, lo
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    adrp x8, .LCPI82_0
-; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI82_0]
-; CHECK-NEXT:    mov v1.s[1], w9
-; CHECK-NEXT:    tbl v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.8b
+; CHECK-NEXT:    fmov s2, w14
+; CHECK-NEXT:    fcvtzu w8, d0
+; CHECK-NEXT:    csel w9, w15, w12, lo
+; CHECK-NEXT:    mov v2.s[1], w13
+; CHECK-NEXT:    mov v1.s[2], w10
+; CHECK-NEXT:    cmp w8, #255
+; CHECK-NEXT:    csel w8, w8, w12, lo
+; CHECK-NEXT:    mov v2.s[2], w9
+; CHECK-NEXT:    mov v1.s[3], w11
+; CHECK-NEXT:    mov v2.s[3], w8
+; CHECK-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
     %x = call <8 x i8> @llvm.fptoui.sat.v8f64.v8i8(<8 x double> %f)
     ret <8 x i8> %x
@@ -2802,102 +2801,82 @@ define <8 x i8> @test_unsigned_v8f64_v8i8(<8 x double> %f) {
 define <16 x i8> @test_unsigned_v16f64_v16i8(<16 x double> %f) {
 ; CHECK-LABEL: test_unsigned_v16f64_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d16, v0.d[1]
-; CHECK-NEXT:    fcvtzu w10, d0
+; CHECK-NEXT:    mov d16, v6.d[1]
+; CHECK-NEXT:    mov d17, v7.d[1]
 ; CHECK-NEXT:    mov w8, #255 // =0xff
+; CHECK-NEXT:    fcvtzu w10, d6
+; CHECK-NEXT:    mov d6, v4.d[1]
+; CHECK-NEXT:    fcvtzu w11, d7
+; CHECK-NEXT:    mov d7, v5.d[1]
+; CHECK-NEXT:    fcvtzu w15, d4
+; CHECK-NEXT:    fcvtzu w16, d5
+; CHECK-NEXT:    mov d4, v3.d[1]
+; CHECK-NEXT:    mov d5, v0.d[1]
+; CHECK-NEXT:    fcvtzu w18, d2
 ; CHECK-NEXT:    fcvtzu w9, d16
-; CHECK-NEXT:    mov d16, v1.d[1]
-; CHECK-NEXT:    cmp w9, #255
-; CHECK-NEXT:    csel w9, w9, w8, lo
-; CHECK-NEXT:    cmp w10, #255
-; CHECK-NEXT:    csel w10, w10, w8, lo
-; CHECK-NEXT:    fmov s0, w10
-; CHECK-NEXT:    fcvtzu w10, d16
+; CHECK-NEXT:    fcvtzu w12, d17
 ; CHECK-NEXT:    mov d16, v2.d[1]
-; CHECK-NEXT:    mov v0.s[1], w9
-; CHECK-NEXT:    fcvtzu w9, d1
-; CHECK-NEXT:    cmp w10, #255
-; CHECK-NEXT:    csel w10, w10, w8, lo
+; CHECK-NEXT:    fcvtzu w13, d6
+; CHECK-NEXT:    fcvtzu w0, d3
+; CHECK-NEXT:    fcvtzu w3, d0
+; CHECK-NEXT:    fcvtzu w1, d4
+; CHECK-NEXT:    fcvtzu w2, d5
 ; CHECK-NEXT:    cmp w9, #255
-; CHECK-NEXT:    mov w11, v0.s[1]
-; CHECK-NEXT:    csel w9, w9, w8, lo
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcvtzu w9, d16
-; CHECK-NEXT:    mov d16, v3.d[1]
-; CHECK-NEXT:    mov v0.b[1], w11
-; CHECK-NEXT:    mov v1.s[1], w10
-; CHECK-NEXT:    fcvtzu w10, d2
-; CHECK-NEXT:    cmp w9, #255
-; CHECK-NEXT:    csel w9, w9, w8, lo
+; CHECK-NEXT:    fcvtzu w17, d16
+; CHECK-NEXT:    csel w14, w9, w8, lo
 ; CHECK-NEXT:    cmp w10, #255
-; CHECK-NEXT:    mov w11, v1.s[1]
-; CHECK-NEXT:    mov v0.b[2], v1.b[0]
 ; CHECK-NEXT:    csel w10, w10, w8, lo
-; CHECK-NEXT:    fmov s2, w10
-; CHECK-NEXT:    fcvtzu w10, d16
-; CHECK-NEXT:    mov d16, v4.d[1]
-; CHECK-NEXT:    mov v0.b[3], w11
-; CHECK-NEXT:    mov v2.s[1], w9
-; CHECK-NEXT:    fcvtzu w9, d3
-; CHECK-NEXT:    cmp w10, #255
-; CHECK-NEXT:    csel w10, w10, w8, lo
-; CHECK-NEXT:    cmp w9, #255
-; CHECK-NEXT:    mov w11, v2.s[1]
-; CHECK-NEXT:    mov v0.b[4], v2.b[0]
-; CHECK-NEXT:    csel w9, w9, w8, lo
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fcvtzu w9, d16
-; CHECK-NEXT:    mov d16, v5.d[1]
-; CHECK-NEXT:    mov v0.b[5], w11
-; CHECK-NEXT:    mov v3.s[1], w10
-; CHECK-NEXT:    fcvtzu w10, d4
-; CHECK-NEXT:    cmp w9, #255
-; CHECK-NEXT:    csel w9, w9, w8, lo
-; CHECK-NEXT:    cmp w10, #255
-; CHECK-NEXT:    mov w11, v3.s[1]
-; CHECK-NEXT:    mov v0.b[6], v3.b[0]
-; CHECK-NEXT:    csel w10, w10, w8, lo
-; CHECK-NEXT:    fmov s4, w10
-; CHECK-NEXT:    fcvtzu w10, d16
-; CHECK-NEXT:    mov v0.b[7], w11
-; CHECK-NEXT:    mov v4.s[1], w9
-; CHECK-NEXT:    fcvtzu w9, d5
-; CHECK-NEXT:    mov d5, v6.d[1]
-; CHECK-NEXT:    cmp w10, #255
-; CHECK-NEXT:    csel w10, w10, w8, lo
-; CHECK-NEXT:    cmp w9, #255
-; CHECK-NEXT:    mov w11, v4.s[1]
-; CHECK-NEXT:    mov v0.b[8], v4.b[0]
-; CHECK-NEXT:    csel w9, w9, w8, lo
-; CHECK-NEXT:    fmov s16, w9
-; CHECK-NEXT:    fcvtzu w9, d5
-; CHECK-NEXT:    mov d5, v7.d[1]
-; CHECK-NEXT:    mov v0.b[9], w11
-; CHECK-NEXT:    mov v16.s[1], w10
-; CHECK-NEXT:    fcvtzu w10, d6
-; CHECK-NEXT:    cmp w9, #255
-; CHECK-NEXT:    csel w9, w9, w8, lo
-; CHECK-NEXT:    cmp w10, #255
-; CHECK-NEXT:    mov v0.b[10], v16.b[0]
-; CHECK-NEXT:    mov w11, v16.s[1]
-; CHECK-NEXT:    csel w10, w10, w8, lo
-; CHECK-NEXT:    fmov s6, w10
-; CHECK-NEXT:    fcvtzu w10, d7
-; CHECK-NEXT:    mov v0.b[11], w11
-; CHECK-NEXT:    mov v6.s[1], w9
-; CHECK-NEXT:    fcvtzu w9, d5
-; CHECK-NEXT:    cmp w9, #255
-; CHECK-NEXT:    mov v0.b[12], v6.b[0]
-; CHECK-NEXT:    mov w11, v6.s[1]
-; CHECK-NEXT:    csel w9, w9, w8, lo
-; CHECK-NEXT:    cmp w10, #255
-; CHECK-NEXT:    csel w8, w10, w8, lo
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov v0.b[13], w11
-; CHECK-NEXT:    mov v5.s[1], w9
-; CHECK-NEXT:    mov v0.b[14], v5.b[0]
-; CHECK-NEXT:    mov w8, v5.s[1]
-; CHECK-NEXT:    mov v0.b[15], w8
+; CHECK-NEXT:    cmp w11, #255
+; CHECK-NEXT:    csel w9, w11, w8, lo
+; CHECK-NEXT:    cmp w12, #255
+; CHECK-NEXT:    fcvtzu w11, d7
+; CHECK-NEXT:    csel w12, w12, w8, lo
+; CHECK-NEXT:    cmp w13, #255
+; CHECK-NEXT:    fmov s0, w10
+; CHECK-NEXT:    csel w13, w13, w8, lo
+; CHECK-NEXT:    cmp w15, #255
+; CHECK-NEXT:    csel w15, w15, w8, lo
+; CHECK-NEXT:    cmp w16, #255
+; CHECK-NEXT:    csel w16, w16, w8, lo
+; CHECK-NEXT:    cmp w11, #255
+; CHECK-NEXT:    mov v0.s[1], w14
+; CHECK-NEXT:    csel w11, w11, w8, lo
+; CHECK-NEXT:    cmp w17, #255
+; CHECK-NEXT:    fmov s2, w15
+; CHECK-NEXT:    csel w17, w17, w8, lo
+; CHECK-NEXT:    cmp w18, #255
+; CHECK-NEXT:    csel w18, w18, w8, lo
+; CHECK-NEXT:    cmp w0, #255
+; CHECK-NEXT:    csel w0, w0, w8, lo
+; CHECK-NEXT:    cmp w1, #255
+; CHECK-NEXT:    fmov s3, w18
+; CHECK-NEXT:    csel w10, w1, w8, lo
+; CHECK-NEXT:    cmp w2, #255
+; CHECK-NEXT:    mov v2.s[1], w13
+; CHECK-NEXT:    csel w14, w2, w8, lo
+; CHECK-NEXT:    cmp w3, #255
+; CHECK-NEXT:    fcvtzu w2, d1
+; CHECK-NEXT:    csel w1, w3, w8, lo
+; CHECK-NEXT:    mov d1, v1.d[1]
+; CHECK-NEXT:    mov v3.s[1], w17
+; CHECK-NEXT:    fmov s4, w1
+; CHECK-NEXT:    mov v0.s[2], w9
+; CHECK-NEXT:    mov v2.s[2], w16
+; CHECK-NEXT:    cmp w2, #255
+; CHECK-NEXT:    mov v4.s[1], w14
+; CHECK-NEXT:    fcvtzu w13, d1
+; CHECK-NEXT:    csel w9, w2, w8, lo
+; CHECK-NEXT:    mov v3.s[2], w0
+; CHECK-NEXT:    mov v0.s[3], w12
+; CHECK-NEXT:    mov v2.s[3], w11
+; CHECK-NEXT:    mov v4.s[2], w9
+; CHECK-NEXT:    cmp w13, #255
+; CHECK-NEXT:    csel w8, w13, w8, lo
+; CHECK-NEXT:    mov v3.s[3], w10
+; CHECK-NEXT:    uzp1 v0.8h, v2.8h, v0.8h
+; CHECK-NEXT:    mov v4.s[3], w8
+; CHECK-NEXT:    uzp1 v1.8h, v4.8h, v3.8h
+; CHECK-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    ret
     %x = call <16 x i8> @llvm.fptoui.sat.v16f64.v16i8(<16 x double> %f)
     ret <16 x i8> %x
@@ -2906,46 +2885,44 @@ define <16 x i8> @test_unsigned_v16f64_v16i8(<16 x double> %f) {
 define <8 x i16> @test_unsigned_v8f64_v8i16(<8 x double> %f) {
 ; CHECK-LABEL: test_unsigned_v8f64_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d4, v3.d[1]
-; CHECK-NEXT:    mov d5, v2.d[1]
-; CHECK-NEXT:    mov w10, #65535 // =0xffff
-; CHECK-NEXT:    fcvtzu w9, d3
-; CHECK-NEXT:    mov d3, v1.d[1]
-; CHECK-NEXT:    fcvtzu w12, d2
-; CHECK-NEXT:    fcvtzu w14, d1
+; CHECK-NEXT:    mov d4, v2.d[1]
+; CHECK-NEXT:    mov d5, v3.d[1]
+; CHECK-NEXT:    mov w11, #65535 // =0xffff
+; CHECK-NEXT:    fcvtzu w9, d2
+; CHECK-NEXT:    fcvtzu w10, d3
+; CHECK-NEXT:    fcvtzu w14, d0
+; CHECK-NEXT:    fcvtzu w15, d1
 ; CHECK-NEXT:    fcvtzu w8, d4
 ; CHECK-NEXT:    mov d4, v0.d[1]
-; CHECK-NEXT:    fcvtzu w11, d5
-; CHECK-NEXT:    fcvtzu w13, d3
-; CHECK-NEXT:    cmp w8, w10
-; CHECK-NEXT:    fcvtzu w15, d4
-; CHECK-NEXT:    csel w8, w8, w10, lo
-; CHECK-NEXT:    cmp w9, w10
-; CHECK-NEXT:    csel w9, w9, w10, lo
-; CHECK-NEXT:    cmp w11, w10
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    csel w9, w11, w10, lo
-; CHECK-NEXT:    cmp w12, w10
-; CHECK-NEXT:    fcvtzu w11, d0
-; CHECK-NEXT:    mov v4.s[1], w8
-; CHECK-NEXT:    csel w8, w12, w10, lo
-; CHECK-NEXT:    cmp w13, w10
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    csel w8, w13, w10, lo
-; CHECK-NEXT:    cmp w14, w10
-; CHECK-NEXT:    mov v3.s[1], w9
-; CHECK-NEXT:    csel w9, w14, w10, lo
-; CHECK-NEXT:    cmp w15, w10
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    csel w9, w15, w10, lo
-; CHECK-NEXT:    cmp w11, w10
-; CHECK-NEXT:    mov v2.s[1], w8
-; CHECK-NEXT:    csel w8, w11, w10, lo
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    adrp x8, .LCPI84_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI84_0]
-; CHECK-NEXT:    mov v1.s[1], w9
-; CHECK-NEXT:    tbl v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b
+; CHECK-NEXT:    fcvtzu w12, d5
+; CHECK-NEXT:    mov d0, v1.d[1]
+; CHECK-NEXT:    cmp w8, w11
+; CHECK-NEXT:    fcvtzu w13, d4
+; CHECK-NEXT:    csel w8, w8, w11, lo
+; CHECK-NEXT:    cmp w9, w11
+; CHECK-NEXT:    csel w9, w9, w11, lo
+; CHECK-NEXT:    cmp w10, w11
+; CHECK-NEXT:    csel w10, w10, w11, lo
+; CHECK-NEXT:    cmp w12, w11
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    csel w12, w12, w11, lo
+; CHECK-NEXT:    cmp w13, w11
+; CHECK-NEXT:    csel w13, w13, w11, lo
+; CHECK-NEXT:    cmp w14, w11
+; CHECK-NEXT:    csel w14, w14, w11, lo
+; CHECK-NEXT:    mov v1.s[1], w8
+; CHECK-NEXT:    cmp w15, w11
+; CHECK-NEXT:    fmov s2, w14
+; CHECK-NEXT:    fcvtzu w8, d0
+; CHECK-NEXT:    csel w9, w15, w11, lo
+; CHECK-NEXT:    mov v2.s[1], w13
+; CHECK-NEXT:    mov v1.s[2], w10
+; CHECK-NEXT:    cmp w8, w11
+; CHECK-NEXT:    csel w8, w8, w11, lo
+; CHECK-NEXT:    mov v2.s[2], w9
+; CHECK-NEXT:    mov v1.s[3], w12
+; CHECK-NEXT:    mov v2.s[3], w8
+; CHECK-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
 ; CHECK-NEXT:    ret
     %x = call <8 x i16> @llvm.fptoui.sat.v8f64.v8i16(<8 x double> %f)
     ret <8 x i16> %x
@@ -2954,83 +2931,81 @@ define <8 x i16> @test_unsigned_v8f64_v8i16(<8 x double> %f) {
 define <16 x i16> @test_unsigned_v16f64_v16i16(<16 x double> %f) {
 ; CHECK-LABEL: test_unsigned_v16f64_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d16, v3.d[1]
-; CHECK-NEXT:    mov d17, v2.d[1]
+; CHECK-NEXT:    mov d16, v2.d[1]
+; CHECK-NEXT:    mov d17, v3.d[1]
 ; CHECK-NEXT:    mov w8, #65535 // =0xffff
-; CHECK-NEXT:    fcvtzu w9, d3
+; CHECK-NEXT:    fcvtzu w10, d2
+; CHECK-NEXT:    mov d2, v0.d[1]
+; CHECK-NEXT:    fcvtzu w11, d3
 ; CHECK-NEXT:    mov d3, v1.d[1]
-; CHECK-NEXT:    fcvtzu w10, d1
-; CHECK-NEXT:    mov d1, v0.d[1]
-; CHECK-NEXT:    fcvtzu w11, d2
-; CHECK-NEXT:    fcvtzu w12, d0
+; CHECK-NEXT:    fcvtzu w15, d0
+; CHECK-NEXT:    fcvtzu w16, d1
 ; CHECK-NEXT:    mov d0, v7.d[1]
-; CHECK-NEXT:    mov d2, v6.d[1]
-; CHECK-NEXT:    fcvtzu w14, d7
-; CHECK-NEXT:    fcvtzu w13, d16
-; CHECK-NEXT:    fcvtzu w16, d17
-; CHECK-NEXT:    fcvtzu w15, d6
-; CHECK-NEXT:    fcvtzu w17, d3
-; CHECK-NEXT:    mov d6, v5.d[1]
-; CHECK-NEXT:    mov d3, v4.d[1]
-; CHECK-NEXT:    fcvtzu w18, d1
-; CHECK-NEXT:    cmp w13, w8
-; CHECK-NEXT:    csel w13, w13, w8, lo
+; CHECK-NEXT:    mov d1, v4.d[1]
+; CHECK-NEXT:    fcvtzu w18, d6
+; CHECK-NEXT:    fcvtzu w9, d16
+; CHECK-NEXT:    fcvtzu w12, d17
+; CHECK-NEXT:    mov d16, v6.d[1]
+; CHECK-NEXT:    fcvtzu w13, d2
+; CHECK-NEXT:    fcvtzu w0, d7
+; CHECK-NEXT:    fcvtzu w3, d4
+; CHECK-NEXT:    fcvtzu w1, d0
+; CHECK-NEXT:    fcvtzu w2, d1
+; CHECK-NEXT:    mov d1, v5.d[1]
 ; CHECK-NEXT:    cmp w9, w8
-; CHECK-NEXT:    csel w9, w9, w8, lo
-; CHECK-NEXT:    cmp w16, w8
-; CHECK-NEXT:    fmov s19, w9
-; CHECK-NEXT:    csel w9, w16, w8, lo
-; CHECK-NEXT:    cmp w11, w8
-; CHECK-NEXT:    fcvtzu w16, d0
-; CHECK-NEXT:    csel w11, w11, w8, lo
-; CHECK-NEXT:    cmp w17, w8
-; CHECK-NEXT:    mov v19.s[1], w13
-; CHECK-NEXT:    csel w13, w17, w8, lo
+; CHECK-NEXT:    fcvtzu w17, d16
+; CHECK-NEXT:    csel w14, w9, w8, lo
 ; CHECK-NEXT:    cmp w10, w8
 ; CHECK-NEXT:    csel w10, w10, w8, lo
-; CHECK-NEXT:    cmp w18, w8
-; CHECK-NEXT:    fmov s18, w11
-; CHECK-NEXT:    csel w11, w18, w8, lo
+; CHECK-NEXT:    cmp w11, w8
+; CHECK-NEXT:    csel w9, w11, w8, lo
 ; CHECK-NEXT:    cmp w12, w8
-; CHECK-NEXT:    fcvtzu w17, d2
+; CHECK-NEXT:    fcvtzu w11, d3
 ; CHECK-NEXT:    csel w12, w12, w8, lo
-; CHECK-NEXT:    cmp w16, w8
-; CHECK-NEXT:    fcvtzu w18, d6
-; CHECK-NEXT:    mov v18.s[1], w9
-; CHECK-NEXT:    csel w9, w16, w8, lo
-; CHECK-NEXT:    cmp w14, w8
-; CHECK-NEXT:    fmov s17, w10
-; CHECK-NEXT:    csel w10, w14, w8, lo
-; CHECK-NEXT:    fcvtzu w16, d5
-; CHECK-NEXT:    fmov s23, w10
-; CHECK-NEXT:    cmp w17, w8
-; CHECK-NEXT:    fcvtzu w14, d3
-; CHECK-NEXT:    csel w10, w17, w8, lo
+; CHECK-NEXT:    cmp w13, w8
+; CHECK-NEXT:    fmov s0, w10
+; CHECK-NEXT:    csel w13, w13, w8, lo
 ; CHECK-NEXT:    cmp w15, w8
-; CHECK-NEXT:    fcvtzu w17, d4
-; CHECK-NEXT:    mov v17.s[1], w13
-; CHECK-NEXT:    mov v23.s[1], w9
-; CHECK-NEXT:    csel w9, w15, w8, lo
-; CHECK-NEXT:    cmp w18, w8
-; CHECK-NEXT:    fmov s22, w9
-; CHECK-NEXT:    csel w9, w18, w8, lo
+; CHECK-NEXT:    csel w15, w15, w8, lo
 ; CHECK-NEXT:    cmp w16, w8
-; CHECK-NEXT:    fmov s16, w12
-; CHECK-NEXT:    mov v22.s[1], w10
-; CHECK-NEXT:    csel w10, w16, w8, lo
-; CHECK-NEXT:    cmp w14, w8
-; CHECK-NEXT:    fmov s21, w10
-; CHECK-NEXT:    csel w10, w14, w8, lo
+; CHECK-NEXT:    csel w16, w16, w8, lo
+; CHECK-NEXT:    cmp w11, w8
+; CHECK-NEXT:    mov v0.s[1], w14
+; CHECK-NEXT:    csel w11, w11, w8, lo
 ; CHECK-NEXT:    cmp w17, w8
-; CHECK-NEXT:    csel w8, w17, w8, lo
-; CHECK-NEXT:    mov v16.s[1], w11
-; CHECK-NEXT:    mov v21.s[1], w9
-; CHECK-NEXT:    fmov s20, w8
-; CHECK-NEXT:    adrp x8, .LCPI85_0
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI85_0]
-; CHECK-NEXT:    mov v20.s[1], w10
-; CHECK-NEXT:    tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b
-; CHECK-NEXT:    tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b
+; CHECK-NEXT:    fmov s2, w15
+; CHECK-NEXT:    csel w17, w17, w8, lo
+; CHECK-NEXT:    cmp w18, w8
+; CHECK-NEXT:    csel w18, w18, w8, lo
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    csel w0, w0, w8, lo
+; CHECK-NEXT:    cmp w1, w8
+; CHECK-NEXT:    fmov s3, w18
+; CHECK-NEXT:    csel w10, w1, w8, lo
+; CHECK-NEXT:    cmp w2, w8
+; CHECK-NEXT:    mov v2.s[1], w13
+; CHECK-NEXT:    csel w14, w2, w8, lo
+; CHECK-NEXT:    cmp w3, w8
+; CHECK-NEXT:    fcvtzu w2, d5
+; CHECK-NEXT:    csel w1, w3, w8, lo
+; CHECK-NEXT:    mov v3.s[1], w17
+; CHECK-NEXT:    fcvtzu w13, d1
+; CHECK-NEXT:    fmov s4, w1
+; CHECK-NEXT:    mov v0.s[2], w9
+; CHECK-NEXT:    mov v2.s[2], w16
+; CHECK-NEXT:    cmp w2, w8
+; CHECK-NEXT:    mov v4.s[1], w14
+; CHECK-NEXT:    csel w9, w2, w8, lo
+; CHECK-NEXT:    mov v3.s[2], w0
+; CHECK-NEXT:    cmp w13, w8
+; CHECK-NEXT:    mov v0.s[3], w12
+; CHECK-NEXT:    csel w8, w13, w8, lo
+; CHECK-NEXT:    mov v2.s[3], w11
+; CHECK-NEXT:    mov v4.s[2], w9
+; CHECK-NEXT:    mov v3.s[3], w10
+; CHECK-NEXT:    uzp1 v0.8h, v2.8h, v0.8h
+; CHECK-NEXT:    mov v4.s[3], w8
+; CHECK-NEXT:    uzp1 v1.8h, v4.8h, v3.8h
 ; CHECK-NEXT:    ret
     %x = call <16 x i16> @llvm.fptoui.sat.v16f64.v16i16(<16 x double> %f)
     ret <16 x i16> %x
diff --git a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
index 3f590226c47150..8030387d327da0 100644
--- a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
@@ -37,20 +37,8 @@ entry:
 define <8 x i8> @extract_2_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: extract_2_v4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    mov w9, v0.s[2]
-; CHECK-NEXT:    mov w10, v0.s[3]
-; CHECK-NEXT:    mov v0.b[1], w8
-; CHECK-NEXT:    mov w8, v1.s[1]
-; CHECK-NEXT:    mov v0.b[2], w9
-; CHECK-NEXT:    mov w9, v1.s[2]
-; CHECK-NEXT:    mov v0.b[3], w10
-; CHECK-NEXT:    mov v0.b[4], v1.b[0]
-; CHECK-NEXT:    mov v0.b[5], w8
-; CHECK-NEXT:    mov w8, v1.s[3]
-; CHECK-NEXT:    mov v0.b[6], w9
-; CHECK-NEXT:    mov v0.b[7], w8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
 entry:
   %a0 = extractelement <4 x i32> %a, i32 0
diff --git a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
index 0ef64789ad9724..b689f58f768475 100644
--- a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
+++ b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
@@ -675,57 +675,33 @@ define <16 x i8> @insert4_v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8
   ret <16 x i8> %i16
 }
 
-
-; CHECK: .LCPI16_0:
-; CHECK: 	.byte	0
-; CHECK: 	.byte	1
-; CHECK: 	.byte	4
-; CHECK: 	.byte	5
-; CHECK: 	.byte	16
-; CHECK: 	.byte	17
-; CHECK: 	.byte	20
-; CHECK: 	.byte	21
-; CHECK: 	.byte	32
-; CHECK: 	.byte	33
-; CHECK: 	.byte	36
-; CHECK: 	.byte	37
-; CHECK: 	.byte	48
-; CHECK: 	.byte	49
-; CHECK: 	.byte	52
-; CHECK: 	.byte	53
 define <16 x i16> @test(<2 x double> %l213, <2 x double> %l231, <2 x double> %l249, <2 x double> %l267, <2 x double> %l285, <2 x double> %l303, <2 x double> %l321, <2 x double> %l339) {
 ; CHECK-LABEL: test:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintm v0.2d, v0.2d
-; CHECK-NEXT:    frintm v4.2d, v4.2d
-; CHECK-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-NEXT:    frintm v1.2d, v1.2d
-; CHECK-NEXT:    frintm v5.2d, v5.2d
 ; CHECK-NEXT:    frintm v2.2d, v2.2d
-; CHECK-NEXT:    frintm v6.2d, v6.2d
 ; CHECK-NEXT:    frintm v3.2d, v3.2d
+; CHECK-NEXT:    frintm v4.2d, v4.2d
+; CHECK-NEXT:    frintm v5.2d, v5.2d
+; CHECK-NEXT:    frintm v6.2d, v6.2d
 ; CHECK-NEXT:    frintm v7.2d, v7.2d
 ; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-NEXT:    fcvtzs v4.2d, v4.2d
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
-; CHECK-NEXT:    fcvtzs v5.2d, v5.2d
 ; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
-; CHECK-NEXT:    fcvtzs v6.2d, v6.2d
 ; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
+; CHECK-NEXT:    fcvtzs v4.2d, v4.2d
+; CHECK-NEXT:    fcvtzs v5.2d, v5.2d
+; CHECK-NEXT:    fcvtzs v6.2d, v6.2d
 ; CHECK-NEXT:    fcvtzs v7.2d, v7.2d
-; CHECK-NEXT:    xtn v16.2s, v0.2d
-; CHECK-NEXT:    xtn v20.2s, v4.2d
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI16_0]
-; CHECK-NEXT:    xtn v17.2s, v1.2d
-; CHECK-NEXT:    xtn v21.2s, v5.2d
-; CHECK-NEXT:    xtn v18.2s, v2.2d
-; CHECK-NEXT:    xtn v22.2s, v6.2d
-; CHECK-NEXT:    xtn v19.2s, v3.2d
-; CHECK-NEXT:    xtn v23.2s, v7.2d
-; CHECK-NEXT:    tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
-; CHECK-NEXT:    tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b
-; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v2.8h
-; CHECK-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    uzp1 v3.4s, v4.4s, v5.4s
+; CHECK-NEXT:    uzp1 v1.4s, v6.4s, v7.4s
+; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
+; CHECK-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
+; CHECK-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-NEXT:    uzp2 v1.8h, v2.8h, v1.8h
 ; CHECK-NEXT:    ret
   %l214 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l213)
   %l215 = fptosi <2 x double> %l214 to <2 x i16>
diff --git a/llvm/test/CodeGen/AArch64/trunc-v1i64.ll b/llvm/test/CodeGen/AArch64/trunc-v1i64.ll
index 19efd2fafd5e24..36adcc382a1b33 100644
--- a/llvm/test/CodeGen/AArch64/trunc-v1i64.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-v1i64.ll
@@ -79,4 +79,4 @@ define i1 @test_v1i1_1(<1 x i64> %in0) {
 ; CHECK: and w0, [[REG]], #0x1
   %2 = extractelement <1 x i1> %1, i32 0
   ret i1 %2
-}
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
index 380bdbcc7f7408..611940546bc1a2 100644
--- a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
+++ b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
@@ -9,9 +9,8 @@ define <8 x i8> @float_to_i8(ptr %in) {
 ; CHECK-NEXT:    fadd v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    uzp1 v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %l = load <8 x float>, ptr %in
   %scale = fmul <8 x float> %l, <float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>



More information about the llvm-commits mailing list