[llvm] [AArch64] Avoid GPR trip when moving truncated i32 vector elements (PR #114541)

via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 19 10:04:32 PST 2024


https://github.com/SpencerAbson updated https://github.com/llvm/llvm-project/pull/114541

>From 5cd15cb492a6c023e424ef232ebf1ae6ccfe9def Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Thu, 31 Oct 2024 14:46:09 +0000
Subject: [PATCH 1/8] [AArch64] Avoid GPR trip when moving truncated i32 vector
 elements

This patch introduces ISel patterns that use INS (element) at the truncated element size when moving an
i32 element, truncated from i64, between vectors, avoiding a round trip through a GPR.
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  53 +++++++--
 .../CodeGen/AArch64/neon-ins-trunc-elt.ll     | 106 ++++++++++++++++++
 2 files changed, 149 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 250d6144f75318..f92532c27fe154 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -683,6 +683,22 @@ def topbitsallzero64: PatLeaf<(i64 GPR64:$src), [{
          CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(64, 63));
   }]>;
 
+def VectorIndexStoH : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+
+def VectorIndexDtoH : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
+}]>;
+
+def VectorIndexStoB : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
+}]>;
+
+def VectorIndexHtoB : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+
 // Node definitions.
 def AArch64adrp          : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
 def AArch64adr           : SDNode<"AArch64ISD::ADR", SDTIntUnaryOp, []>;
@@ -7281,6 +7297,33 @@ defm : Neon_INS_elt_pattern<v8i16,  v4i16,  nxv8i16,  i32,  VectorIndexH, INSvi1
 defm : Neon_INS_elt_pattern<v4i32,  v2i32,  nxv4i32,  i32,  VectorIndexS, INSvi32lane>;
 defm : Neon_INS_elt_pattern<v2i64,  v1i64,  nxv2i64,  i64,  VectorIndexD, INSvi64lane>;
 
+// Rmove GPR trip when inserting extracted truncated i64 into vector of i32.
+// From another NEON vector
+def : Pat<(v4i32 (vector_insert v4i32:$src,
+                    (i32 (trunc (i64 (vector_extract v2i64:$Rn, (i64 imm:$Immn))))),
+                    (i64 imm:$Immd))),
+          (INSvi32lane V128:$src, imm:$Immd,  V128:$Rn, (VectorIndexHtoB imm:$Immn))>;
+
+def : Pat<(v2i32 (vector_insert v2i32:$src,
+                    (i32 (trunc (i64 (vector_extract v2i64:$Rn, (i64 imm:$Immn))))),
+                    (i64 imm:$Immd))),
+          (EXTRACT_SUBREG (INSvi32lane (SUBREG_TO_REG (i64 0), V64:$src, dsub),
+                                       imm:$Immd,  V128:$Rn, (VectorIndexHtoB imm:$Immn)),
+                          dsub)>;
+// From the bottom 128b of an SVE vector
+def : Pat<(v4i32 (vector_insert v4i32:$Rn,
+                    (i32 (trunc (i64 (vector_extract nxv2i64:$Rm, (i64 VectorIndexD:$Immn))))),
+                    (i64 imm:$Immd))),
+          (INSvi32lane V128:$Rn, imm:$Immd, (EXTRACT_SUBREG nxv2i64:$Rm, zsub), (VectorIndexHtoB VectorIndexD:$Immn))>;
+
+def : Pat<(v2i32 (vector_insert v2i32:$Rn,
+                    (i32 (trunc (i64 (vector_extract nxv2i64:$Rm, (i64 VectorIndexD:$Immn))))),
+                    (i64 imm:$Immd))),
+          (EXTRACT_SUBREG
+              (INSvi32lane (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immd,
+                    (EXTRACT_SUBREG nxv2i64:$Rm, zsub), (VectorIndexHtoB VectorIndexD:$Immn)),
+              dsub)>;
+
 // Insert from bitcast
 // vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0)
 def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), (i64 imm:$Immd))),
@@ -8700,16 +8743,6 @@ class Ld1Lane64IdxOpPat<SDPatternOperator scalar_load, Operand VecIndex,
                 (IdxOp VecIndex:$idx), GPR64sp:$Rn),
             dsub)>;
 
-def VectorIndexStoH : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
-}]>;
-def VectorIndexStoB : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
-}]>;
-def VectorIndexHtoB : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
-}]>;
-
 def : Ld1Lane128IdxOpPat<extloadi16, VectorIndexS, v4i32, i32, LD1i16, VectorIndexStoH>;
 def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexS, v4i32, i32, LD1i8, VectorIndexStoB>;
 def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexH, v8i16, i32, LD1i8, VectorIndexHtoB>;
diff --git a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
new file mode 100644
index 00000000000000..ab11f12d53790f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+neon < %s | FileCheck %s
+
+; Inserting a truncated (i64 to i32) element from the bottom 128-bits of any vector type into a NEON vector should use INS (element) of the
+; truncated size to avoid pointless GPR trips.
+
+
+define <2 x i32> @test_s_trunc_d_lane0(<2 x i32> %a, <1 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_d_lane0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+    %c = extractelement <1 x i64> %b, i32 0
+    %d = trunc i64 %c to i32
+    %e = insertelement <2 x i32> %a, i32 %d, i64 0
+    ret <2 x i32> %e
+}
+
+define <2 x i32> @test_s_trunc_d_qlane1(<2 x i32> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_d_qlane1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v0.s[0], v1.s[2]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+    %c = extractelement <2 x i64> %b, i32 1
+    %d = trunc i64 %c to i32
+    %e = insertelement <2 x i32> %a, i32 %d, i64 0
+    ret <2 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_d_lane0(<4 x i32> %a, <1 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_d_lane0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-NEXT:    ret
+    %c = extractelement <1 x i64> %b, i32 0
+    %d = trunc i64 %c to i32
+    %e = insertelement <4 x i32> %a, i32 %d, i64 0
+    ret <4 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_d_qlane1(<4 x i32> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_d_qlane1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.s[3], v1.s[2]
+; CHECK-NEXT:    ret
+    %c = extractelement <2 x i64> %b, i32 1
+    %d = trunc i64 %c to i32
+    %e = insertelement <4 x i32> %a, i32 %d, i64 3
+    ret <4 x i32> %e
+}
+
+; ---- From the bottom 128b of an SVE vector
+
+define <2 x i32> @test_s_trunc_dsve_lane0(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+    %c = extractelement <vscale x 2 x i64> %b, i32 0
+    %d = trunc i64 %c to i32
+    %e = insertelement <2 x i32> %a, i32 %d, i64 0
+    ret <2 x i32> %e
+}
+
+define <2 x i32> @test_s_trunc_dsve_lane1(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v0.s[1], v1.s[2]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+    %c = extractelement <vscale x 2 x i64> %b, i32 1
+    %d = trunc i64 %c to i32
+    %e = insertelement <2 x i32> %a, i32 %d, i64 1
+    ret <2 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_dsve_lane0(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-NEXT:    ret
+    %c = extractelement <vscale x 2 x i64> %b, i32 0
+    %d = trunc i64 %c to i32
+    %e = insertelement <4 x i32> %a, i32 %d, i64 0
+    ret <4 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_dsve_lane1(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.s[3], v1.s[2]
+; CHECK-NEXT:    ret
+    %c = extractelement <vscale x 2 x i64> %b, i32 1
+    %d = trunc i64 %c to i32
+    %e = insertelement <4 x i32> %a, i32 %d, i64 3
+    ret <4 x i32> %e
+}

>From cb225b4b2704af18acc4ce8e78ac430b5b27cfc9 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Fri, 1 Nov 2024 13:25:07 +0000
Subject: [PATCH 2/8] [NFC] Remove unused node operator

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f92532c27fe154..4f00a7a187723b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -686,15 +686,9 @@ def topbitsallzero64: PatLeaf<(i64 GPR64:$src), [{
 def VectorIndexStoH : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
 }]>;
-
-def VectorIndexDtoH : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
-}]>;
-
 def VectorIndexStoB : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
 }]>;
-
 def VectorIndexHtoB : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
 }]>;

>From d25df8c9c8adabf741f476d988b7d0b19ef32519 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Fri, 1 Nov 2024 13:54:01 +0000
Subject: [PATCH 3/8] [NFC] Add negative tests for bound of SVE extraction

---
 .../CodeGen/AArch64/neon-ins-trunc-elt.ll     | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
index ab11f12d53790f..4c189cd2886e8a 100644
--- a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
+++ b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
@@ -83,6 +83,22 @@ define <2 x i32> @test_s_trunc_dsve_lane1(<2 x i32> %a, <vscale x 2 x i64> %b) {
     ret <2 x i32> %e
 }
 
+; (negative test) Extracted element is not within V-register.
+define <2 x i32> @test_s_trunc_dsve_lane2(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, z1.d[2]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+    %c = extractelement <vscale x 2 x i64> %b, i32 2
+    %d = trunc i64 %c to i32
+    %e = insertelement <2 x i32> %a, i32 %d, i64 1
+    ret <2 x i32> %e
+}
+
 define <4 x i32> @test_qs_trunc_dsve_lane0(<4 x i32> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: test_qs_trunc_dsve_lane0:
 ; CHECK:       // %bb.0:
@@ -104,3 +120,17 @@ define <4 x i32> @test_qs_trunc_dsve_lane1(<4 x i32> %a, <vscale x 2 x i64> %b)
     %e = insertelement <4 x i32> %a, i32 %d, i64 3
     ret <4 x i32> %e
 }
+
+; (negative test) Extracted element is not within V-register.
+define <4 x i32> @test_qs_trunc_dsve_lane2(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, z1.d[2]
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    mov v0.s[3], w8
+; CHECK-NEXT:    ret
+    %c = extractelement <vscale x 2 x i64> %b, i32 2
+    %d = trunc i64 %c to i32
+    %e = insertelement <4 x i32> %a, i32 %d, i64 3
+    ret <4 x i32> %e
+}

>From a8c6043da985f1171cb0f2ec3843971d4e1fd27e Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Tue, 5 Nov 2024 19:13:55 +0000
Subject: [PATCH 4/8] Implement DAG combine for truncating extracted i64

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 37 ++++++++--
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   | 73 +++++--------------
 .../aarch64-neon-vector-insert-uaddlv.ll      | 24 +++---
 llvm/test/CodeGen/AArch64/fmlal-loreg.ll      |  3 +-
 .../CodeGen/AArch64/neon-ins-trunc-elt.ll     |  8 +-
 llvm/test/CodeGen/AArch64/sve-doublereduct.ll |  9 +--
 .../CodeGen/AArch64/sve-extract-element.ll    |  4 +-
 .../AArch64/sve-extract-fixed-vector.ll       |  7 +-
 .../AArch64/sve-fixed-length-int-reduce.ll    | 45 ++++--------
 llvm/test/CodeGen/AArch64/sve-int-reduce.ll   | 12 +--
 .../CodeGen/AArch64/sve-split-int-reduce.ll   | 18 ++---
 ...-streaming-mode-fixed-length-int-reduce.ll | 27 +++----
 ...-streaming-mode-fixed-length-reductions.ll |  6 +-
 .../test/CodeGen/AArch64/sve-vecreduce-dot.ll |  3 +-
 .../CodeGen/AArch64/uaddlv-vaddlp-combine.ll  |  3 +-
 .../AArch64/vec-combine-compare-to-bitmask.ll |  2 +-
 16 files changed, 116 insertions(+), 165 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e8c02c09879747..5bca2743a61cae 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20735,8 +20735,9 @@ static SDValue performBuildVectorCombine(SDNode *N,
   return SDValue();
 }
 
-static SDValue performTruncateCombine(SDNode *N,
-                                      SelectionDAG &DAG) {
+static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
+                                      TargetLowering::DAGCombinerInfo &DCI) {
+  SDLoc DL(N);
   EVT VT = N->getValueType(0);
   SDValue N0 = N->getOperand(0);
   if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
@@ -20744,8 +20745,34 @@ static SDValue performTruncateCombine(SDNode *N,
     SDValue Op = N0.getOperand(0);
     if (VT.getScalarType() == MVT::i32 &&
         N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
-      Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
-    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
+      Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
+    return DAG.getNode(N0.getOpcode(), DL, VT, Op);
+  }
+
+  // Performing the following combine produces a preferable form for ISEL.
+  // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2))
+  if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue Op = N0.getOperand(0);
+    SDValue ExtractIndexNode = N0.getOperand(1);
+    if (!isa<ConstantSDNode>(ExtractIndexNode))
+      return SDValue();
+
+    // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
+    // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
+    assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
+           "Unexpected legalisation result!");
+
+    MVT CastVT;
+    EVT SrcVectorType = Op.getValueType();
+    assert(SrcVectorType.getScalarType() == MVT::i64);
+    unsigned ExtractIndex =
+        cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
+
+    CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
+
+    Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
+                       DAG.getConstant(ExtractIndex * 2, DL, MVT::i64));
   }
 
   return SDValue();
@@ -25992,7 +26019,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::BUILD_VECTOR:
     return performBuildVectorCombine(N, DCI, DAG);
   case ISD::TRUNCATE:
-    return performTruncateCombine(N, DAG);
+    return performTruncateCombine(N, DAG, DCI);
   case AArch64ISD::ANDS:
     return performFlagSettingCombine(N, DCI, ISD::AND);
   case AArch64ISD::ADC:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 4f00a7a187723b..833adc274f7efc 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -683,16 +683,6 @@ def topbitsallzero64: PatLeaf<(i64 GPR64:$src), [{
          CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(64, 63));
   }]>;
 
-def VectorIndexStoH : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
-}]>;
-def VectorIndexStoB : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
-}]>;
-def VectorIndexHtoB : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
-}]>;
-
 // Node definitions.
 def AArch64adrp          : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
 def AArch64adr           : SDNode<"AArch64ISD::ADR", SDTIntUnaryOp, []>;
@@ -6997,24 +6987,18 @@ defm : DUPWithTruncPats<v16i8,  v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
 defm : DUPWithTruncPats<v16i8,  v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
 defm : DUPWithTruncPats<v8i16,  v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
 
-multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
-                               SDNodeXForm IdxXFORM> {
-  def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
-                                                         imm:$idx))))),
-            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
-
-  def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn),
-                                                       imm:$idx))))),
-            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
-}
+class DUPWithTrunci64Pat<ValueType ResVT, Instruction DUP,
+                         SDNodeXForm IdxXFORM>
+  : Pat<(ResVT (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
+        (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
 
-defm : DUPWithTrunci64Pats<v8i8,  DUPv8i8lane,   VecIndex_x8>;
-defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane,  VecIndex_x4>;
-defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane,  VecIndex_x2>;
+def : DUPWithTrunci64Pat<v8i8,  DUPv8i8lane,   VecIndex_x4>;
+def : DUPWithTrunci64Pat<v4i16, DUPv4i16lane,  VecIndex_x2>;
+def : DUPWithTrunci64Pat<v2i32, DUPv2i32lane,  NOOP_SDNodeXForm>;
 
-defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
-defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
-defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
+def : DUPWithTrunci64Pat<v16i8, DUPv16i8lane, VecIndex_x4>;
+def : DUPWithTrunci64Pat<v8i16, DUPv8i16lane, VecIndex_x2>;
+def : DUPWithTrunci64Pat<v4i32, DUPv4i32lane, NOOP_SDNodeXForm>;
 
 // SMOV and UMOV definitions, with some extra patterns for convenience
 defm SMOV : SMov;
@@ -7291,33 +7275,6 @@ defm : Neon_INS_elt_pattern<v8i16,  v4i16,  nxv8i16,  i32,  VectorIndexH, INSvi1
 defm : Neon_INS_elt_pattern<v4i32,  v2i32,  nxv4i32,  i32,  VectorIndexS, INSvi32lane>;
 defm : Neon_INS_elt_pattern<v2i64,  v1i64,  nxv2i64,  i64,  VectorIndexD, INSvi64lane>;
 
-// Rmove GPR trip when inserting extracted truncated i64 into vector of i32.
-// From another NEON vector
-def : Pat<(v4i32 (vector_insert v4i32:$src,
-                    (i32 (trunc (i64 (vector_extract v2i64:$Rn, (i64 imm:$Immn))))),
-                    (i64 imm:$Immd))),
-          (INSvi32lane V128:$src, imm:$Immd,  V128:$Rn, (VectorIndexHtoB imm:$Immn))>;
-
-def : Pat<(v2i32 (vector_insert v2i32:$src,
-                    (i32 (trunc (i64 (vector_extract v2i64:$Rn, (i64 imm:$Immn))))),
-                    (i64 imm:$Immd))),
-          (EXTRACT_SUBREG (INSvi32lane (SUBREG_TO_REG (i64 0), V64:$src, dsub),
-                                       imm:$Immd,  V128:$Rn, (VectorIndexHtoB imm:$Immn)),
-                          dsub)>;
-// From the bottom 128b of an SVE vector
-def : Pat<(v4i32 (vector_insert v4i32:$Rn,
-                    (i32 (trunc (i64 (vector_extract nxv2i64:$Rm, (i64 VectorIndexD:$Immn))))),
-                    (i64 imm:$Immd))),
-          (INSvi32lane V128:$Rn, imm:$Immd, (EXTRACT_SUBREG nxv2i64:$Rm, zsub), (VectorIndexHtoB VectorIndexD:$Immn))>;
-
-def : Pat<(v2i32 (vector_insert v2i32:$Rn,
-                    (i32 (trunc (i64 (vector_extract nxv2i64:$Rm, (i64 VectorIndexD:$Immn))))),
-                    (i64 imm:$Immd))),
-          (EXTRACT_SUBREG
-              (INSvi32lane (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immd,
-                    (EXTRACT_SUBREG nxv2i64:$Rm, zsub), (VectorIndexHtoB VectorIndexD:$Immn)),
-              dsub)>;
-
 // Insert from bitcast
 // vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0)
 def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), (i64 imm:$Immd))),
@@ -8737,6 +8694,16 @@ class Ld1Lane64IdxOpPat<SDPatternOperator scalar_load, Operand VecIndex,
                 (IdxOp VecIndex:$idx), GPR64sp:$Rn),
             dsub)>;
 
+def VectorIndexStoH : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+def VectorIndexStoB : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
+}]>;
+def VectorIndexHtoB : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+
 def : Ld1Lane128IdxOpPat<extloadi16, VectorIndexS, v4i32, i32, LD1i16, VectorIndexStoH>;
 def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexS, v4i32, i32, LD1i8, VectorIndexStoB>;
 def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexH, v8i16, i32, LD1i8, VectorIndexHtoB>;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index 2e165179381820..1b7bc128d6332e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -384,9 +384,9 @@ define void @insert_vec_v4i16_uaddlv_from_v4i32(ptr %0) {
 ; CHECK-NEXT:    movi.2d v1, #0000000000000000
 ; CHECK-NEXT:    uaddlv.4s d0, v0
 ; CHECK-NEXT:    mov.h v1[0], v0[0]
-; CHECK-NEXT:    ushll.4s v0, v1, #0
-; CHECK-NEXT:    ucvtf.4s v0, v0
-; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ushll.4s v1, v1, #0
+; CHECK-NEXT:    ucvtf.4s v1, v1
+; CHECK-NEXT:    str q1, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -403,13 +403,13 @@ define void @insert_vec_v16i16_uaddlv_from_v4i32(ptr %0) {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
 ; CHECK-NEXT:    movi.2d v1, #0000000000000000
+; CHECK-NEXT:    movi.2d v2, #0000000000000000
 ; CHECK-NEXT:    uaddlv.4s d0, v0
+; CHECK-NEXT:    stp q2, q2, [x0, #32]
 ; CHECK-NEXT:    mov.h v1[0], v0[0]
-; CHECK-NEXT:    movi.2d v0, #0000000000000000
 ; CHECK-NEXT:    ushll.4s v1, v1, #0
-; CHECK-NEXT:    stp q0, q0, [x0, #32]
 ; CHECK-NEXT:    ucvtf.4s v1, v1
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    stp q1, q2, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -430,9 +430,9 @@ define void @insert_vec_v8i8_uaddlv_from_v4i32(ptr %0) {
 ; CHECK-NEXT:    uaddlv.4s d0, v0
 ; CHECK-NEXT:    mov.h v1[0], v0[0]
 ; CHECK-NEXT:    bic.4h v1, #255, lsl #8
-; CHECK-NEXT:    ushll.4s v0, v1, #0
-; CHECK-NEXT:    ucvtf.4s v0, v0
-; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ushll.4s v1, v1, #0
+; CHECK-NEXT:    ucvtf.4s v1, v1
+; CHECK-NEXT:    str q1, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -449,14 +449,14 @@ define void @insert_vec_v16i8_uaddlv_from_v4i32(ptr %0) {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
 ; CHECK-NEXT:    movi.2d v1, #0000000000000000
+; CHECK-NEXT:    movi.2d v2, #0000000000000000
 ; CHECK-NEXT:    uaddlv.4s d0, v0
+; CHECK-NEXT:    stp q2, q2, [x0, #32]
 ; CHECK-NEXT:    mov.h v1[0], v0[0]
-; CHECK-NEXT:    movi.2d v0, #0000000000000000
 ; CHECK-NEXT:    bic.4h v1, #255, lsl #8
-; CHECK-NEXT:    stp q0, q0, [x0, #32]
 ; CHECK-NEXT:    ushll.4s v1, v1, #0
 ; CHECK-NEXT:    ucvtf.4s v1, v1
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    stp q1, q2, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
index 31ead890ba8ac7..6116c6421c4b12 100644
--- a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
+++ b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
@@ -11,9 +11,10 @@ define <4 x float> @test(ptr %lhs_panel, ptr %rhs_panel, <4 x float> %a) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset b8, -16
 ; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    ldr q16, [x1]
 ; CHECK-NEXT:    ldr q8, [x0]
-; CHECK-NEXT:    lsr x9, x8, #32
+; CHECK-NEXT:    lsr x8, x8, #32
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    //NO_APP
diff --git a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
index 4c189cd2886e8a..0d58fc59c2c319 100644
--- a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
+++ b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
@@ -87,9 +87,9 @@ define <2 x i32> @test_s_trunc_dsve_lane1(<2 x i32> %a, <vscale x 2 x i64> %b) {
 define <2 x i32> @test_s_trunc_dsve_lane2(<2 x i32> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: test_s_trunc_dsve_lane2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.d, z1.d[2]
+; CHECK-NEXT:    mov z1.s, z1.s[4]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    mov v0.s[1], w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
@@ -125,8 +125,8 @@ define <4 x i32> @test_qs_trunc_dsve_lane1(<4 x i32> %a, <vscale x 2 x i64> %b)
 define <4 x i32> @test_qs_trunc_dsve_lane2(<4 x i32> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: test_qs_trunc_dsve_lane2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.d, z1.d[2]
-; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    mov z1.s, z1.s[4]
+; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    mov v0.s[3], w8
 ; CHECK-NEXT:    ret
     %c = extractelement <vscale x 2 x i64> %b, i32 2
diff --git a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
index 7bc31d44bb6547..b813b8f84ba16b 100644
--- a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
+++ b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
@@ -91,8 +91,7 @@ define i32 @add_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    add z0.s, z0.s, z2.s
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %r1 = call i32 @llvm.vector.reduce.add.i32.nxv8i32(<vscale x 8 x i32> %a)
   %r2 = call i32 @llvm.vector.reduce.add.i32.nxv4i32(<vscale x 4 x i32> %b)
@@ -112,8 +111,7 @@ define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-NEXT:    add z1.h, z1.h, z3.h
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    uaddv d0, p0, z0.h
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %ae = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
   %be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
@@ -139,8 +137,7 @@ define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-NEXT:    add z1.h, z2.h, z5.h
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    uaddv d0, p0, z0.h
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %ae = zext <vscale x 32 x i8> %a to <vscale x 32 x i16>
   %be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
index 6d4f5963881e58..939c7e43100189 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
@@ -644,8 +644,8 @@ define i1 @test_lane4_2xi1(<vscale x 2 x i1> %a) #0 {
 ; CHECK-LABEL: test_lane4_2xi1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.d, p0/z, #1 // =0x1
-; CHECK-NEXT:    mov z0.d, z0.d[4]
-; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    mov z0.s, z0.s[8]
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
   %b = extractelement <vscale x 2 x i1> %a, i32 4
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
index 518e3573b5edd3..965af2a745afd4 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
@@ -238,11 +238,8 @@ define <2 x i1> @extract_v2i1_nxv2i1(<vscale x 2 x i1> %inmask) {
 ; CHECK-LABEL: extract_v2i1_nxv2i1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.d, p0/z, #1 // =0x1
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    mov x8, v0.d[1]
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    mov v0.s[1], v0.s[2]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %mask = call <2 x i1> @llvm.vector.extract.v2i1.nxv2i1(<vscale x 2 x i1> %inmask, i64 0)
   ret <2 x i1> %mask
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
index 752c2cd34bfe48..be19e9ef5e86f8 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
@@ -37,8 +37,7 @@ define i8 @uaddv_v32i8(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    ptrue p0.b, vl32
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    uaddv d0, p0, z0.b
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op)
@@ -54,8 +53,7 @@ define i8 @uaddv_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    add z0.b, z1.b, z0.b
 ; VBITS_GE_256-NEXT:    uaddv d0, p0, z0.b
-; VBITS_GE_256-NEXT:    fmov x0, d0
-; VBITS_GE_256-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_256-NEXT:    fmov w0, s0
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: uaddv_v64i8:
@@ -63,8 +61,7 @@ define i8 @uaddv_v64i8(ptr %a) #0 {
 ; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
 ; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    uaddv d0, p0, z0.b
-; VBITS_GE_512-NEXT:    fmov x0, d0
-; VBITS_GE_512-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_512-NEXT:    fmov w0, s0
 ; VBITS_GE_512-NEXT:    ret
   %op = load <64 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %op)
@@ -77,8 +74,7 @@ define i8 @uaddv_v128i8(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-NEXT:    ptrue p0.b, vl128
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    uaddv d0, p0, z0.b
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %op = load <128 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %op)
@@ -91,8 +87,7 @@ define i8 @uaddv_v256i8(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p0.b, vl256
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    uaddv d0, p0, z0.b
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %op = load <256 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %op)
@@ -127,8 +122,7 @@ define i16 @uaddv_v16i16(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    ptrue p0.h, vl16
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    uaddv d0, p0, z0.h
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op)
@@ -144,8 +138,7 @@ define i16 @uaddv_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    add z0.h, z1.h, z0.h
 ; VBITS_GE_256-NEXT:    uaddv d0, p0, z0.h
-; VBITS_GE_256-NEXT:    fmov x0, d0
-; VBITS_GE_256-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_256-NEXT:    fmov w0, s0
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: uaddv_v32i16:
@@ -153,8 +146,7 @@ define i16 @uaddv_v32i16(ptr %a) #0 {
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
 ; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    uaddv d0, p0, z0.h
-; VBITS_GE_512-NEXT:    fmov x0, d0
-; VBITS_GE_512-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_512-NEXT:    fmov w0, s0
 ; VBITS_GE_512-NEXT:    ret
   %op = load <32 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %op)
@@ -167,8 +159,7 @@ define i16 @uaddv_v64i16(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-NEXT:    ptrue p0.h, vl64
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    uaddv d0, p0, z0.h
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %op = load <64 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %op)
@@ -181,8 +172,7 @@ define i16 @uaddv_v128i16(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p0.h, vl128
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    uaddv d0, p0, z0.h
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %op = load <128 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %op)
@@ -217,8 +207,7 @@ define i32 @uaddv_v8i32(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    ptrue p0.s, vl8
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op)
@@ -234,8 +223,7 @@ define i32 @uaddv_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    add z0.s, z1.s, z0.s
 ; VBITS_GE_256-NEXT:    uaddv d0, p0, z0.s
-; VBITS_GE_256-NEXT:    fmov x0, d0
-; VBITS_GE_256-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_256-NEXT:    fmov w0, s0
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: uaddv_v16i32:
@@ -243,8 +231,7 @@ define i32 @uaddv_v16i32(ptr %a) #0 {
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
 ; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    uaddv d0, p0, z0.s
-; VBITS_GE_512-NEXT:    fmov x0, d0
-; VBITS_GE_512-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_512-NEXT:    fmov w0, s0
 ; VBITS_GE_512-NEXT:    ret
   %op = load <16 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %op)
@@ -257,8 +244,7 @@ define i32 @uaddv_v32i32(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-NEXT:    ptrue p0.s, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %op = load <32 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %op)
@@ -271,8 +257,7 @@ define i32 @uaddv_v64i32(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p0.s, vl64
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %op = load <64 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
index 8c1b5225b7f257..6ec18477fe1a0c 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
@@ -146,8 +146,7 @@ define i8 @uaddv_nxv16i8(<vscale x 16 x i8> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    uaddv d0, p0, z0.b
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %a)
   ret i8 %res
@@ -158,8 +157,7 @@ define i16 @uaddv_nxv8i16(<vscale x 8 x i16> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    uaddv d0, p0, z0.h
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> %a)
   ret i16 %res
@@ -170,8 +168,7 @@ define i32 @uaddv_nxv4i32(<vscale x 4 x i32> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %a)
   ret i32 %res
@@ -422,8 +419,7 @@ define i8 @uaddv_nxv12i8(<vscale x 12 x i8> %a) {
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
 ; CHECK-NEXT:    uaddv d0, p0, z0.b
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.add.nxv12i8(<vscale x 12 x i8> %a)
   ret i8 %res
diff --git a/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
index dd7b15ef5ee6f4..90383b43d58128 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
@@ -33,8 +33,7 @@ define i32 @orv_nxv2i32(<vscale x 2 x i32> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    orv d0, p0, z0.d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32> %a)
   ret i32 %res
@@ -61,8 +60,7 @@ define i16 @xorv_nxv2i16(<vscale x 2 x i16> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    eorv d0, p0, z0.d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.xor.nxv2i16(<vscale x 2 x i16> %a)
   ret i16 %res
@@ -87,8 +85,7 @@ define i16 @uaddv_nxv4i16(<vscale x 4 x i16> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %a)
   ret i16 %res
@@ -100,8 +97,7 @@ define i16 @uaddv_nxv16i16(<vscale x 16 x i16> %a) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    uaddv d0, p0, z0.h
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.add.nxv16i16(<vscale x 16 x i16> %a)
   ret i16 %res
@@ -115,8 +111,7 @@ define i32 @uaddv_nxv16i32(<vscale x 16 x i32> %a) {
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %a)
   ret i32 %res
@@ -130,8 +125,7 @@ define i32 @umin_nxv2i32(<vscale x 2 x i32> %a) {
 ; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    uminv d0, p0, z0.d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umin.nxv2i32(<vscale x 2 x i32> %a)
   ret i32 %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
index 92a67cba55f7a1..244dcc734bd7c2 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
@@ -15,8 +15,7 @@ define i8 @uaddv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    ptrue p0.b, vl8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uaddv d0, p0, z0.b
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v8i8:
@@ -51,8 +50,7 @@ define i8 @uaddv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    ptrue p0.b, vl16
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uaddv d0, p0, z0.b
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v16i8:
@@ -103,8 +101,7 @@ define i8 @uaddv_v32i8(ptr %a) {
 ; CHECK-NEXT:    ptrue p0.b, vl16
 ; CHECK-NEXT:    add z0.b, z1.b, z0.b
 ; CHECK-NEXT:    uaddv d0, p0, z0.b
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v32i8:
@@ -188,8 +185,7 @@ define i16 @uaddv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uaddv d0, p0, z0.h
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v4i16:
@@ -216,8 +212,7 @@ define i16 @uaddv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    ptrue p0.h, vl8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uaddv d0, p0, z0.h
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v8i16:
@@ -252,8 +247,7 @@ define i16 @uaddv_v16i16(ptr %a) {
 ; CHECK-NEXT:    ptrue p0.h, vl8
 ; CHECK-NEXT:    add z0.h, z1.h, z0.h
 ; CHECK-NEXT:    uaddv d0, p0, z0.h
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v16i16:
@@ -305,8 +299,7 @@ define i32 @uaddv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v2i32:
@@ -328,8 +321,7 @@ define i32 @uaddv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v4i32:
@@ -353,8 +345,7 @@ define i32 @uaddv_v8i32(ptr %a) {
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    add z0.s, z1.s, z0.s
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v8i32:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
index 00a15f4bcd6394..688537704a6f73 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
@@ -66,8 +66,7 @@ define i32 @reduce_uaddv_v16i8(<32 x i8> %a) {
 ; STREAMING-SVE-NEXT:    add z0.s, z2.s, z0.s
 ; STREAMING-SVE-NEXT:    add z0.s, z1.s, z0.s
 ; STREAMING-SVE-NEXT:    uaddv d0, p0, z0.s
-; STREAMING-SVE-NEXT:    fmov x0, d0
-; STREAMING-SVE-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; STREAMING-SVE-NEXT:    fmov w0, s0
 ; STREAMING-SVE-NEXT:    ret
   %1 = zext <32 x i8> %a to <32 x i32>
   %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
@@ -134,8 +133,7 @@ define i32 @reduce_saddv_v16i8(<32 x i8> %a) {
 ; STREAMING-SVE-NEXT:    add z0.s, z2.s, z0.s
 ; STREAMING-SVE-NEXT:    add z0.s, z1.s, z0.s
 ; STREAMING-SVE-NEXT:    uaddv d0, p0, z0.s
-; STREAMING-SVE-NEXT:    fmov x0, d0
-; STREAMING-SVE-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; STREAMING-SVE-NEXT:    fmov w0, s0
 ; STREAMING-SVE-NEXT:    ret
   %1 = sext <32 x i8> %a to <32 x i32>
   %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
diff --git a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
index 91f8f5c2c90d84..6af26067cd6d6d 100644
--- a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
@@ -42,8 +42,7 @@ define i32 @test(<vscale x 32 x i8> %bin.rdx, <vscale x 32 x i8> %bin.rdx2)  {
 ; CHECK-NEXT:    add z1.s, z3.s, z1.s
 ; CHECK-NEXT:    add z0.s, z1.s, z0.s
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %a = sext <vscale x 32 x i8> %bin.rdx to <vscale x 32 x i32>
   %b = sext <vscale x 32 x i8> %bin.rdx2 to <vscale x 32 x i32>
diff --git a/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll b/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
index f0856c43daf1d9..e6905f687ad9a2 100644
--- a/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
+++ b/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
@@ -5,8 +5,7 @@ define i32 @uaddlv_uaddlp_v8i16(<8 x i16> %0) {
 ; CHECK-LABEL: uaddlv_uaddlp_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uaddlv s0, v0.8h
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %2 = tail call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %0)
   %3 = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> %2)
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 557aa010b3a7d9..7f2eefe5ed72f6 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -81,7 +81,7 @@ define i8 @convert_to_bitmask2(<2 x i64> %vec) {
 ; CHECK-NEXT:    ldr q1, [x8, lCPI3_0@PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v1, v0
 ; CHECK-NEXT:    addp.2d d0, v0
-; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x3
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:    .loh AdrpLdr Lloh6, Lloh7

>From 0d76f393eb1ad65df2f602d97fba8f6ec2ab7aa8 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Thu, 14 Nov 2024 18:16:19 +0000
Subject: [PATCH 5/8] [NFC] Refactor DUP patterns

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td | 48 ++++++++-------------
 1 file changed, 18 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 833adc274f7efc..7f28dc18ad0ef1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6954,6 +6954,12 @@ def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
 def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
           (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
 
+// Also covers DUP (truncate i64 to i32)
+def : Pat<(v2i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
+          (DUPv2i32lane V128:$Rn, imm:$idx)>;
+def : Pat<(v4i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
+          (DUPv4i32lane V128:$Rn, imm:$idx)>;
+
 // If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
 // instruction even if the types don't match: we just have to remap the lane
 // carefully. N.b. this trick only applies to truncations.
@@ -6967,38 +6973,20 @@ def VecIndex_x8 : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(8 * N->getZExtValue(), SDLoc(N), MVT::i64);
 }]>;
 
-multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
-                            ValueType Src128VT, ValueType ScalVT,
-                            Instruction DUP, SDNodeXForm IdxXFORM> {
-  def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
-                                                     imm:$idx)))),
-            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
-
-  def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
-                                                     imm:$idx)))),
-            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
-}
-
-defm : DUPWithTruncPats<v8i8,   v4i16, v8i16, i32, DUPv8i8lane,  VecIndex_x2>;
-defm : DUPWithTruncPats<v8i8,   v2i32, v4i32, i32, DUPv8i8lane,  VecIndex_x4>;
-defm : DUPWithTruncPats<v4i16,  v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
-
-defm : DUPWithTruncPats<v16i8,  v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
-defm : DUPWithTruncPats<v16i8,  v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
-defm : DUPWithTruncPats<v8i16,  v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
-
-class DUPWithTrunci64Pat<ValueType ResVT, Instruction DUP,
-                         SDNodeXForm IdxXFORM>
-  : Pat<(ResVT (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
+class DUPWithTruncPat<ValueType ResVT, ValueType SrcVT, ValueType ScalVT,
+           Instruction DUP, SDNodeXForm IdxXFORM>
+  : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (SrcVT V128:$Rn), imm:$idx)))),
         (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
 
-def : DUPWithTrunci64Pat<v8i8,  DUPv8i8lane,   VecIndex_x4>;
-def : DUPWithTrunci64Pat<v4i16, DUPv4i16lane,  VecIndex_x2>;
-def : DUPWithTrunci64Pat<v2i32, DUPv2i32lane,  NOOP_SDNodeXForm>;
-
-def : DUPWithTrunci64Pat<v16i8, DUPv16i8lane, VecIndex_x4>;
-def : DUPWithTrunci64Pat<v8i16, DUPv8i16lane, VecIndex_x2>;
-def : DUPWithTrunci64Pat<v4i32, DUPv4i32lane, NOOP_SDNodeXForm>;
+// DUP (truncate i16 to i8)
+def : DUPWithTruncPat<v8i8,  v8i16, i32, DUPv8i8lane,  VecIndex_x2>;
+def : DUPWithTruncPat<v16i8, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
+// DUP (truncate i32/i64 to i8)
+def : DUPWithTruncPat<v8i8,  v4i32, i32, DUPv8i8lane,  VecIndex_x4>;
+def : DUPWithTruncPat<v16i8, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
+// DUP (truncate i32/i64 to i16)
+def : DUPWithTruncPat<v4i16, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
+def : DUPWithTruncPat<v8i16, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
 
 // SMOV and UMOV definitions, with some extra patterns for convenience
 defm SMOV : SMov;

>From 0534b1e8f16d78e06c28445488a5b9d897e39e84 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Wed, 18 Dec 2024 20:38:27 +0000
Subject: [PATCH 6/8] Fixup and restrict combine where the extracted element
 has other uses

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 14 ++++++++++----
 llvm/test/CodeGen/AArch64/fmlal-loreg.ll        |  3 +--
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5bca2743a61cae..844ce328b76634 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20762,13 +20762,19 @@ static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
     assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
            "Unexpected legalisation result!");
 
-    MVT CastVT;
     EVT SrcVectorType = Op.getValueType();
-    assert(SrcVectorType.getScalarType() == MVT::i64);
+    // We also assume that SrcVectorType cannot be a V64 (see LowerEXTRACT_VECTOR_ELT).
+    assert((SrcVectorType.getScalarType() == MVT::i64 &&
+            SrcVectorType != MVT::v1i64) && "Unexpected legalisation result!");
+
+    // If the i64 we are extacting has uses other than this truncation, the upper half
+    // of this value must still be live so we prefer to extract it all at once.
+    if(!N0.hasOneUse())
+      return SDValue();
+
     unsigned ExtractIndex =
         cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
-
-    CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
+    MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
 
     Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
diff --git a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
index 6116c6421c4b12..31ead890ba8ac7 100644
--- a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
+++ b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
@@ -11,10 +11,9 @@ define <4 x float> @test(ptr %lhs_panel, ptr %rhs_panel, <4 x float> %a) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset b8, -16
 ; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    ldr q16, [x1]
 ; CHECK-NEXT:    ldr q8, [x0]
-; CHECK-NEXT:    lsr x8, x8, #32
+; CHECK-NEXT:    lsr x9, x8, #32
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    //NO_APP

>From b245dfd130a9d0a3d4421db4afeed6bd3b52c162 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Wed, 18 Dec 2024 20:56:48 +0000
Subject: [PATCH 7/8] [NFC] clang-format

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 844ce328b76634..30a7945bf09e9c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20763,13 +20763,16 @@ static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
            "Unexpected legalisation result!");
 
     EVT SrcVectorType = Op.getValueType();
-    // We also assume that SrcVectorType cannot be a V64 (see LowerEXTRACT_VECTOR_ELT).
+    // We also assume that SrcVectorType cannot be a V64 (see
+    // LowerEXTRACT_VECTOR_ELT).
     assert((SrcVectorType.getScalarType() == MVT::i64 &&
-            SrcVectorType != MVT::v1i64) && "Unexpected legalisation result!");
+            SrcVectorType != MVT::v1i64) &&
+           "Unexpected legalisation result!");
 
-    // If the i64 we are extacting has uses other than this truncation, the upper half
-    // of this value must still be live so we prefer to extract it all at once.
-    if(!N0.hasOneUse())
+    // If the i64 we are extacting has uses other than this truncation, the
+    // upper half of this value must still be live so we prefer to extract it
+    // all at once.
+    if (!N0.hasOneUse())
       return SDValue();
 
     unsigned ExtractIndex =

>From c830d925b4b854b356b6ce76f1df6cfdb6374fd0 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Thu, 19 Dec 2024 16:51:15 +0000
Subject: [PATCH 8/8] [NFC] Fixups from review

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 30a7945bf09e9c..4a4f58b330b63d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20751,7 +20751,8 @@ static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
 
   // Performing the following combine produces a preferable form for ISEL.
   // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2))
-  if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+  if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      N0.hasOneUse()) {
     SDValue Op = N0.getOperand(0);
     SDValue ExtractIndexNode = N0.getOperand(1);
     if (!isa<ConstantSDNode>(ExtractIndexNode))
@@ -20765,23 +20766,16 @@ static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
     EVT SrcVectorType = Op.getValueType();
     // We also assume that SrcVectorType cannot be a V64 (see
     // LowerEXTRACT_VECTOR_ELT).
-    assert((SrcVectorType.getScalarType() == MVT::i64 &&
-            SrcVectorType != MVT::v1i64) &&
+    assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
            "Unexpected legalisation result!");
 
-    // If the i64 we are extacting has uses other than this truncation, the
-    // upper half of this value must still be live so we prefer to extract it
-    // all at once.
-    if (!N0.hasOneUse())
-      return SDValue();
-
     unsigned ExtractIndex =
         cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
     MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
 
     Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
-                       DAG.getConstant(ExtractIndex * 2, DL, MVT::i64));
+                       DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
   }
 
   return SDValue();



More information about the llvm-commits mailing list