[llvm] c2bd5c2 - [AArch64] Avoid GPR trip when moving truncated i32 vector elements (#114541)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 20 03:07:40 PST 2024
Author: SpencerAbson
Date: 2024-12-20T11:07:37Z
New Revision: c2bd5c25b3634e55089d34afe922aa38eee743e2
URL: https://github.com/llvm/llvm-project/commit/c2bd5c25b3634e55089d34afe922aa38eee743e2
DIFF: https://github.com/llvm/llvm-project/commit/c2bd5c25b3634e55089d34afe922aa38eee743e2.diff
LOG: [AArch64] Avoid GPR trip when moving truncated i32 vector elements (#114541)
This patch implements a DAG combine whereby
```
a: v2i64 = ...
b: i64 = extract_vector_elt a, Constant:i64<n>
c: i32 = truncate b
```
becomes
```
a: v2i64 = ...
b: v4i32 = AArch64ISD::NVCAST a
c: i32 = extract_vector_elt b, Constant:i64<2n>
```
The primary goal of this work is to enable the use of [INS
(element)](https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en)
when moving a truncated i32 element between vectors. This combine
canonicalises the structure of the DAG for all legal instances of the
pattern above (by removing the explicit `trunc` operator in this
specific case), allowing us to take advantage of existing ISEL patterns
for this behavior.
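For illustration, here is a minimal IR function (adapted from the new `neon-ins-trunc-elt.ll` test below) that hits this pattern; the function name and the pre-patch assembly in the comments are illustrative sketches rather than verbatim compiler output:
```
; Moves lane 1 of a v2i64 into lane 0 of a v2i32 as a truncated i32.
; Without the combine, the element takes a GPR round trip, roughly:
;   mov  x8, v1.d[1]
;   mov  v0.s[0], w8
; With the combine, the same move lowers to a single INS (element):
;   mov  v0.s[0], v1.s[2]
define <2 x i32> @ins_trunc_example(<2 x i32> %a, <2 x i64> %b) {
  %c = extractelement <2 x i64> %b, i32 1
  %d = trunc i64 %c to i32
  %e = insertelement <2 x i32> %a, i32 %d, i64 0
  ret <2 x i32> %e
}
```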
Added:
llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
llvm/test/CodeGen/AArch64/sve-doublereduct.ll
llvm/test/CodeGen/AArch64/sve-extract-element.ll
llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
llvm/test/CodeGen/AArch64/sve-int-reduce.ll
llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
llvm/test/CodeGen/AArch64/vecreduce-bool.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index eb007c25ac89e3..5b941d173ec2b9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20945,8 +20945,9 @@ static SDValue performBuildVectorCombine(SDNode *N,
return SDValue();
}
-static SDValue performTruncateCombine(SDNode *N,
- SelectionDAG &DAG) {
+static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
@@ -20954,8 +20955,37 @@ static SDValue performTruncateCombine(SDNode *N,
SDValue Op = N0.getOperand(0);
if (VT.getScalarType() == MVT::i32 &&
N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
- Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
- return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
+ Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
+ return DAG.getNode(N0.getOpcode(), DL, VT, Op);
+ }
+
+ // Performing the following combine produces a preferable form for ISEL.
+ // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2))
+ if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ N0.hasOneUse()) {
+ SDValue Op = N0.getOperand(0);
+ SDValue ExtractIndexNode = N0.getOperand(1);
+ if (!isa<ConstantSDNode>(ExtractIndexNode))
+ return SDValue();
+
+ // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
+ // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
+ assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
+ "Unexpected legalisation result!");
+
+ EVT SrcVectorType = Op.getValueType();
+ // We also assume that SrcVectorType cannot be a V64 (see
+ // LowerEXTRACT_VECTOR_ELT).
+ assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
+ "Unexpected legalisation result!");
+
+ unsigned ExtractIndex =
+ cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
+ MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
+
+ Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
+ DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
}
return SDValue();
@@ -26258,7 +26288,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::BUILD_VECTOR:
return performBuildVectorCombine(N, DCI, DAG);
case ISD::TRUNCATE:
- return performTruncateCombine(N, DAG);
+ return performTruncateCombine(N, DAG, DCI);
case AArch64ISD::ANDS:
return performFlagSettingCombine(N, DCI, ISD::AND);
case AArch64ISD::ADC:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index d015cc15581ad0..b37f4a08755c5f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6977,6 +6977,12 @@ def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
(DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
+// Also covers DUP (truncate i64 to i32)
+def : Pat<(v2i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
+ (DUPv2i32lane V128:$Rn, imm:$idx)>;
+def : Pat<(v4i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
+ (DUPv4i32lane V128:$Rn, imm:$idx)>;
+
// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
// instruction even if the types don't match: we just have to remap the lane
// carefully. N.b. this trick only applies to truncations.
@@ -6990,44 +6996,20 @@ def VecIndex_x8 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(8 * N->getZExtValue(), SDLoc(N), MVT::i64);
}]>;
-multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
- ValueType Src128VT, ValueType ScalVT,
- Instruction DUP, SDNodeXForm IdxXFORM> {
- def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
- imm:$idx)))),
- (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
-
- def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
- imm:$idx)))),
- (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
-}
-
-defm : DUPWithTruncPats<v8i8, v4i16, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
-defm : DUPWithTruncPats<v8i8, v2i32, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
-defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
-
-defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
-defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
-defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
-
-multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
- SDNodeXForm IdxXFORM> {
- def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
- imm:$idx))))),
- (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
-
- def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn),
- imm:$idx))))),
- (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
-}
-
-defm : DUPWithTrunci64Pats<v8i8, DUPv8i8lane, VecIndex_x8>;
-defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>;
-defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>;
-
-defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
-defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
-defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
+class DUPWithTruncPat<ValueType ResVT, ValueType SrcVT, ValueType ScalVT,
+ Instruction DUP, SDNodeXForm IdxXFORM>
+ : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (SrcVT V128:$Rn), imm:$idx)))),
+ (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
+
+// DUP (truncate i16 to i8)
+def : DUPWithTruncPat<v8i8, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
+def : DUPWithTruncPat<v16i8, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
+// DUP (truncate i32/64 to i8)
+def : DUPWithTruncPat<v8i8, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
+def : DUPWithTruncPat<v16i8, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
+// DUP (truncate i32/i64 to i16)
+def : DUPWithTruncPat<v4i16, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
+def : DUPWithTruncPat<v8i16, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
// SMOV and UMOV definitions, with some extra patterns for convenience
defm SMOV : SMov;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index 2e165179381820..1b7bc128d6332e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -384,9 +384,9 @@ define void @insert_vec_v4i16_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: mov.h v1[0], v0[0]
-; CHECK-NEXT: ushll.4s v0, v1, #0
-; CHECK-NEXT: ucvtf.4s v0, v0
-; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: ucvtf.4s v1, v1
+; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: ret
entry:
@@ -403,13 +403,13 @@ define void @insert_vec_v16i16_uaddlv_from_v4i32(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: movi.2d v1, #0000000000000000
+; CHECK-NEXT: movi.2d v2, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
+; CHECK-NEXT: stp q2, q2, [x0, #32]
; CHECK-NEXT: mov.h v1[0], v0[0]
-; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: ushll.4s v1, v1, #0
-; CHECK-NEXT: stp q0, q0, [x0, #32]
; CHECK-NEXT: ucvtf.4s v1, v1
-; CHECK-NEXT: stp q1, q0, [x0]
+; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret
entry:
@@ -430,9 +430,9 @@ define void @insert_vec_v8i8_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: bic.4h v1, #255, lsl #8
-; CHECK-NEXT: ushll.4s v0, v1, #0
-; CHECK-NEXT: ucvtf.4s v0, v0
-; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: ucvtf.4s v1, v1
+; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: ret
entry:
@@ -449,14 +449,14 @@ define void @insert_vec_v16i8_uaddlv_from_v4i32(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: movi.2d v1, #0000000000000000
+; CHECK-NEXT: movi.2d v2, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
+; CHECK-NEXT: stp q2, q2, [x0, #32]
; CHECK-NEXT: mov.h v1[0], v0[0]
-; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: bic.4h v1, #255, lsl #8
-; CHECK-NEXT: stp q0, q0, [x0, #32]
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ucvtf.4s v1, v1
-; CHECK-NEXT: stp q1, q0, [x0]
+; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
new file mode 100644
index 00000000000000..0d58fc59c2c319
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+neon < %s | FileCheck %s
+
+; Inserting a truncated (i64 to i32) element from the bottom 128-bits of any vector type into a NEON vector should use INS (element) of the
+; truncated size to avoid pointless GPR trips.
+
+
+define <2 x i32> @test_s_trunc_d_lane0(<2 x i32> %a, <1 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_d_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <1 x i64> %b, i32 0
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 0
+ ret <2 x i32> %e
+}
+
+define <2 x i32> @test_s_trunc_d_qlane1(<2 x i32> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_d_qlane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[0], v1.s[2]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <2 x i64> %b, i32 1
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 0
+ ret <2 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_d_lane0(<4 x i32> %a, <1 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_d_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: ret
+ %c = extractelement <1 x i64> %b, i32 0
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 0
+ ret <4 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_d_qlane1(<4 x i32> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_d_qlane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[3], v1.s[2]
+; CHECK-NEXT: ret
+ %c = extractelement <2 x i64> %b, i32 1
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 3
+ ret <4 x i32> %e
+}
+
+; ---- From the bottom 128b of an SVE vector
+
+define <2 x i32> @test_s_trunc_dsve_lane0(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 0
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 0
+ ret <2 x i32> %e
+}
+
+define <2 x i32> @test_s_trunc_dsve_lane1(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[1], v1.s[2]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 1
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 1
+ ret <2 x i32> %e
+}
+
+; (negative test) Extracted element is not within V-register.
+define <2 x i32> @test_s_trunc_dsve_lane2(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.s, z1.s[4]
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: mov v0.s[1], w8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 2
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 1
+ ret <2 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_dsve_lane0(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 0
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 0
+ ret <4 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_dsve_lane1(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[3], v1.s[2]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 1
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 3
+ ret <4 x i32> %e
+}
+
+; (negative test) Extracted element is not within V-register.
+define <4 x i32> @test_qs_trunc_dsve_lane2(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.s, z1.s[4]
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: mov v0.s[3], w8
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 2
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 3
+ ret <4 x i32> %e
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
index 7bc31d44bb6547..b813b8f84ba16b 100644
--- a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
+++ b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
@@ -91,8 +91,7 @@ define i32 @add_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: add z0.s, z0.s, z2.s
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.add.i32.nxv8i32(<vscale x 8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.add.i32.nxv4i32(<vscale x 4 x i32> %b)
@@ -112,8 +111,7 @@ define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-NEXT: add z1.h, z1.h, z3.h
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%ae = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
@@ -139,8 +137,7 @@ define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-NEXT: add z1.h, z2.h, z5.h
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%ae = zext <vscale x 32 x i8> %a to <vscale x 32 x i16>
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
index 6d4f5963881e58..939c7e43100189 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
@@ -644,8 +644,8 @@ define i1 @test_lane4_2xi1(<vscale x 2 x i1> %a) #0 {
; CHECK-LABEL: test_lane4_2xi1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
-; CHECK-NEXT: mov z0.d, z0.d[4]
-; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: mov z0.s, z0.s[8]
+; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
%b = extractelement <vscale x 2 x i1> %a, i32 4
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
index 518e3573b5edd3..965af2a745afd4 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
@@ -238,11 +238,8 @@ define <2 x i1> @extract_v2i1_nxv2i1(<vscale x 2 x i1> %inmask) {
; CHECK-LABEL: extract_v2i1_nxv2i1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: mov x8, v0.d[1]
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: mov v0.s[1], w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: mov v0.s[1], v0.s[2]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%mask = call <2 x i1> @llvm.vector.extract.v2i1.nxv2i1(<vscale x 2 x i1> %inmask, i64 0)
ret <2 x i1> %mask
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
index 752c2cd34bfe48..be19e9ef5e86f8 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
@@ -37,8 +37,7 @@ define i8 @uaddv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <32 x i8>, ptr %a
%res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op)
@@ -54,8 +53,7 @@ define i8 @uaddv_v64i8(ptr %a) #0 {
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: add z0.b, z1.b, z0.b
; VBITS_GE_256-NEXT: uaddv d0, p0, z0.b
-; VBITS_GE_256-NEXT: fmov x0, d0
-; VBITS_GE_256-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: uaddv_v64i8:
@@ -63,8 +61,7 @@ define i8 @uaddv_v64i8(ptr %a) #0 {
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: uaddv d0, p0, z0.b
-; VBITS_GE_512-NEXT: fmov x0, d0
-; VBITS_GE_512-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
%op = load <64 x i8>, ptr %a
%res = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %op)
@@ -77,8 +74,7 @@ define i8 @uaddv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <128 x i8>, ptr %a
%res = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %op)
@@ -91,8 +87,7 @@ define i8 @uaddv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <256 x i8>, ptr %a
%res = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %op)
@@ -127,8 +122,7 @@ define i16 @uaddv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <16 x i16>, ptr %a
%res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op)
@@ -144,8 +138,7 @@ define i16 @uaddv_v32i16(ptr %a) #0 {
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: add z0.h, z1.h, z0.h
; VBITS_GE_256-NEXT: uaddv d0, p0, z0.h
-; VBITS_GE_256-NEXT: fmov x0, d0
-; VBITS_GE_256-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: uaddv_v32i16:
@@ -153,8 +146,7 @@ define i16 @uaddv_v32i16(ptr %a) #0 {
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: uaddv d0, p0, z0.h
-; VBITS_GE_512-NEXT: fmov x0, d0
-; VBITS_GE_512-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, ptr %a
%res = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %op)
@@ -167,8 +159,7 @@ define i16 @uaddv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <64 x i16>, ptr %a
%res = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %op)
@@ -181,8 +172,7 @@ define i16 @uaddv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <128 x i16>, ptr %a
%res = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %op)
@@ -217,8 +207,7 @@ define i32 @uaddv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <8 x i32>, ptr %a
%res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op)
@@ -234,8 +223,7 @@ define i32 @uaddv_v16i32(ptr %a) #0 {
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: add z0.s, z1.s, z0.s
; VBITS_GE_256-NEXT: uaddv d0, p0, z0.s
-; VBITS_GE_256-NEXT: fmov x0, d0
-; VBITS_GE_256-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: uaddv_v16i32:
@@ -243,8 +231,7 @@ define i32 @uaddv_v16i32(ptr %a) #0 {
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: uaddv d0, p0, z0.s
-; VBITS_GE_512-NEXT: fmov x0, d0
-; VBITS_GE_512-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, ptr %a
%res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %op)
@@ -257,8 +244,7 @@ define i32 @uaddv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <32 x i32>, ptr %a
%res = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %op)
@@ -271,8 +257,7 @@ define i32 @uaddv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <64 x i32>, ptr %a
%res = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
index 8c1b5225b7f257..6ec18477fe1a0c 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
@@ -146,8 +146,7 @@ define i8 @uaddv_nxv16i8(<vscale x 16 x i8> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: uaddv d0, p0, z0.b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %a)
ret i8 %res
@@ -158,8 +157,7 @@ define i16 @uaddv_nxv8i16(<vscale x 8 x i16> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> %a)
ret i16 %res
@@ -170,8 +168,7 @@ define i32 @uaddv_nxv4i32(<vscale x 4 x i32> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %a)
ret i32 %res
@@ -422,8 +419,7 @@ define i8 @uaddv_nxv12i8(<vscale x 12 x i8> %a) {
; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
; CHECK-NEXT: uaddv d0, p0, z0.b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.add.nxv12i8(<vscale x 12 x i8> %a)
ret i8 %res
diff --git a/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
index dd7b15ef5ee6f4..90383b43d58128 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
@@ -33,8 +33,7 @@ define i32 @orv_nxv2i32(<vscale x 2 x i32> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: orv d0, p0, z0.d
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32> %a)
ret i32 %res
@@ -61,8 +60,7 @@ define i16 @xorv_nxv2i16(<vscale x 2 x i16> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: eorv d0, p0, z0.d
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.xor.nxv2i16(<vscale x 2 x i16> %a)
ret i16 %res
@@ -87,8 +85,7 @@ define i16 @uaddv_nxv4i16(<vscale x 4 x i16> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %a)
ret i16 %res
@@ -100,8 +97,7 @@ define i16 @uaddv_nxv16i16(<vscale x 16 x i16> %a) {
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.add.nxv16i16(<vscale x 16 x i16> %a)
ret i16 %res
@@ -115,8 +111,7 @@ define i32 @uaddv_nxv16i32(<vscale x 16 x i32> %a) {
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %a)
ret i32 %res
@@ -130,8 +125,7 @@ define i32 @umin_nxv2i32(<vscale x 2 x i32> %a) {
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uminv d0, p0, z0.d
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.umin.nxv2i32(<vscale x 2 x i32> %a)
ret i32 %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
index 92a67cba55f7a1..244dcc734bd7c2 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
@@ -15,8 +15,7 @@ define i8 @uaddv_v8i8(<8 x i8> %a) {
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: uaddv d0, p0, z0.b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v8i8:
@@ -51,8 +50,7 @@ define i8 @uaddv_v16i8(<16 x i8> %a) {
; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: uaddv d0, p0, z0.b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v16i8:
@@ -103,8 +101,7 @@ define i8 @uaddv_v32i8(ptr %a) {
; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: add z0.b, z1.b, z0.b
; CHECK-NEXT: uaddv d0, p0, z0.b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v32i8:
@@ -188,8 +185,7 @@ define i16 @uaddv_v4i16(<4 x i16> %a) {
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v4i16:
@@ -216,8 +212,7 @@ define i16 @uaddv_v8i16(<8 x i16> %a) {
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v8i16:
@@ -252,8 +247,7 @@ define i16 @uaddv_v16i16(ptr %a) {
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: add z0.h, z1.h, z0.h
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v16i16:
@@ -305,8 +299,7 @@ define i32 @uaddv_v2i32(<2 x i32> %a) {
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v2i32:
@@ -328,8 +321,7 @@ define i32 @uaddv_v4i32(<4 x i32> %a) {
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v4i32:
@@ -353,8 +345,7 @@ define i32 @uaddv_v8i32(ptr %a) {
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: add z0.s, z1.s, z0.s
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v8i32:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
index 00a15f4bcd6394..688537704a6f73 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
@@ -66,8 +66,7 @@ define i32 @reduce_uaddv_v16i8(<32 x i8> %a) {
; STREAMING-SVE-NEXT: add z0.s, z2.s, z0.s
; STREAMING-SVE-NEXT: add z0.s, z1.s, z0.s
; STREAMING-SVE-NEXT: uaddv d0, p0, z0.s
-; STREAMING-SVE-NEXT: fmov x0, d0
-; STREAMING-SVE-NEXT: // kill: def $w0 killed $w0 killed $x0
+; STREAMING-SVE-NEXT: fmov w0, s0
; STREAMING-SVE-NEXT: ret
%1 = zext <32 x i8> %a to <32 x i32>
%2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
@@ -134,8 +133,7 @@ define i32 @reduce_saddv_v16i8(<32 x i8> %a) {
; STREAMING-SVE-NEXT: add z0.s, z2.s, z0.s
; STREAMING-SVE-NEXT: add z0.s, z1.s, z0.s
; STREAMING-SVE-NEXT: uaddv d0, p0, z0.s
-; STREAMING-SVE-NEXT: fmov x0, d0
-; STREAMING-SVE-NEXT: // kill: def $w0 killed $w0 killed $x0
+; STREAMING-SVE-NEXT: fmov w0, s0
; STREAMING-SVE-NEXT: ret
%1 = sext <32 x i8> %a to <32 x i32>
%2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
diff --git a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
index 91f8f5c2c90d84..6af26067cd6d6d 100644
--- a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
@@ -42,8 +42,7 @@ define i32 @test(<vscale x 32 x i8> %bin.rdx, <vscale x 32 x i8> %bin.rdx2) {
; CHECK-NEXT: add z1.s, z3.s, z1.s
; CHECK-NEXT: add z0.s, z1.s, z0.s
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%a = sext <vscale x 32 x i8> %bin.rdx to <vscale x 32 x i32>
%b = sext <vscale x 32 x i8> %bin.rdx2 to <vscale x 32 x i32>
diff --git a/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll b/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
index f0856c43daf1d9..e6905f687ad9a2 100644
--- a/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
+++ b/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
@@ -5,8 +5,7 @@ define i32 @uaddlv_uaddlp_v8i16(<8 x i16> %0) {
; CHECK-LABEL: uaddlv_uaddlp_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: uaddlv s0, v0.8h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%2 = tail call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %0)
%3 = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> %2)
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 557aa010b3a7d9..7f2eefe5ed72f6 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -81,7 +81,7 @@ define i8 @convert_to_bitmask2(<2 x i64> %vec) {
; CHECK-NEXT: ldr q1, [x8, lCPI3_0 at PAGEOFF]
; CHECK-NEXT: bic.16b v0, v1, v0
; CHECK-NEXT: addp.2d d0, v0
-; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x3
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
index 625e8ae6a98dc2..1bdf7bbb7f813c 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
@@ -833,7 +833,7 @@ define i32 @reduce_xor_v2i64(<2 x i64> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
; CHECK-NEXT: addp d0, v0.2d
-; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: tst w8, #0x1
; CHECK-NEXT: csel w0, w0, w1, ne
; CHECK-NEXT: ret