[llvm] [AArch64] Avoid GPR trip when moving truncated i32 vector elements (PR #114541)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 14 10:17:21 PST 2024
https://github.com/SpencerAbson updated https://github.com/llvm/llvm-project/pull/114541
>From 5cd15cb492a6c023e424ef232ebf1ae6ccfe9def Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Thu, 31 Oct 2024 14:46:09 +0000
Subject: [PATCH 1/5] [AArch64] Avoid GPR trip when moving truncated i32 vector
elements
This patch introduces ISel patterns that enable the use of INS (element) at the truncated
element size when moving an i32 element, truncated from i64, between vectors.
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 53 +++++++--
.../CodeGen/AArch64/neon-ins-trunc-elt.ll | 106 ++++++++++++++++++
2 files changed, 149 insertions(+), 10 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 250d6144f75318..f92532c27fe154 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -683,6 +683,22 @@ def topbitsallzero64: PatLeaf<(i64 GPR64:$src), [{
CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(64, 63));
}]>;
+def VectorIndexStoH : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+
+def VectorIndexDtoH : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
+}]>;
+
+def VectorIndexStoB : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
+}]>;
+
+def VectorIndexHtoB : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+
// Node definitions.
def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
def AArch64adr : SDNode<"AArch64ISD::ADR", SDTIntUnaryOp, []>;
@@ -7281,6 +7297,33 @@ defm : Neon_INS_elt_pattern<v8i16, v4i16, nxv8i16, i32, VectorIndexH, INSvi1
defm : Neon_INS_elt_pattern<v4i32, v2i32, nxv4i32, i32, VectorIndexS, INSvi32lane>;
defm : Neon_INS_elt_pattern<v2i64, v1i64, nxv2i64, i64, VectorIndexD, INSvi64lane>;
+// Remove GPR trip when inserting extracted truncated i64 into vector of i32.
+// From another NEON vector
+def : Pat<(v4i32 (vector_insert v4i32:$src,
+ (i32 (trunc (i64 (vector_extract v2i64:$Rn, (i64 imm:$Immn))))),
+ (i64 imm:$Immd))),
+ (INSvi32lane V128:$src, imm:$Immd, V128:$Rn, (VectorIndexHtoB imm:$Immn))>;
+
+def : Pat<(v2i32 (vector_insert v2i32:$src,
+ (i32 (trunc (i64 (vector_extract v2i64:$Rn, (i64 imm:$Immn))))),
+ (i64 imm:$Immd))),
+ (EXTRACT_SUBREG (INSvi32lane (SUBREG_TO_REG (i64 0), V64:$src, dsub),
+ imm:$Immd, V128:$Rn, (VectorIndexHtoB imm:$Immn)),
+ dsub)>;
+// From the bottom 128b of an SVE vector
+def : Pat<(v4i32 (vector_insert v4i32:$Rn,
+ (i32 (trunc (i64 (vector_extract nxv2i64:$Rm, (i64 VectorIndexD:$Immn))))),
+ (i64 imm:$Immd))),
+ (INSvi32lane V128:$Rn, imm:$Immd, (EXTRACT_SUBREG nxv2i64:$Rm, zsub), (VectorIndexHtoB VectorIndexD:$Immn))>;
+
+def : Pat<(v2i32 (vector_insert v2i32:$Rn,
+ (i32 (trunc (i64 (vector_extract nxv2i64:$Rm, (i64 VectorIndexD:$Immn))))),
+ (i64 imm:$Immd))),
+ (EXTRACT_SUBREG
+ (INSvi32lane (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immd,
+ (EXTRACT_SUBREG nxv2i64:$Rm, zsub), (VectorIndexHtoB VectorIndexD:$Immn)),
+ dsub)>;
+
// Insert from bitcast
// vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0)
def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), (i64 imm:$Immd))),
@@ -8700,16 +8743,6 @@ class Ld1Lane64IdxOpPat<SDPatternOperator scalar_load, Operand VecIndex,
(IdxOp VecIndex:$idx), GPR64sp:$Rn),
dsub)>;
-def VectorIndexStoH : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
-}]>;
-def VectorIndexStoB : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
-}]>;
-def VectorIndexHtoB : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
-}]>;
-
def : Ld1Lane128IdxOpPat<extloadi16, VectorIndexS, v4i32, i32, LD1i16, VectorIndexStoH>;
def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexS, v4i32, i32, LD1i8, VectorIndexStoB>;
def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexH, v8i16, i32, LD1i8, VectorIndexHtoB>;
diff --git a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
new file mode 100644
index 00000000000000..ab11f12d53790f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+neon < %s | FileCheck %s
+
+; Inserting a truncated (i64 to i32) element from the bottom 128 bits of any vector type into a NEON vector should use INS (element) of the
+; truncated size to avoid pointless GPR trips.
+
+
+define <2 x i32> @test_s_trunc_d_lane0(<2 x i32> %a, <1 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_d_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <1 x i64> %b, i32 0
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 0
+ ret <2 x i32> %e
+}
+
+define <2 x i32> @test_s_trunc_d_qlane1(<2 x i32> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_d_qlane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[0], v1.s[2]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <2 x i64> %b, i32 1
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 0
+ ret <2 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_d_lane0(<4 x i32> %a, <1 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_d_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: ret
+ %c = extractelement <1 x i64> %b, i32 0
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 0
+ ret <4 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_d_qlane1(<4 x i32> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_d_qlane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[3], v1.s[2]
+; CHECK-NEXT: ret
+ %c = extractelement <2 x i64> %b, i32 1
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 3
+ ret <4 x i32> %e
+}
+
+; ---- From the bottom 128b of an SVE vector
+
+define <2 x i32> @test_s_trunc_dsve_lane0(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 0
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 0
+ ret <2 x i32> %e
+}
+
+define <2 x i32> @test_s_trunc_dsve_lane1(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[1], v1.s[2]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 1
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 1
+ ret <2 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_dsve_lane0(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 0
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 0
+ ret <4 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_dsve_lane1(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[3], v1.s[2]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 1
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 3
+ ret <4 x i32> %e
+}
>From cb225b4b2704af18acc4ce8e78ac430b5b27cfc9 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Fri, 1 Nov 2024 13:25:07 +0000
Subject: [PATCH 2/5] [NFC] Remove unused node operator
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 6 ------
1 file changed, 6 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f92532c27fe154..4f00a7a187723b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -686,15 +686,9 @@ def topbitsallzero64: PatLeaf<(i64 GPR64:$src), [{
def VectorIndexStoH : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
}]>;
-
-def VectorIndexDtoH : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
-}]>;
-
def VectorIndexStoB : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
}]>;
-
def VectorIndexHtoB : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
}]>;
>From d25df8c9c8adabf741f476d988b7d0b19ef32519 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Fri, 1 Nov 2024 13:54:01 +0000
Subject: [PATCH 3/5] [NFC] Add negative tests for bound of SVE extraction
---
.../CodeGen/AArch64/neon-ins-trunc-elt.ll | 30 +++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
index ab11f12d53790f..4c189cd2886e8a 100644
--- a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
+++ b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
@@ -83,6 +83,22 @@ define <2 x i32> @test_s_trunc_dsve_lane1(<2 x i32> %a, <vscale x 2 x i64> %b) {
ret <2 x i32> %e
}
+; (negative test) Extracted element is not within V-register.
+define <2 x i32> @test_s_trunc_dsve_lane2(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, z1.d[2]
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: mov v0.s[1], w8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 2
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 1
+ ret <2 x i32> %e
+}
+
define <4 x i32> @test_qs_trunc_dsve_lane0(<4 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_dsve_lane0:
; CHECK: // %bb.0:
@@ -104,3 +120,17 @@ define <4 x i32> @test_qs_trunc_dsve_lane1(<4 x i32> %a, <vscale x 2 x i64> %b)
%e = insertelement <4 x i32> %a, i32 %d, i64 3
ret <4 x i32> %e
}
+
+; (negative test) Extracted element is not within V-register.
+define <4 x i32> @test_qs_trunc_dsve_lane2(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, z1.d[2]
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: mov v0.s[3], w8
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 2
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 3
+ ret <4 x i32> %e
+}
>From a8c6043da985f1171cb0f2ec3843971d4e1fd27e Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Tue, 5 Nov 2024 19:13:55 +0000
Subject: [PATCH 4/5] Implement DAG combine for truncating extracted i64
---
.../Target/AArch64/AArch64ISelLowering.cpp | 37 ++++++++--
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 73 +++++--------------
.../aarch64-neon-vector-insert-uaddlv.ll | 24 +++---
llvm/test/CodeGen/AArch64/fmlal-loreg.ll | 3 +-
.../CodeGen/AArch64/neon-ins-trunc-elt.ll | 8 +-
llvm/test/CodeGen/AArch64/sve-doublereduct.ll | 9 +--
.../CodeGen/AArch64/sve-extract-element.ll | 4 +-
.../AArch64/sve-extract-fixed-vector.ll | 7 +-
.../AArch64/sve-fixed-length-int-reduce.ll | 45 ++++--------
llvm/test/CodeGen/AArch64/sve-int-reduce.ll | 12 +--
.../CodeGen/AArch64/sve-split-int-reduce.ll | 18 ++---
...-streaming-mode-fixed-length-int-reduce.ll | 27 +++----
...-streaming-mode-fixed-length-reductions.ll | 6 +-
.../test/CodeGen/AArch64/sve-vecreduce-dot.ll | 3 +-
.../CodeGen/AArch64/uaddlv-vaddlp-combine.ll | 3 +-
.../AArch64/vec-combine-compare-to-bitmask.ll | 2 +-
16 files changed, 116 insertions(+), 165 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e8c02c09879747..5bca2743a61cae 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20735,8 +20735,9 @@ static SDValue performBuildVectorCombine(SDNode *N,
return SDValue();
}
-static SDValue performTruncateCombine(SDNode *N,
- SelectionDAG &DAG) {
+static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
@@ -20744,8 +20745,34 @@ static SDValue performTruncateCombine(SDNode *N,
SDValue Op = N0.getOperand(0);
if (VT.getScalarType() == MVT::i32 &&
N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
- Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
- return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
+ Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
+ return DAG.getNode(N0.getOpcode(), DL, VT, Op);
+ }
+
+ // Performing the following combine produces a preferable form for ISEL.
+ // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2)
+ if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ SDValue Op = N0.getOperand(0);
+ SDValue ExtractIndexNode = N0.getOperand(1);
+ if (!isa<ConstantSDNode>(ExtractIndexNode))
+ return SDValue();
+
+ // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
+ // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
+ assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
+ "Unexpected legalisation result!");
+
+ MVT CastVT;
+ EVT SrcVectorType = Op.getValueType();
+ assert(SrcVectorType.getScalarType() == MVT::i64);
+ unsigned ExtractIndex =
+ cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
+
+ CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
+
+ Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
+ DAG.getConstant(ExtractIndex * 2, DL, MVT::i64));
}
return SDValue();
@@ -25992,7 +26019,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::BUILD_VECTOR:
return performBuildVectorCombine(N, DCI, DAG);
case ISD::TRUNCATE:
- return performTruncateCombine(N, DAG);
+ return performTruncateCombine(N, DAG, DCI);
case AArch64ISD::ANDS:
return performFlagSettingCombine(N, DCI, ISD::AND);
case AArch64ISD::ADC:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 4f00a7a187723b..833adc274f7efc 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -683,16 +683,6 @@ def topbitsallzero64: PatLeaf<(i64 GPR64:$src), [{
CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(64, 63));
}]>;
-def VectorIndexStoH : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
-}]>;
-def VectorIndexStoB : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
-}]>;
-def VectorIndexHtoB : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
-}]>;
-
// Node definitions.
def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
def AArch64adr : SDNode<"AArch64ISD::ADR", SDTIntUnaryOp, []>;
@@ -6997,24 +6987,18 @@ defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
-multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
- SDNodeXForm IdxXFORM> {
- def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
- imm:$idx))))),
- (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
-
- def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn),
- imm:$idx))))),
- (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
-}
+class DUPWithTrunci64Pat<ValueType ResVT, Instruction DUP,
+ SDNodeXForm IdxXFORM>
+ : Pat<(ResVT (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
+ (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
-defm : DUPWithTrunci64Pats<v8i8, DUPv8i8lane, VecIndex_x8>;
-defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>;
-defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>;
+def : DUPWithTrunci64Pat<v8i8, DUPv8i8lane, VecIndex_x4>;
+def : DUPWithTrunci64Pat<v4i16, DUPv4i16lane, VecIndex_x2>;
+def : DUPWithTrunci64Pat<v2i32, DUPv2i32lane, NOOP_SDNodeXForm>;
-defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
-defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
-defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
+def : DUPWithTrunci64Pat<v16i8, DUPv16i8lane, VecIndex_x4>;
+def : DUPWithTrunci64Pat<v8i16, DUPv8i16lane, VecIndex_x2>;
+def : DUPWithTrunci64Pat<v4i32, DUPv4i32lane, NOOP_SDNodeXForm>;
// SMOV and UMOV definitions, with some extra patterns for convenience
defm SMOV : SMov;
@@ -7291,33 +7275,6 @@ defm : Neon_INS_elt_pattern<v8i16, v4i16, nxv8i16, i32, VectorIndexH, INSvi1
defm : Neon_INS_elt_pattern<v4i32, v2i32, nxv4i32, i32, VectorIndexS, INSvi32lane>;
defm : Neon_INS_elt_pattern<v2i64, v1i64, nxv2i64, i64, VectorIndexD, INSvi64lane>;
-// Remove GPR trip when inserting extracted truncated i64 into vector of i32.
-// From another NEON vector
-def : Pat<(v4i32 (vector_insert v4i32:$src,
- (i32 (trunc (i64 (vector_extract v2i64:$Rn, (i64 imm:$Immn))))),
- (i64 imm:$Immd))),
- (INSvi32lane V128:$src, imm:$Immd, V128:$Rn, (VectorIndexHtoB imm:$Immn))>;
-
-def : Pat<(v2i32 (vector_insert v2i32:$src,
- (i32 (trunc (i64 (vector_extract v2i64:$Rn, (i64 imm:$Immn))))),
- (i64 imm:$Immd))),
- (EXTRACT_SUBREG (INSvi32lane (SUBREG_TO_REG (i64 0), V64:$src, dsub),
- imm:$Immd, V128:$Rn, (VectorIndexHtoB imm:$Immn)),
- dsub)>;
-// From the bottom 128b of an SVE vector
-def : Pat<(v4i32 (vector_insert v4i32:$Rn,
- (i32 (trunc (i64 (vector_extract nxv2i64:$Rm, (i64 VectorIndexD:$Immn))))),
- (i64 imm:$Immd))),
- (INSvi32lane V128:$Rn, imm:$Immd, (EXTRACT_SUBREG nxv2i64:$Rm, zsub), (VectorIndexHtoB VectorIndexD:$Immn))>;
-
-def : Pat<(v2i32 (vector_insert v2i32:$Rn,
- (i32 (trunc (i64 (vector_extract nxv2i64:$Rm, (i64 VectorIndexD:$Immn))))),
- (i64 imm:$Immd))),
- (EXTRACT_SUBREG
- (INSvi32lane (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immd,
- (EXTRACT_SUBREG nxv2i64:$Rm, zsub), (VectorIndexHtoB VectorIndexD:$Immn)),
- dsub)>;
-
// Insert from bitcast
// vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0)
def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), (i64 imm:$Immd))),
@@ -8737,6 +8694,16 @@ class Ld1Lane64IdxOpPat<SDPatternOperator scalar_load, Operand VecIndex,
(IdxOp VecIndex:$idx), GPR64sp:$Rn),
dsub)>;
+def VectorIndexStoH : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+def VectorIndexStoB : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
+}]>;
+def VectorIndexHtoB : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+
def : Ld1Lane128IdxOpPat<extloadi16, VectorIndexS, v4i32, i32, LD1i16, VectorIndexStoH>;
def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexS, v4i32, i32, LD1i8, VectorIndexStoB>;
def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexH, v8i16, i32, LD1i8, VectorIndexHtoB>;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index 2e165179381820..1b7bc128d6332e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -384,9 +384,9 @@ define void @insert_vec_v4i16_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: mov.h v1[0], v0[0]
-; CHECK-NEXT: ushll.4s v0, v1, #0
-; CHECK-NEXT: ucvtf.4s v0, v0
-; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: ucvtf.4s v1, v1
+; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: ret
entry:
@@ -403,13 +403,13 @@ define void @insert_vec_v16i16_uaddlv_from_v4i32(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: movi.2d v1, #0000000000000000
+; CHECK-NEXT: movi.2d v2, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
+; CHECK-NEXT: stp q2, q2, [x0, #32]
; CHECK-NEXT: mov.h v1[0], v0[0]
-; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: ushll.4s v1, v1, #0
-; CHECK-NEXT: stp q0, q0, [x0, #32]
; CHECK-NEXT: ucvtf.4s v1, v1
-; CHECK-NEXT: stp q1, q0, [x0]
+; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret
entry:
@@ -430,9 +430,9 @@ define void @insert_vec_v8i8_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: bic.4h v1, #255, lsl #8
-; CHECK-NEXT: ushll.4s v0, v1, #0
-; CHECK-NEXT: ucvtf.4s v0, v0
-; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: ucvtf.4s v1, v1
+; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: ret
entry:
@@ -449,14 +449,14 @@ define void @insert_vec_v16i8_uaddlv_from_v4i32(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: movi.2d v1, #0000000000000000
+; CHECK-NEXT: movi.2d v2, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
+; CHECK-NEXT: stp q2, q2, [x0, #32]
; CHECK-NEXT: mov.h v1[0], v0[0]
-; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: bic.4h v1, #255, lsl #8
-; CHECK-NEXT: stp q0, q0, [x0, #32]
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ucvtf.4s v1, v1
-; CHECK-NEXT: stp q1, q0, [x0]
+; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
index 31ead890ba8ac7..6116c6421c4b12 100644
--- a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
+++ b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
@@ -11,9 +11,10 @@ define <4 x float> @test(ptr %lhs_panel, ptr %rhs_panel, <4 x float> %a) {
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset b8, -16
; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: ldr q16, [x1]
; CHECK-NEXT: ldr q8, [x0]
-; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: lsr x8, x8, #32
; CHECK-NEXT: //APP
; CHECK-NEXT: nop
; CHECK-NEXT: //NO_APP
diff --git a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
index 4c189cd2886e8a..0d58fc59c2c319 100644
--- a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
+++ b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
@@ -87,9 +87,9 @@ define <2 x i32> @test_s_trunc_dsve_lane1(<2 x i32> %a, <vscale x 2 x i64> %b) {
define <2 x i32> @test_s_trunc_dsve_lane2(<2 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_s_trunc_dsve_lane2:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, z1.d[2]
+; CHECK-NEXT: mov z1.s, z1.s[4]
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: mov v0.s[1], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -125,8 +125,8 @@ define <4 x i32> @test_qs_trunc_dsve_lane1(<4 x i32> %a, <vscale x 2 x i64> %b)
define <4 x i32> @test_qs_trunc_dsve_lane2(<4 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_dsve_lane2:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, z1.d[2]
-; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: mov z1.s, z1.s[4]
+; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: mov v0.s[3], w8
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 2
diff --git a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
index 7bc31d44bb6547..b813b8f84ba16b 100644
--- a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
+++ b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
@@ -91,8 +91,7 @@ define i32 @add_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: add z0.s, z0.s, z2.s
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.add.i32.nxv8i32(<vscale x 8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.add.i32.nxv4i32(<vscale x 4 x i32> %b)
@@ -112,8 +111,7 @@ define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-NEXT: add z1.h, z1.h, z3.h
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%ae = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
@@ -139,8 +137,7 @@ define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-NEXT: add z1.h, z2.h, z5.h
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%ae = zext <vscale x 32 x i8> %a to <vscale x 32 x i16>
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
index 6d4f5963881e58..939c7e43100189 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
@@ -644,8 +644,8 @@ define i1 @test_lane4_2xi1(<vscale x 2 x i1> %a) #0 {
; CHECK-LABEL: test_lane4_2xi1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
-; CHECK-NEXT: mov z0.d, z0.d[4]
-; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: mov z0.s, z0.s[8]
+; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
%b = extractelement <vscale x 2 x i1> %a, i32 4
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
index 518e3573b5edd3..965af2a745afd4 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
@@ -238,11 +238,8 @@ define <2 x i1> @extract_v2i1_nxv2i1(<vscale x 2 x i1> %inmask) {
; CHECK-LABEL: extract_v2i1_nxv2i1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: mov x8, v0.d[1]
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: mov v0.s[1], w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: mov v0.s[1], v0.s[2]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%mask = call <2 x i1> @llvm.vector.extract.v2i1.nxv2i1(<vscale x 2 x i1> %inmask, i64 0)
ret <2 x i1> %mask
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
index 752c2cd34bfe48..be19e9ef5e86f8 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
@@ -37,8 +37,7 @@ define i8 @uaddv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <32 x i8>, ptr %a
%res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op)
@@ -54,8 +53,7 @@ define i8 @uaddv_v64i8(ptr %a) #0 {
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: add z0.b, z1.b, z0.b
; VBITS_GE_256-NEXT: uaddv d0, p0, z0.b
-; VBITS_GE_256-NEXT: fmov x0, d0
-; VBITS_GE_256-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: uaddv_v64i8:
@@ -63,8 +61,7 @@ define i8 @uaddv_v64i8(ptr %a) #0 {
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: uaddv d0, p0, z0.b
-; VBITS_GE_512-NEXT: fmov x0, d0
-; VBITS_GE_512-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
%op = load <64 x i8>, ptr %a
%res = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %op)
@@ -77,8 +74,7 @@ define i8 @uaddv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <128 x i8>, ptr %a
%res = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %op)
@@ -91,8 +87,7 @@ define i8 @uaddv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <256 x i8>, ptr %a
%res = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %op)
@@ -127,8 +122,7 @@ define i16 @uaddv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <16 x i16>, ptr %a
%res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op)
@@ -144,8 +138,7 @@ define i16 @uaddv_v32i16(ptr %a) #0 {
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: add z0.h, z1.h, z0.h
; VBITS_GE_256-NEXT: uaddv d0, p0, z0.h
-; VBITS_GE_256-NEXT: fmov x0, d0
-; VBITS_GE_256-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: uaddv_v32i16:
@@ -153,8 +146,7 @@ define i16 @uaddv_v32i16(ptr %a) #0 {
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: uaddv d0, p0, z0.h
-; VBITS_GE_512-NEXT: fmov x0, d0
-; VBITS_GE_512-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, ptr %a
%res = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %op)
@@ -167,8 +159,7 @@ define i16 @uaddv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <64 x i16>, ptr %a
%res = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %op)
@@ -181,8 +172,7 @@ define i16 @uaddv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <128 x i16>, ptr %a
%res = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %op)
@@ -217,8 +207,7 @@ define i32 @uaddv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <8 x i32>, ptr %a
%res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op)
@@ -234,8 +223,7 @@ define i32 @uaddv_v16i32(ptr %a) #0 {
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: add z0.s, z1.s, z0.s
; VBITS_GE_256-NEXT: uaddv d0, p0, z0.s
-; VBITS_GE_256-NEXT: fmov x0, d0
-; VBITS_GE_256-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: uaddv_v16i32:
@@ -243,8 +231,7 @@ define i32 @uaddv_v16i32(ptr %a) #0 {
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: uaddv d0, p0, z0.s
-; VBITS_GE_512-NEXT: fmov x0, d0
-; VBITS_GE_512-NEXT: // kill: def $w0 killed $w0 killed $x0
+; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, ptr %a
%res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %op)
@@ -257,8 +244,7 @@ define i32 @uaddv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <32 x i32>, ptr %a
%res = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %op)
@@ -271,8 +257,7 @@ define i32 @uaddv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%op = load <64 x i32>, ptr %a
%res = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
index 8c1b5225b7f257..6ec18477fe1a0c 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
@@ -146,8 +146,7 @@ define i8 @uaddv_nxv16i8(<vscale x 16 x i8> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: uaddv d0, p0, z0.b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %a)
ret i8 %res
@@ -158,8 +157,7 @@ define i16 @uaddv_nxv8i16(<vscale x 8 x i16> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> %a)
ret i16 %res
@@ -170,8 +168,7 @@ define i32 @uaddv_nxv4i32(<vscale x 4 x i32> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %a)
ret i32 %res
@@ -422,8 +419,7 @@ define i8 @uaddv_nxv12i8(<vscale x 12 x i8> %a) {
; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
; CHECK-NEXT: uaddv d0, p0, z0.b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i8 @llvm.vector.reduce.add.nxv12i8(<vscale x 12 x i8> %a)
ret i8 %res
diff --git a/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
index dd7b15ef5ee6f4..90383b43d58128 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
@@ -33,8 +33,7 @@ define i32 @orv_nxv2i32(<vscale x 2 x i32> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: orv d0, p0, z0.d
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32> %a)
ret i32 %res
@@ -61,8 +60,7 @@ define i16 @xorv_nxv2i16(<vscale x 2 x i16> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: eorv d0, p0, z0.d
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.xor.nxv2i16(<vscale x 2 x i16> %a)
ret i16 %res
@@ -87,8 +85,7 @@ define i16 @uaddv_nxv4i16(<vscale x 4 x i16> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %a)
ret i16 %res
@@ -100,8 +97,7 @@ define i16 @uaddv_nxv16i16(<vscale x 16 x i16> %a) {
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i16 @llvm.vector.reduce.add.nxv16i16(<vscale x 16 x i16> %a)
ret i16 %res
@@ -115,8 +111,7 @@ define i32 @uaddv_nxv16i32(<vscale x 16 x i32> %a) {
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %a)
ret i32 %res
@@ -130,8 +125,7 @@ define i32 @umin_nxv2i32(<vscale x 2 x i32> %a) {
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uminv d0, p0, z0.d
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%res = call i32 @llvm.vector.reduce.umin.nxv2i32(<vscale x 2 x i32> %a)
ret i32 %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
index 92a67cba55f7a1..244dcc734bd7c2 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
@@ -15,8 +15,7 @@ define i8 @uaddv_v8i8(<8 x i8> %a) {
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: uaddv d0, p0, z0.b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v8i8:
@@ -51,8 +50,7 @@ define i8 @uaddv_v16i8(<16 x i8> %a) {
; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: uaddv d0, p0, z0.b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v16i8:
@@ -103,8 +101,7 @@ define i8 @uaddv_v32i8(ptr %a) {
; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: add z0.b, z1.b, z0.b
; CHECK-NEXT: uaddv d0, p0, z0.b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v32i8:
@@ -188,8 +185,7 @@ define i16 @uaddv_v4i16(<4 x i16> %a) {
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v4i16:
@@ -216,8 +212,7 @@ define i16 @uaddv_v8i16(<8 x i16> %a) {
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v8i16:
@@ -252,8 +247,7 @@ define i16 @uaddv_v16i16(ptr %a) {
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: add z0.h, z1.h, z0.h
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v16i16:
@@ -305,8 +299,7 @@ define i32 @uaddv_v2i32(<2 x i32> %a) {
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v2i32:
@@ -328,8 +321,7 @@ define i32 @uaddv_v4i32(<4 x i32> %a) {
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v4i32:
@@ -353,8 +345,7 @@ define i32 @uaddv_v8i32(ptr %a) {
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: add z0.s, z1.s, z0.s
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: uaddv_v8i32:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
index 00a15f4bcd6394..688537704a6f73 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
@@ -66,8 +66,7 @@ define i32 @reduce_uaddv_v16i8(<32 x i8> %a) {
; STREAMING-SVE-NEXT: add z0.s, z2.s, z0.s
; STREAMING-SVE-NEXT: add z0.s, z1.s, z0.s
; STREAMING-SVE-NEXT: uaddv d0, p0, z0.s
-; STREAMING-SVE-NEXT: fmov x0, d0
-; STREAMING-SVE-NEXT: // kill: def $w0 killed $w0 killed $x0
+; STREAMING-SVE-NEXT: fmov w0, s0
; STREAMING-SVE-NEXT: ret
%1 = zext <32 x i8> %a to <32 x i32>
%2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
@@ -134,8 +133,7 @@ define i32 @reduce_saddv_v16i8(<32 x i8> %a) {
; STREAMING-SVE-NEXT: add z0.s, z2.s, z0.s
; STREAMING-SVE-NEXT: add z0.s, z1.s, z0.s
; STREAMING-SVE-NEXT: uaddv d0, p0, z0.s
-; STREAMING-SVE-NEXT: fmov x0, d0
-; STREAMING-SVE-NEXT: // kill: def $w0 killed $w0 killed $x0
+; STREAMING-SVE-NEXT: fmov w0, s0
; STREAMING-SVE-NEXT: ret
%1 = sext <32 x i8> %a to <32 x i32>
%2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
diff --git a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
index 91f8f5c2c90d84..6af26067cd6d6d 100644
--- a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
@@ -42,8 +42,7 @@ define i32 @test(<vscale x 32 x i8> %bin.rdx, <vscale x 32 x i8> %bin.rdx2) {
; CHECK-NEXT: add z1.s, z3.s, z1.s
; CHECK-NEXT: add z0.s, z1.s, z0.s
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%a = sext <vscale x 32 x i8> %bin.rdx to <vscale x 32 x i32>
%b = sext <vscale x 32 x i8> %bin.rdx2 to <vscale x 32 x i32>
diff --git a/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll b/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
index f0856c43daf1d9..e6905f687ad9a2 100644
--- a/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
+++ b/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
@@ -5,8 +5,7 @@ define i32 @uaddlv_uaddlp_v8i16(<8 x i16> %0) {
; CHECK-LABEL: uaddlv_uaddlp_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: uaddlv s0, v0.8h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%2 = tail call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %0)
%3 = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> %2)
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 557aa010b3a7d9..7f2eefe5ed72f6 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -81,7 +81,7 @@ define i8 @convert_to_bitmask2(<2 x i64> %vec) {
; CHECK-NEXT: ldr q1, [x8, lCPI3_0@PAGEOFF]
; CHECK-NEXT: bic.16b v0, v1, v0
; CHECK-NEXT: addp.2d d0, v0
-; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x3
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7
>From 0d76f393eb1ad65df2f602d97fba8f6ec2ab7aa8 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Thu, 14 Nov 2024 18:16:19 +0000
Subject: [PATCH 5/5] [NFC] Refactor DUP patterns
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 48 ++++++++-------------
1 file changed, 18 insertions(+), 30 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 833adc274f7efc..7f28dc18ad0ef1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6954,6 +6954,12 @@ def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
(DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
+// Also covers DUP (truncate i64 to i32)
+def : Pat<(v2i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
+ (DUPv2i32lane V128:$Rn, imm:$idx)>;
+def : Pat<(v4i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
+ (DUPv4i32lane V128:$Rn, imm:$idx)>;
+
// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
// instruction even if the types don't match: we just have to remap the lane
// carefully. N.b. this trick only applies to truncations.
@@ -6967,38 +6973,20 @@ def VecIndex_x8 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(8 * N->getZExtValue(), SDLoc(N), MVT::i64);
}]>;
-multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
- ValueType Src128VT, ValueType ScalVT,
- Instruction DUP, SDNodeXForm IdxXFORM> {
- def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
- imm:$idx)))),
- (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
-
- def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
- imm:$idx)))),
- (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
-}
-
-defm : DUPWithTruncPats<v8i8, v4i16, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
-defm : DUPWithTruncPats<v8i8, v2i32, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
-defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
-
-defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
-defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
-defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
-
-class DUPWithTrunci64Pat<ValueType ResVT, Instruction DUP,
- SDNodeXForm IdxXFORM>
- : Pat<(ResVT (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
+class DUPWithTruncPat<ValueType ResVT, ValueType SrcVT, ValueType ScalVT,
+ Instruction DUP, SDNodeXForm IdxXFORM>
+ : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (SrcVT V128:$Rn), imm:$idx)))),
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;
-def : DUPWithTrunci64Pat<v8i8, DUPv8i8lane, VecIndex_x4>;
-def : DUPWithTrunci64Pat<v4i16, DUPv4i16lane, VecIndex_x2>;
-def : DUPWithTrunci64Pat<v2i32, DUPv2i32lane, NOOP_SDNodeXForm>;
-
-def : DUPWithTrunci64Pat<v16i8, DUPv16i8lane, VecIndex_x4>;
-def : DUPWithTrunci64Pat<v8i16, DUPv8i16lane, VecIndex_x2>;
-def : DUPWithTrunci64Pat<v4i32, DUPv4i32lane, NOOP_SDNodeXForm>;
+// DUP (truncate i16 to i8)
+def : DUPWithTruncPat<v8i8, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
+def : DUPWithTruncPat<v16i8, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
+// DUP (truncate i32/i64 to i8)
+def : DUPWithTruncPat<v8i8, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
+def : DUPWithTruncPat<v16i8, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
+// DUP (truncate i32/i64 to i16)
+def : DUPWithTruncPat<v4i16, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
+def : DUPWithTruncPat<v8i16, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
// SMOV and UMOV definitions, with some extra patterns for convenience
defm SMOV : SMov;
More information about the llvm-commits
mailing list