[llvm] [LLVM][CodeGen][SVE] Improve custom lowering for EXTRACT_SUBVECTOR. (PR #90963)
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Wed May 8 06:21:14 PDT 2024
https://github.com/paulwalker-arm updated https://github.com/llvm/llvm-project/pull/90963
From 8ea062aa5551cf29b44ed224fe0ad8068c9b9df8 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Tue, 7 May 2024 14:45:14 +0100
Subject: [PATCH 1/4] Add bfloat test coverage for VECTOR_SPLICE.
---
.../AArch64/named-vector-shuffles-sve.ll | 104 ++++++++++++++++++
1 file changed, 104 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
index f5763cd61033b..d1171bc312473 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -692,6 +692,104 @@ define <vscale x 2 x double> @splice_nxv2f64_neg3(<vscale x 2 x double> %a, <vsc
ret <vscale x 2 x double> %res
}
+define <vscale x 2 x bfloat> @splice_nxv2bf16_neg_idx(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv2bf16_neg_idx:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: rev p0.d, p0.d
+; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 -1)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @splice_nxv2bf16_neg2_idx(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv2bf16_neg2_idx:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: rev p0.d, p0.d
+; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 -2)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @splice_nxv2bf16_first_idx(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv2bf16_first_idx:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 1)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @splice_nxv2bf16_last_idx(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) vscale_range(16,16) #0 {
+; CHECK-LABEL: splice_nxv2bf16_last_idx:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext z0.b, z0.b, z1.b, #248
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 31)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @splice_nxv4bf16_neg_idx(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv4bf16_neg_idx:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl1
+; CHECK-NEXT: rev p0.s, p0.s
+; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 -1)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @splice_nxv4bf16_neg3_idx(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv4bf16_neg3_idx:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl3
+; CHECK-NEXT: rev p0.s, p0.s
+; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 -3)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @splice_nxv4bf16_first_idx(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv4bf16_first_idx:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext z0.b, z0.b, z1.b, #4
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 1)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @splice_nxv4bf16_last_idx(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) vscale_range(16,16) #0 {
+; CHECK-LABEL: splice_nxv4bf16_last_idx:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext z0.b, z0.b, z1.b, #252
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 63)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @splice_nxv8bf16_first_idx(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv8bf16_first_idx:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext z0.b, z0.b, z1.b, #2
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, i32 1)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @splice_nxv8bf16_last_idx(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) vscale_range(16,16) #0 {
+; CHECK-LABEL: splice_nxv8bf16_last_idx:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext z0.b, z0.b, z1.b, #254
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, i32 127)
+ ret <vscale x 8 x bfloat> %res
+}
+
; Ensure predicate based splice is promoted to use ZPRs.
define <vscale x 2 x i1> @splice_nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv2i1:
@@ -834,12 +932,14 @@ declare <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale
declare <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
declare <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
declare <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+
declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+
declare <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
declare <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
@@ -848,4 +948,8 @@ declare <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float>, <
declare <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i32)
+declare <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i32)
+declare <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+
attributes #0 = { nounwind "target-features"="+sve" }
From 2047b2713a4358e305e79bd8655bc8452a6b3cdf Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Tue, 7 May 2024 13:52:54 +0100
Subject: [PATCH 2/4] [LLVM][CodeGen][SVE] Clean up lowering of VECTOR_SPLICE
operations.
Remove the DAG combine that is performing type legalisation and instead
add isel patterns for all legal types.
---
.../SelectionDAG/SelectionDAGBuilder.cpp | 3 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 38 ++++---------------
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 23 +++++++----
llvm/lib/Target/AArch64/SVEInstrFormats.td | 17 +++++----
4 files changed, 32 insertions(+), 49 deletions(-)
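For reference, the byte-offset bound that the updated LowerVECTOR_SPLICE
check encodes (see the AArch64ISelLowering.cpp hunk below) can be sketched
standalone. This is a minimal illustration, assuming SVEBitsPerBlock is 128
as on AArch64; canUseExt is a hypothetical helper, not part of the patch:

#include <cstdint>

// EXT takes a byte-offset immediate in the range 0..255, so a splice is
// only selectable when the requested index lands in the first 256 bytes.
constexpr unsigned SVEBitsPerBlock = 128;

bool canUseExt(int64_t IdxVal, unsigned MinNumElts) {
  // Bits per lane when one 128-bit block is split into MinNumElts lanes:
  // 64 for nxv2bf16, so index 1 lands at byte 8, matching the
  // "ext z0.b, z0.b, z1.b, #8" expected by splice_nxv2bf16_first_idx.
  unsigned BlockSize = SVEBitsPerBlock / MinNumElts;
  return IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256;
}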
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index cfd82a342433f..f1e7a3f4421e8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -12249,9 +12249,8 @@ void SelectionDAGBuilder::visitVectorSplice(const CallInst &I) {
// VECTOR_SHUFFLE doesn't support a scalable mask so use a dedicated node.
if (VT.isScalableVector()) {
- MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
setValue(&I, DAG.getNode(ISD::VECTOR_SPLICE, DL, VT, V1, V2,
- DAG.getConstant(Imm, DL, IdxVT)));
+ DAG.getVectorIdxConstant(Imm, DL)));
return;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2af679e0755b5..a1931bc1e4936 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1048,9 +1048,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
- ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG,
- ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR,
- ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR});
+ ISD::SIGN_EXTEND_INREG, ISD::CONCAT_VECTORS,
+ ISD::EXTRACT_SUBVECTOR, ISD::INSERT_SUBVECTOR,
+ ISD::STORE, ISD::BUILD_VECTOR});
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::LOAD);
@@ -1580,6 +1580,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
+ setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
if (!Subtarget->isLittleEndian())
setOperationAction(ISD::BITCAST, VT, Expand);
@@ -10102,10 +10103,9 @@ SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
Op.getOperand(1));
}
- // This will select to an EXT instruction, which has a maximum immediate
- // value of 255, hence 2048-bits is the maximum value we can lower.
- if (IdxVal >= 0 &&
- IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
+ // We can select to an EXT instruction when indexing the first 256 bytes.
+ unsigned BlockSize = AArch64::SVEBitsPerBlock / Ty.getVectorMinNumElements();
+ if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
return Op;
return SDValue();
@@ -24237,28 +24237,6 @@ performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return performPostLD1Combine(N, DCI, true);
}
-static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
- EVT Ty = N->getValueType(0);
- if (Ty.isInteger())
- return SDValue();
-
- EVT IntTy = Ty.changeVectorElementTypeToInteger();
- EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount());
- if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
- IntTy.getVectorElementType().getScalarSizeInBits())
- return SDValue();
-
- SDLoc DL(N);
- SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
- DL, ExtIntTy);
- SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
- DL, ExtIntTy);
- SDValue Idx = N->getOperand(2);
- SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
- SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
- return DAG.getBitcast(Ty, Trunc);
-}
-
static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -24643,8 +24621,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::MGATHER:
case ISD::MSCATTER:
return performMaskedGatherScatterCombine(N, DCI, DAG);
- case ISD::VECTOR_SPLICE:
- return performSVESpliceCombine(N, DAG);
case ISD::FP_EXTEND:
return performFPExtendCombine(N, DAG, DCI, Subtarget);
case AArch64ISD::BRCOND:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 62e68de1359f7..64e545aa26b45 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1994,14 +1994,21 @@ let Predicates = [HasSVEorSME] in {
(LASTB_VPZ_D (PTRUE_D 31), ZPR:$Z1), dsub))>;
// Splice with lane bigger or equal to 0
- def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 (sve_ext_imm_0_255 i32:$index)))),
- (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
- def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 (sve_ext_imm_0_127 i32:$index)))),
- (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
- def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 (sve_ext_imm_0_63 i32:$index)))),
- (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
- def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 (sve_ext_imm_0_31 i32:$index)))),
- (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+ foreach VT = [nxv16i8] in
+ def : Pat<(VT (vector_splice (VT ZPR:$Z1), (VT ZPR:$Z2), (i64 (sve_ext_imm_0_255 i32:$index)))),
+ (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+
+ foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in
+ def : Pat<(VT (vector_splice (VT ZPR:$Z1), (VT ZPR:$Z2), (i64 (sve_ext_imm_0_127 i32:$index)))),
+ (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+
+ foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in
+ def : Pat<(VT (vector_splice (VT ZPR:$Z1), (VT ZPR:$Z2), (i64 (sve_ext_imm_0_63 i32:$index)))),
+ (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+
+ foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in
+ def : Pat<(VT (vector_splice (VT ZPR:$Z1), (VT ZPR:$Z2), (i64 (sve_ext_imm_0_31 i32:$index)))),
+ (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 69c3238c7d614..fc7d3cdda4acd 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -7060,16 +7060,17 @@ multiclass sve_int_perm_splice<string asm, SDPatternOperator op> {
def _S : sve_int_perm_splice<0b10, asm, ZPR32>;
def _D : sve_int_perm_splice<0b11, asm, ZPR64>;
- def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
- def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+ foreach VT = [nxv16i8] in
+ def : SVE_3_Op_Pat<VT, op, nxv16i1, VT, VT, !cast<Instruction>(NAME # _B)>;
- def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+ foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in
+ def : SVE_3_Op_Pat<VT, op, nxv8i1, VT, VT, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>;
+ foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in
+ def : SVE_3_Op_Pat<VT, op, nxv4i1, VT, VT, !cast<Instruction>(NAME # _S)>;
+
+ foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in
+ def : SVE_3_Op_Pat<VT, op, nxv2i1, VT, VT, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_perm_splice_cons<bits<2> sz8_64, string asm,
From 5c1e4403ba363e374658a6f562f3075144a16cca Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Fri, 3 May 2024 15:55:24 +0100
Subject: [PATCH 3/4] [LLVM][CodeGen][SVE] Add tests for vector extracts from
unpacked types.
---
.../sve-extract-fixed-from-scalable-vector.ll | 78 ++++++++++++++++++-
1 file changed, 76 insertions(+), 2 deletions(-)
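For context: an "unpacked" SVE type is one whose elements do not fill the
register, e.g. nxv2f16 keeps a single f16 in each 64-bit lane of a Z
register, whereas nxv8f16 fills it. A tiny illustration of the distinction,
assuming the 128-bit minimum SVE register size (isPackedSVE is a
hypothetical helper, not LLVM API):

// An SVE type nxv<N x Elt> is "packed" when N lanes of EltBits bits fill
// the whole 128-bit granule; otherwise elements sit in wider lanes.
constexpr bool isPackedSVE(unsigned NumElts, unsigned EltBits) {
  return NumElts * EltBits == 128;
}

static_assert(isPackedSVE(8, 16), "nxv8f16 is packed");
static_assert(!isPackedSVE(2, 16), "nxv2f16: one f16 per 64-bit lane");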
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
index b9c531fe33526..e91aac430110c 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
@@ -307,11 +307,85 @@ define <4 x i64> @extract_v4i64_nxv8i64_0(<vscale x 8 x i64> %arg) {
ret <4 x i64> %ext
}
+define <4 x half> @extract_v4f16_nxv2f16_0(<vscale x 2 x half> %arg) {
+; CHECK-LABEL: extract_v4f16_nxv2f16_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: addpl x9, sp, #6
+; CHECK-NEXT: subs x8, x8, #4
+; CHECK-NEXT: csel x8, xzr, x8, lo
+; CHECK-NEXT: st1h { z0.d }, p0, [sp, #3, mul vl]
+; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: csel x8, x8, xzr, lo
+; CHECK-NEXT: lsl x8, x8, #1
+; CHECK-NEXT: ldr d0, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ext = call <4 x half> @llvm.vector.extract.v4f16.nxv2f16(<vscale x 2 x half> %arg, i64 0)
+ ret <4 x half> %ext
+}
+
+define <4 x half> @extract_v4f16_nxv2f16_4(<vscale x 2 x half> %arg) {
+; CHECK-LABEL: extract_v4f16_nxv2f16_4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8
+; CHECK-NEXT: mov w9, #4 // =0x4
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: subs x8, x8, #4
+; CHECK-NEXT: csel x8, xzr, x8, lo
+; CHECK-NEXT: st1h { z0.d }, p0, [sp, #3, mul vl]
+; CHECK-NEXT: cmp x8, #4
+; CHECK-NEXT: csel x8, x8, x9, lo
+; CHECK-NEXT: addpl x9, sp, #6
+; CHECK-NEXT: lsl x8, x8, #1
+; CHECK-NEXT: ldr d0, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ext = call <4 x half> @llvm.vector.extract.v4f16.nxv2f16(<vscale x 2 x half> %arg, i64 4)
+ ret <4 x half> %ext
+}
+
+define <2 x half> @extract_v2f16_nxv4f16_2(<vscale x 4 x half> %arg) {
+; CHECK-LABEL: extract_v2f16_nxv4f16_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.s, z0.s[3]
+; CHECK-NEXT: mov z0.s, z0.s[2]
+; CHECK-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+ %ext = call <2 x half> @llvm.vector.extract.v2f16.nxv4f16(<vscale x 4 x half> %arg, i64 2)
+ ret <2 x half> %ext
+}
+
+define <2 x half> @extract_v2f16_nxv4f16_6(<vscale x 4 x half> %arg) {
+; CHECK-LABEL: extract_v2f16_nxv4f16_6:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.s, z0.s[7]
+; CHECK-NEXT: mov z0.s, z0.s[6]
+; CHECK-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+ %ext = call <2 x half> @llvm.vector.extract.v2f16.nxv4f16(<vscale x 4 x half> %arg, i64 6)
+ ret <2 x half> %ext
+}
-declare <2 x i64> @llvm.vector.extract.v2i64.nxv8i64(<vscale x 8 x i64>, i64)
-declare <4 x i64> @llvm.vector.extract.v4i64.nxv8i64(<vscale x 8 x i64>, i64)
declare <4 x float> @llvm.vector.extract.v4f32.nxv16f32(<vscale x 16 x float>, i64)
declare <2 x float> @llvm.vector.extract.v2f32.nxv16f32(<vscale x 16 x float>, i64)
+declare <4 x half> @llvm.vector.extract.v4f16.nxv2f16(<vscale x 2 x half>, i64);
+declare <2 x half> @llvm.vector.extract.v2f16.nxv4f16(<vscale x 4 x half>, i64);
+declare <2 x i64> @llvm.vector.extract.v2i64.nxv8i64(<vscale x 8 x i64>, i64)
+declare <4 x i64> @llvm.vector.extract.v4i64.nxv8i64(<vscale x 8 x i64>, i64)
declare <4 x i32> @llvm.vector.extract.v4i32.nxv16i32(<vscale x 16 x i32>, i64)
declare <2 x i32> @llvm.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32>, i64)
declare <8 x i16> @llvm.vector.extract.v8i16.nxv32i16(<vscale x 32 x i16>, i64)
From 6f0589b510be7235f5600a885c436c39464d8885 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Fri, 3 May 2024 13:02:04 +0100
Subject: [PATCH 4/4] [LLVM][CodeGen][SVE] Improve custom lowering for
EXTRACT_SUBVECTOR.
We can extract any legal fixed-length vector from a scalable vector
by using VECTOR_SPLICE. I've also taken the time to simplify the
code a little.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 59 ++++----
.../sve-extract-fixed-from-scalable-vector.ll | 48 ++----
.../AArch64/sve-extract-fixed-vector.ll | 142 +++---------------
...e-streaming-mode-fixed-length-int-to-fp.ll | 30 ++--
4 files changed, 77 insertions(+), 202 deletions(-)
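In SelectionDAG terms, the new scalable-vector path reads as below; this is
a straight-line reconstruction of the added lines from the
AArch64ISelLowering.cpp hunk that follows, shown together for readability.
It assumes the surrounding AArch64TargetLowering context (DAG, Subtarget,
and helpers like getPackedSVEVectorVT):

if (InVT.isScalableVector() ||
    useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
  SDLoc DL(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);

  // Unpacked input (e.g. nxv2f16): widen into the bottom of a packed
  // container and retry, so the remaining steps only see packed types.
  EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
  if (PackedVT != InVT) {
    SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
                                    DAG.getUNDEF(PackedVT), Vec,
                                    DAG.getVectorIdxConstant(0, DL));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
  }

  // Index 0 is matched by custom code during ISelDAGToDAG.
  if (isNullConstant(Idx))
    return Op;

  // Otherwise splice the requested subvector to the front, after which
  // the extract reduces to the index-0 case.
  SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
  return convertFromScalableVector(DAG, VT, Splice);
}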
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a1931bc1e4936..de5799fca7261 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13897,45 +13897,52 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
- assert(Op.getValueType().isFixedLengthVector() &&
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() &&
"Only cases that extract a fixed length vector are supported!");
-
EVT InVT = Op.getOperand(0).getValueType();
- unsigned Idx = Op.getConstantOperandVal(1);
- unsigned Size = Op.getValueSizeInBits();
// If we don't have legal types yet, do nothing
- if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
+ if (!isTypeLegal(InVT))
return SDValue();
- if (InVT.isScalableVector()) {
- // This will be matched by custom code during ISelDAGToDAG.
- if (Idx == 0 && isPackedVectorType(InVT, DAG))
+ if (InVT.is128BitVector()) {
+ assert(VT.is64BitVector() && "Extracting unexpected vector type!");
+ unsigned Idx = Op.getConstantOperandVal(1);
+
+ // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
+ if (Idx == 0)
return Op;
- return SDValue();
+ // If this is extracting the upper 64-bits of a 128-bit vector, we match
+ // that directly.
+ if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
+ return Op;
}
- // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
- if (Idx == 0 && InVT.getSizeInBits() <= 128)
- return Op;
-
- // If this is extracting the upper 64-bits of a 128-bit vector, we match
- // that directly.
- if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
- InVT.getSizeInBits() == 128 && Subtarget->isNeonAvailable())
- return Op;
-
- if (useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
+ if (InVT.isScalableVector() ||
+ useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
SDLoc DL(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
- SDValue NewInVec =
- convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
+ EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
+ if (PackedVT != InVT) {
+ // Pack input into the bottom part of an SVE register and try again.
+ SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
+ DAG.getUNDEF(PackedVT), Vec,
+ DAG.getVectorIdxConstant(0, DL));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
+ }
+
+ // This will get matched by custom code during ISelDAGToDAG.
+ if (isNullConstant(Idx))
+ return Op;
- SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
- NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
- return convertFromScalableVector(DAG, Op.getValueType(), Splice);
+ assert(InVT.isScalableVector() && "Unexpected vector type!");
+ // Move requested subvector to the start of the vector and try again.
+ SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
+ return convertFromScalableVector(DAG, VT, Splice);
}
return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
index e91aac430110c..641050ae69d9b 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
@@ -143,15 +143,8 @@ define <4 x float> @extract_v4f32_nxv16f32_12(<vscale x 16 x float> %arg) {
define <2 x float> @extract_v2f32_nxv16f32_2(<vscale x 16 x float> %arg) {
; CHECK-LABEL: extract_v2f32_nxv16f32_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%ext = call <2 x float> @llvm.vector.extract.v2f32.nxv16f32(<vscale x 16 x float> %arg, i64 2)
ret <2 x float> %ext
@@ -274,15 +267,8 @@ define <4 x i3> @extract_v4i3_nxv32i3_16(<vscale x 32 x i3> %arg) {
define <2 x i32> @extract_v2i32_nxv16i32_2(<vscale x 16 x i32> %arg) {
; CHECK-LABEL: extract_v2i32_nxv16i32_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%ext = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32> %arg, i64 2)
ret <2 x i32> %ext
@@ -314,16 +300,9 @@ define <4 x half> @extract_v4f16_nxv2f16_0(<vscale x 2 x half> %arg) {
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: cntd x8
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: addpl x9, sp, #6
-; CHECK-NEXT: subs x8, x8, #4
-; CHECK-NEXT: csel x8, xzr, x8, lo
-; CHECK-NEXT: st1h { z0.d }, p0, [sp, #3, mul vl]
-; CHECK-NEXT: cmp x8, #0
-; CHECK-NEXT: csel x8, x8, xzr, lo
-; CHECK-NEXT: lsl x8, x8, #1
-; CHECK-NEXT: ldr d0, [x9, x8]
+; CHECK-NEXT: st1h { z0.d }, p0, [sp]
+; CHECK-NEXT: ldr d0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -338,17 +317,12 @@ define <4 x half> @extract_v4f16_nxv2f16_4(<vscale x 2 x half> %arg) {
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: cntd x8
-; CHECK-NEXT: mov w9, #4 // =0x4
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: subs x8, x8, #4
-; CHECK-NEXT: csel x8, xzr, x8, lo
-; CHECK-NEXT: st1h { z0.d }, p0, [sp, #3, mul vl]
-; CHECK-NEXT: cmp x8, #4
-; CHECK-NEXT: csel x8, x8, x9, lo
-; CHECK-NEXT: addpl x9, sp, #6
-; CHECK-NEXT: lsl x8, x8, #1
-; CHECK-NEXT: ldr d0, [x9, x8]
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: st1h { z0.d }, p0, [sp]
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [sp]
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
index 88268104889fd..b05b46a75b698 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
@@ -15,20 +15,8 @@ define <2 x i64> @extract_v2i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind {
define <2 x i64> @extract_v2i64_nxv2i64_idx2(<vscale x 2 x i64> %vec) nounwind {
; CHECK-LABEL: extract_v2i64_nxv2i64_idx2:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: cntd x8
-; CHECK-NEXT: mov w9, #2 // =0x2
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sub x8, x8, #2
-; CHECK-NEXT: cmp x8, #2
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: csel x8, x8, x9, lo
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: lsl x8, x8, #3
-; CHECK-NEXT: ldr q0, [x9, x8]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%retval = call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %vec, i64 2)
ret <2 x i64> %retval
@@ -48,20 +36,8 @@ define <4 x i32> @extract_v4i32_nxv4i32(<vscale x 4 x i32> %vec) nounwind {
define <4 x i32> @extract_v4i32_nxv4i32_idx4(<vscale x 4 x i32> %vec) nounwind {
; CHECK-LABEL: extract_v4i32_nxv4i32_idx4:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: cntw x8
-; CHECK-NEXT: mov w9, #4 // =0x4
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: sub x8, x8, #4
-; CHECK-NEXT: cmp x8, #4
-; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: csel x8, x8, x9, lo
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: lsl x8, x8, #2
-; CHECK-NEXT: ldr q0, [x9, x8]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%retval = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> %vec, i64 4)
ret <4 x i32> %retval
@@ -82,18 +58,9 @@ define <4 x i32> @extract_v4i32_nxv2i32(<vscale x 2 x i32> %vec) nounwind #1 {
define <4 x i32> @extract_v4i32_nxv2i32_idx4(<vscale x 2 x i32> %vec) nounwind #1 {
; CHECK-LABEL: extract_v4i32_nxv2i32_idx4:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov x8, #4 // =0x4
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: ptrue p1.d, vl4
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: ld1d { z0.d }, p1/z, [x9, x8, lsl #3]
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #32
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%retval = call <4 x i32> @llvm.vector.extract.v4i32.nxv2i32(<vscale x 2 x i32> %vec, i64 4)
ret <4 x i32> %retval
@@ -113,20 +80,8 @@ define <8 x i16> @extract_v8i16_nxv8i16(<vscale x 8 x i16> %vec) nounwind {
define <8 x i16> @extract_v8i16_nxv8i16_idx8(<vscale x 8 x i16> %vec) nounwind {
; CHECK-LABEL: extract_v8i16_nxv8i16_idx8:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: cnth x8
-; CHECK-NEXT: mov w9, #8 // =0x8
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: sub x8, x8, #8
-; CHECK-NEXT: cmp x8, #8
-; CHECK-NEXT: st1h { z0.h }, p0, [sp]
-; CHECK-NEXT: csel x8, x8, x9, lo
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: lsl x8, x8, #1
-; CHECK-NEXT: ldr q0, [x9, x8]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%retval = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> %vec, i64 8)
ret <8 x i16> %retval
@@ -147,18 +102,9 @@ define <8 x i16> @extract_v8i16_nxv4i16(<vscale x 4 x i16> %vec) nounwind #1 {
define <8 x i16> @extract_v8i16_nxv4i16_idx8(<vscale x 4 x i16> %vec) nounwind #1 {
; CHECK-LABEL: extract_v8i16_nxv4i16_idx8:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: mov x8, #8 // =0x8
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: ptrue p1.s, vl8
-; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: ld1w { z0.s }, p1/z, [x9, x8, lsl #2]
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #32
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%retval = call <8 x i16> @llvm.vector.extract.v8i16.nxv4i16(<vscale x 4 x i16> %vec, i64 8)
ret <8 x i16> %retval
@@ -180,19 +126,10 @@ define <8 x i16> @extract_v8i16_nxv2i16(<vscale x 2 x i16> %vec) nounwind #1 {
define <8 x i16> @extract_v8i16_nxv2i16_idx8(<vscale x 2 x i16> %vec) nounwind #1 {
; CHECK-LABEL: extract_v8i16_nxv2i16_idx8:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov x8, #8 // =0x8
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: ptrue p1.d, vl8
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: ld1d { z0.d }, p1/z, [x9, x8, lsl #3]
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%retval = call <8 x i16> @llvm.vector.extract.v8i16.nxv2i16(<vscale x 2 x i16> %vec, i64 8)
ret <8 x i16> %retval
@@ -212,19 +149,8 @@ define <16 x i8> @extract_v16i8_nxv16i8(<vscale x 16 x i8> %vec) nounwind {
define <16 x i8> @extract_v16i8_nxv16i8_idx16(<vscale x 16 x i8> %vec) nounwind {
; CHECK-LABEL: extract_v16i8_nxv16i8_idx16:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: mov w9, #16 // =0x10
-; CHECK-NEXT: sub x8, x8, #16
-; CHECK-NEXT: cmp x8, #16
-; CHECK-NEXT: st1b { z0.b }, p0, [sp]
-; CHECK-NEXT: csel x8, x8, x9, lo
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: ldr q0, [x9, x8]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%retval = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> %vec, i64 16)
ret <16 x i8> %retval
@@ -245,18 +171,9 @@ define <16 x i8> @extract_v16i8_nxv8i8(<vscale x 8 x i8> %vec) nounwind #1 {
define <16 x i8> @extract_v16i8_nxv8i8_idx16(<vscale x 8 x i8> %vec) nounwind #1 {
; CHECK-LABEL: extract_v16i8_nxv8i8_idx16:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: mov x8, #16 // =0x10
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: ptrue p1.h, vl16
-; CHECK-NEXT: st1h { z0.h }, p0, [sp]
-; CHECK-NEXT: ld1h { z0.h }, p1/z, [x9, x8, lsl #1]
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #32
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%retval = call <16 x i8> @llvm.vector.extract.v16i8.nxv8i8(<vscale x 8 x i8> %vec, i64 16)
ret <16 x i8> %retval
@@ -278,19 +195,10 @@ define <16 x i8> @extract_v16i8_nxv4i8(<vscale x 4 x i8> %vec) nounwind #1 {
define <16 x i8> @extract_v16i8_nxv4i8_idx16(<vscale x 4 x i8> %vec) nounwind #1 {
; CHECK-LABEL: extract_v16i8_nxv4i8_idx16:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: mov x8, #16 // =0x10
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: ptrue p1.s, vl16
-; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: ld1w { z0.s }, p1/z, [x9, x8, lsl #2]
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%retval = call <16 x i8> @llvm.vector.extract.v16i8.nxv4i8(<vscale x 4 x i8> %vec, i64 16)
ret <16 x i8> %retval
@@ -313,17 +221,11 @@ define <16 x i8> @extract_v16i8_nxv2i8(<vscale x 2 x i8> %vec) nounwind #1 {
define <16 x i8> @extract_v16i8_nxv2i8_idx16(<vscale x 2 x i8> %vec) nounwind #1 {
; CHECK-LABEL: extract_v16i8_nxv2i8_idx16:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%retval = call <16 x i8> @llvm.vector.extract.v16i8.nxv2i8(<vscale x 2 x i8> %vec, i64 16)
ret <16 x i8> %retval
@@ -434,13 +336,8 @@ define <16 x i1> @extract_v16i1_nxv16i1(<vscale x 16 x i1> %inmask) {
define <2 x i64> @extract_fixed_v2i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind #0 {
; CHECK-LABEL: extract_fixed_v2i64_nxv2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: ldr q0, [sp, #16]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%retval = call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %vec, i64 2)
ret <2 x i64> %retval
@@ -449,14 +346,9 @@ define <2 x i64> @extract_fixed_v2i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind
define void @extract_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec, ptr %p) nounwind #0 {
; CHECK-LABEL: extract_fixed_v4i64_nxv2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #32
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%retval = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> %vec, i64 4)
store <4 x i64> %retval, ptr %p
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index d9ca19baea7d5..f7522b97b6593 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -301,18 +301,19 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) {
; CHECK-NEXT: uunpklo z1.s, z0.h
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: uunpklo z2.d, z1.s
-; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: uunpklo z2.d, z0.s
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8
; CHECK-NEXT: uunpklo z1.d, z1.s
-; CHECK-NEXT: ucvtf z2.d, p0/m, z2.d
; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: ucvtf z3.d, p0/m, z3.d
+; CHECK-NEXT: uunpklo z3.d, z3.s
+; CHECK-NEXT: ucvtf z2.d, p0/m, z2.d
; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d
; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
-; CHECK-NEXT: stp q2, q1, [x1]
-; CHECK-NEXT: stp q3, q0, [x1, #32]
+; CHECK-NEXT: ucvtf z3.d, p0/m, z3.d
+; CHECK-NEXT: stp q1, q3, [x1]
+; CHECK-NEXT: stp q2, q0, [x1, #32]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f64:
@@ -1230,18 +1231,19 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) {
; CHECK-NEXT: sunpklo z1.s, z0.h
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: sunpklo z0.s, z0.h
-; CHECK-NEXT: sunpklo z2.d, z1.s
-; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT: sunpklo z3.d, z0.s
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: sunpklo z2.d, z0.s
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8
; CHECK-NEXT: sunpklo z1.d, z1.s
-; CHECK-NEXT: scvtf z2.d, p0/m, z2.d
; CHECK-NEXT: sunpklo z0.d, z0.s
-; CHECK-NEXT: scvtf z3.d, p0/m, z3.d
+; CHECK-NEXT: sunpklo z3.d, z3.s
+; CHECK-NEXT: scvtf z2.d, p0/m, z2.d
; CHECK-NEXT: scvtf z1.d, p0/m, z1.d
; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
-; CHECK-NEXT: stp q2, q1, [x1]
-; CHECK-NEXT: stp q3, q0, [x1, #32]
+; CHECK-NEXT: scvtf z3.d, p0/m, z3.d
+; CHECK-NEXT: stp q1, q3, [x1]
+; CHECK-NEXT: stp q2, q0, [x1, #32]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f64: