[llvm] [AArch64] Combine getActiveLaneMask with vector_extract (PR #81139)
Momchil Velikov via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 18 10:00:12 PDT 2024
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/81139
>From 0f9031ce4663844fe399cc7ab3b26fc716d76b91 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 22 Dec 2023 11:30:22 +0000
Subject: [PATCH 1/2] [AArch64] Combine getActiveLaneMask with vector_extract
... into a `whilelo` instruction with a pair of predicate registers.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 61 ++-
llvm/test/CodeGen/AArch64/active_lane_mask.ll | 1 +
.../AArch64/get-active-lane-mask-extract.ll | 395 ++++++++++++++++++
3 files changed, 455 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 819e8ccd5c33f06..6634367b72b6d55 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1847,8 +1847,8 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
EVT OpVT) const {
- // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
- if (!Subtarget->hasSVE())
+ // Only SVE/SME has a 1:1 mapping from intrinsic -> instruction (whilelo).
+ if (!Subtarget->hasSVEorSME())
return true;
// We can only support legal predicate result types. We can use the SVE
@@ -20481,6 +20481,61 @@ static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
return SDValue();
}
+static SDValue tryCombineWhileLo(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
+ if (!Subtarget->hasSVE2p1() && !Subtarget->hasSME2())
+ return SDValue();
+
+ if (!N->hasNUsesOfValue(2, 0))
+ return SDValue();
+
+ const uint64_t HalfSize = N->getValueType(0).getVectorMinNumElements() / 2;
+ if (HalfSize < 2)
+ return SDValue();
+
+ auto It = N->use_begin();
+ SDNode *Lo = *It++;
+ SDNode *Hi = *It;
+
+ uint64_t OffLo, OffHi;
+ if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ !isIntImmediate(Lo->getOperand(1).getNode(), OffLo) ||
+ Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ !isIntImmediate(Hi->getOperand(1).getNode(), OffHi))
+ return SDValue();
+
+ if (OffLo > OffHi) {
+ std::swap(Lo, Hi);
+ std::swap(OffLo, OffHi);
+ }
+
+ if (OffLo != 0 || OffHi != HalfSize)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+ SDValue ID =
+ DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
+ SDValue Idx = N->getOperand(1);
+ SDValue TC = N->getOperand(2);
+ if (Idx.getValueType() != MVT::i64) {
+ Idx = DAG.getZExtOrTrunc(Idx, DL, MVT::i64);
+ TC = DAG.getZExtOrTrunc(TC, DL, MVT::i64);
+ }
+ auto R =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
+ {Lo->getValueType(0), Hi->getValueType(0)}, {ID, Idx, TC});
+
+ DCI.CombineTo(Lo, R.getValue(0));
+ DCI.CombineTo(Hi, R.getValue(1));
+
+ return SDValue(N, 0);
+}
+
static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -20811,6 +20866,8 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::aarch64_sve_ptest_last:
return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
AArch64CC::LAST_ACTIVE);
+ case Intrinsic::aarch64_sve_whilelo:
+ return tryCombineWhileLo(N, DCI, Subtarget);
}
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index a65c5d66677946a..6a509b5f3afcae4 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s
; == Scalable ==
diff --git a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
new file mode 100644
index 000000000000000..de022014a656d42
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
@@ -0,0 +1,395 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefix CHECK-SVE
+; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s -check-prefix CHECK-SVE2p1
+; RUN: llc -mattr=+sme2 < %s | FileCheck %s -check-prefix CHECK-SME2
+target triple = "aarch64-linux"
+
+; Test combining of getActiveLaneMask with a pair of extract_vector operations.
+
+define void @test_2x8bit_mask_with_32bit_index_and_trip_count(i32 %i, i32 %n) #0 {
+; CHECK-SVE-LABEL: test_2x8bit_mask_with_32bit_index_and_trip_count:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SVE-NEXT: whilelo p1.b, w0, w1
+; CHECK-SVE-NEXT: punpklo p0.h, p1.b
+; CHECK-SVE-NEXT: punpkhi p1.h, p1.b
+; CHECK-SVE-NEXT: bl use
+; CHECK-SVE-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-SVE2p1-LABEL: test_2x8bit_mask_with_32bit_index_and_trip_count:
+; CHECK-SVE2p1: // %bb.0:
+; CHECK-SVE2p1-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SVE2p1-NEXT: mov w8, w1
+; CHECK-SVE2p1-NEXT: mov w9, w0
+; CHECK-SVE2p1-NEXT: whilelo { p0.h, p1.h }, x9, x8
+; CHECK-SVE2p1-NEXT: bl use
+; CHECK-SVE2p1-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SVE2p1-NEXT: ret
+;
+; CHECK-SME2-LABEL: test_2x8bit_mask_with_32bit_index_and_trip_count:
+; CHECK-SME2: // %bb.0:
+; CHECK-SME2-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SME2-NEXT: mov w8, w1
+; CHECK-SME2-NEXT: mov w9, w0
+; CHECK-SME2-NEXT: whilelo { p0.h, p1.h }, x9, x8
+; CHECK-SME2-NEXT: bl use
+; CHECK-SME2-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SME2-NEXT: ret
+ %r = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 %i, i32 %n)
+ %v0 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 0)
+ %v1 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 8)
+ call void @use(<vscale x 8 x i1> %v0, <vscale x 8 x i1> %v1)
+ ret void
+}
+
+define void @test_2x8bit_mask_with_64bit_index_and_trip_count(i64 %i, i64 %n) #0 {
+; CHECK-SVE-LABEL: test_2x8bit_mask_with_64bit_index_and_trip_count:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SVE-NEXT: whilelo p1.b, x0, x1
+; CHECK-SVE-NEXT: punpklo p0.h, p1.b
+; CHECK-SVE-NEXT: punpkhi p1.h, p1.b
+; CHECK-SVE-NEXT: bl use
+; CHECK-SVE-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-SVE2p1-LABEL: test_2x8bit_mask_with_64bit_index_and_trip_count:
+; CHECK-SVE2p1: // %bb.0:
+; CHECK-SVE2p1-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SVE2p1-NEXT: whilelo { p0.h, p1.h }, x0, x1
+; CHECK-SVE2p1-NEXT: bl use
+; CHECK-SVE2p1-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SVE2p1-NEXT: ret
+;
+; CHECK-SME2-LABEL: test_2x8bit_mask_with_64bit_index_and_trip_count:
+; CHECK-SME2: // %bb.0:
+; CHECK-SME2-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SME2-NEXT: whilelo { p0.h, p1.h }, x0, x1
+; CHECK-SME2-NEXT: bl use
+; CHECK-SME2-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SME2-NEXT: ret
+ %r = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %i, i64 %n)
+ %v0 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 0)
+ %v1 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 8)
+ call void @use(<vscale x 8 x i1> %v0, <vscale x 8 x i1> %v1)
+ ret void
+}
+
+define void @test_edge_case_2x1bit_mask(i64 %i, i64 %n) #0 {
+; CHECK-SVE-LABEL: test_edge_case_2x1bit_mask:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SVE-NEXT: whilelo p1.d, x0, x1
+; CHECK-SVE-NEXT: punpklo p0.h, p1.b
+; CHECK-SVE-NEXT: punpkhi p1.h, p1.b
+; CHECK-SVE-NEXT: bl use
+; CHECK-SVE-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-SVE2p1-LABEL: test_edge_case_2x1bit_mask:
+; CHECK-SVE2p1: // %bb.0:
+; CHECK-SVE2p1-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SVE2p1-NEXT: whilelo p1.d, x0, x1
+; CHECK-SVE2p1-NEXT: punpklo p0.h, p1.b
+; CHECK-SVE2p1-NEXT: punpkhi p1.h, p1.b
+; CHECK-SVE2p1-NEXT: bl use
+; CHECK-SVE2p1-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SVE2p1-NEXT: ret
+;
+; CHECK-SME2-LABEL: test_edge_case_2x1bit_mask:
+; CHECK-SME2: // %bb.0:
+; CHECK-SME2-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SME2-NEXT: whilelo p1.d, x0, x1
+; CHECK-SME2-NEXT: punpklo p0.h, p1.b
+; CHECK-SME2-NEXT: punpkhi p1.h, p1.b
+; CHECK-SME2-NEXT: bl use
+; CHECK-SME2-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SME2-NEXT: ret
+ %r = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %i, i64 %n)
+ %v0 = call <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv2i1.i64(<vscale x 2 x i1> %r, i64 0)
+ %v1 = call <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv2i1.i64(<vscale x 2 x i1> %r, i64 1)
+ call void @use(<vscale x 1 x i1> %v0, <vscale x 1 x i1> %v1)
+ ret void
+}
+
+define void @test_edge_case_2x16bit_mask(i64 %i, i64 %n) #0 {
+; CHECK-SVE-LABEL: test_edge_case_2x16bit_mask:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SVE-NEXT: index z1.d, #0, #1
+; CHECK-SVE-NEXT: mov z0.d, x0
+; CHECK-SVE-NEXT: ptrue p0.d
+; CHECK-SVE-NEXT: mov z3.d, x1
+; CHECK-SVE-NEXT: mov z2.d, z1.d
+; CHECK-SVE-NEXT: mov z4.d, z1.d
+; CHECK-SVE-NEXT: mov z6.d, z1.d
+; CHECK-SVE-NEXT: uqadd z17.d, z1.d, z0.d
+; CHECK-SVE-NEXT: incd z1.d, all, mul #8
+; CHECK-SVE-NEXT: incd z2.d
+; CHECK-SVE-NEXT: incd z4.d, all, mul #2
+; CHECK-SVE-NEXT: incd z6.d, all, mul #4
+; CHECK-SVE-NEXT: cmphi p1.d, p0/z, z3.d, z17.d
+; CHECK-SVE-NEXT: uqadd z1.d, z1.d, z0.d
+; CHECK-SVE-NEXT: mov z5.d, z2.d
+; CHECK-SVE-NEXT: uqadd z18.d, z2.d, z0.d
+; CHECK-SVE-NEXT: mov z7.d, z2.d
+; CHECK-SVE-NEXT: mov z16.d, z4.d
+; CHECK-SVE-NEXT: uqadd z19.d, z4.d, z0.d
+; CHECK-SVE-NEXT: uqadd z20.d, z6.d, z0.d
+; CHECK-SVE-NEXT: incd z2.d, all, mul #8
+; CHECK-SVE-NEXT: incd z4.d, all, mul #8
+; CHECK-SVE-NEXT: incd z6.d, all, mul #8
+; CHECK-SVE-NEXT: incd z5.d, all, mul #2
+; CHECK-SVE-NEXT: incd z7.d, all, mul #4
+; CHECK-SVE-NEXT: cmphi p2.d, p0/z, z3.d, z18.d
+; CHECK-SVE-NEXT: incd z16.d, all, mul #4
+; CHECK-SVE-NEXT: cmphi p3.d, p0/z, z3.d, z19.d
+; CHECK-SVE-NEXT: cmphi p5.d, p0/z, z3.d, z20.d
+; CHECK-SVE-NEXT: uqadd z2.d, z2.d, z0.d
+; CHECK-SVE-NEXT: uqadd z4.d, z4.d, z0.d
+; CHECK-SVE-NEXT: uqadd z6.d, z6.d, z0.d
+; CHECK-SVE-NEXT: mov z18.d, z5.d
+; CHECK-SVE-NEXT: uqadd z17.d, z5.d, z0.d
+; CHECK-SVE-NEXT: uqadd z19.d, z7.d, z0.d
+; CHECK-SVE-NEXT: incd z5.d, all, mul #8
+; CHECK-SVE-NEXT: incd z7.d, all, mul #8
+; CHECK-SVE-NEXT: uzp1 p1.s, p1.s, p2.s
+; CHECK-SVE-NEXT: incd z18.d, all, mul #4
+; CHECK-SVE-NEXT: cmphi p8.d, p0/z, z3.d, z2.d
+; CHECK-SVE-NEXT: cmphi p4.d, p0/z, z3.d, z17.d
+; CHECK-SVE-NEXT: uqadd z17.d, z16.d, z0.d
+; CHECK-SVE-NEXT: incd z16.d, all, mul #8
+; CHECK-SVE-NEXT: uqadd z5.d, z5.d, z0.d
+; CHECK-SVE-NEXT: uqadd z7.d, z7.d, z0.d
+; CHECK-SVE-NEXT: cmphi p6.d, p0/z, z3.d, z19.d
+; CHECK-SVE-NEXT: uqadd z20.d, z18.d, z0.d
+; CHECK-SVE-NEXT: incd z18.d, all, mul #8
+; CHECK-SVE-NEXT: uzp1 p3.s, p3.s, p4.s
+; CHECK-SVE-NEXT: uqadd z16.d, z16.d, z0.d
+; CHECK-SVE-NEXT: cmphi p7.d, p0/z, z3.d, z17.d
+; CHECK-SVE-NEXT: cmphi p4.d, p0/z, z3.d, z1.d
+; CHECK-SVE-NEXT: uzp1 p5.s, p5.s, p6.s
+; CHECK-SVE-NEXT: cmphi p6.d, p0/z, z3.d, z4.d
+; CHECK-SVE-NEXT: cmphi p9.d, p0/z, z3.d, z5.d
+; CHECK-SVE-NEXT: cmphi p10.d, p0/z, z3.d, z7.d
+; CHECK-SVE-NEXT: uqadd z0.d, z18.d, z0.d
+; CHECK-SVE-NEXT: cmphi p2.d, p0/z, z3.d, z20.d
+; CHECK-SVE-NEXT: uzp1 p4.s, p4.s, p8.s
+; CHECK-SVE-NEXT: cmphi p8.d, p0/z, z3.d, z16.d
+; CHECK-SVE-NEXT: uzp1 p6.s, p6.s, p9.s
+; CHECK-SVE-NEXT: uzp1 p1.h, p1.h, p3.h
+; CHECK-SVE-NEXT: uzp1 p2.s, p7.s, p2.s
+; CHECK-SVE-NEXT: cmphi p7.d, p0/z, z3.d, z6.d
+; CHECK-SVE-NEXT: cmphi p0.d, p0/z, z3.d, z0.d
+; CHECK-SVE-NEXT: uzp1 p7.s, p7.s, p10.s
+; CHECK-SVE-NEXT: uzp1 p0.s, p8.s, p0.s
+; CHECK-SVE-NEXT: uzp1 p3.h, p4.h, p6.h
+; CHECK-SVE-NEXT: uzp1 p2.h, p5.h, p2.h
+; CHECK-SVE-NEXT: uzp1 p4.h, p7.h, p0.h
+; CHECK-SVE-NEXT: uzp1 p0.b, p1.b, p2.b
+; CHECK-SVE-NEXT: uzp1 p1.b, p3.b, p4.b
+; CHECK-SVE-NEXT: bl use
+; CHECK-SVE-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-SVE2p1-LABEL: test_edge_case_2x16bit_mask:
+; CHECK-SVE2p1: // %bb.0:
+; CHECK-SVE2p1-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SVE2p1-NEXT: index z1.d, #0, #1
+; CHECK-SVE2p1-NEXT: mov z0.d, x0
+; CHECK-SVE2p1-NEXT: ptrue p0.d
+; CHECK-SVE2p1-NEXT: mov z3.d, x1
+; CHECK-SVE2p1-NEXT: mov z2.d, z1.d
+; CHECK-SVE2p1-NEXT: mov z4.d, z1.d
+; CHECK-SVE2p1-NEXT: mov z6.d, z1.d
+; CHECK-SVE2p1-NEXT: uqadd z17.d, z1.d, z0.d
+; CHECK-SVE2p1-NEXT: incd z1.d, all, mul #8
+; CHECK-SVE2p1-NEXT: incd z2.d
+; CHECK-SVE2p1-NEXT: incd z4.d, all, mul #2
+; CHECK-SVE2p1-NEXT: incd z6.d, all, mul #4
+; CHECK-SVE2p1-NEXT: cmphi p1.d, p0/z, z3.d, z17.d
+; CHECK-SVE2p1-NEXT: uqadd z1.d, z1.d, z0.d
+; CHECK-SVE2p1-NEXT: mov z5.d, z2.d
+; CHECK-SVE2p1-NEXT: uqadd z18.d, z2.d, z0.d
+; CHECK-SVE2p1-NEXT: mov z7.d, z2.d
+; CHECK-SVE2p1-NEXT: mov z16.d, z4.d
+; CHECK-SVE2p1-NEXT: uqadd z19.d, z4.d, z0.d
+; CHECK-SVE2p1-NEXT: uqadd z20.d, z6.d, z0.d
+; CHECK-SVE2p1-NEXT: incd z2.d, all, mul #8
+; CHECK-SVE2p1-NEXT: incd z4.d, all, mul #8
+; CHECK-SVE2p1-NEXT: incd z6.d, all, mul #8
+; CHECK-SVE2p1-NEXT: incd z5.d, all, mul #2
+; CHECK-SVE2p1-NEXT: incd z7.d, all, mul #4
+; CHECK-SVE2p1-NEXT: cmphi p2.d, p0/z, z3.d, z18.d
+; CHECK-SVE2p1-NEXT: incd z16.d, all, mul #4
+; CHECK-SVE2p1-NEXT: cmphi p3.d, p0/z, z3.d, z19.d
+; CHECK-SVE2p1-NEXT: cmphi p5.d, p0/z, z3.d, z20.d
+; CHECK-SVE2p1-NEXT: uqadd z2.d, z2.d, z0.d
+; CHECK-SVE2p1-NEXT: uqadd z4.d, z4.d, z0.d
+; CHECK-SVE2p1-NEXT: uqadd z6.d, z6.d, z0.d
+; CHECK-SVE2p1-NEXT: mov z18.d, z5.d
+; CHECK-SVE2p1-NEXT: uqadd z17.d, z5.d, z0.d
+; CHECK-SVE2p1-NEXT: uqadd z19.d, z7.d, z0.d
+; CHECK-SVE2p1-NEXT: incd z5.d, all, mul #8
+; CHECK-SVE2p1-NEXT: incd z7.d, all, mul #8
+; CHECK-SVE2p1-NEXT: uzp1 p1.s, p1.s, p2.s
+; CHECK-SVE2p1-NEXT: incd z18.d, all, mul #4
+; CHECK-SVE2p1-NEXT: cmphi p8.d, p0/z, z3.d, z2.d
+; CHECK-SVE2p1-NEXT: cmphi p4.d, p0/z, z3.d, z17.d
+; CHECK-SVE2p1-NEXT: uqadd z17.d, z16.d, z0.d
+; CHECK-SVE2p1-NEXT: incd z16.d, all, mul #8
+; CHECK-SVE2p1-NEXT: uqadd z5.d, z5.d, z0.d
+; CHECK-SVE2p1-NEXT: uqadd z7.d, z7.d, z0.d
+; CHECK-SVE2p1-NEXT: cmphi p6.d, p0/z, z3.d, z19.d
+; CHECK-SVE2p1-NEXT: uqadd z20.d, z18.d, z0.d
+; CHECK-SVE2p1-NEXT: incd z18.d, all, mul #8
+; CHECK-SVE2p1-NEXT: uzp1 p3.s, p3.s, p4.s
+; CHECK-SVE2p1-NEXT: uqadd z16.d, z16.d, z0.d
+; CHECK-SVE2p1-NEXT: cmphi p7.d, p0/z, z3.d, z17.d
+; CHECK-SVE2p1-NEXT: cmphi p4.d, p0/z, z3.d, z1.d
+; CHECK-SVE2p1-NEXT: uzp1 p5.s, p5.s, p6.s
+; CHECK-SVE2p1-NEXT: cmphi p6.d, p0/z, z3.d, z4.d
+; CHECK-SVE2p1-NEXT: cmphi p9.d, p0/z, z3.d, z5.d
+; CHECK-SVE2p1-NEXT: cmphi p10.d, p0/z, z3.d, z7.d
+; CHECK-SVE2p1-NEXT: uqadd z0.d, z18.d, z0.d
+; CHECK-SVE2p1-NEXT: cmphi p2.d, p0/z, z3.d, z20.d
+; CHECK-SVE2p1-NEXT: uzp1 p4.s, p4.s, p8.s
+; CHECK-SVE2p1-NEXT: cmphi p8.d, p0/z, z3.d, z16.d
+; CHECK-SVE2p1-NEXT: uzp1 p6.s, p6.s, p9.s
+; CHECK-SVE2p1-NEXT: uzp1 p1.h, p1.h, p3.h
+; CHECK-SVE2p1-NEXT: uzp1 p2.s, p7.s, p2.s
+; CHECK-SVE2p1-NEXT: cmphi p7.d, p0/z, z3.d, z6.d
+; CHECK-SVE2p1-NEXT: cmphi p0.d, p0/z, z3.d, z0.d
+; CHECK-SVE2p1-NEXT: uzp1 p7.s, p7.s, p10.s
+; CHECK-SVE2p1-NEXT: uzp1 p0.s, p8.s, p0.s
+; CHECK-SVE2p1-NEXT: uzp1 p3.h, p4.h, p6.h
+; CHECK-SVE2p1-NEXT: uzp1 p2.h, p5.h, p2.h
+; CHECK-SVE2p1-NEXT: uzp1 p4.h, p7.h, p0.h
+; CHECK-SVE2p1-NEXT: uzp1 p0.b, p1.b, p2.b
+; CHECK-SVE2p1-NEXT: uzp1 p1.b, p3.b, p4.b
+; CHECK-SVE2p1-NEXT: bl use
+; CHECK-SVE2p1-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SVE2p1-NEXT: ret
+;
+; CHECK-SME2-LABEL: test_edge_case_2x16bit_mask:
+; CHECK-SME2: // %bb.0:
+; CHECK-SME2-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SME2-NEXT: index z1.d, #0, #1
+; CHECK-SME2-NEXT: mov z0.d, x0
+; CHECK-SME2-NEXT: ptrue p0.d
+; CHECK-SME2-NEXT: mov z3.d, x1
+; CHECK-SME2-NEXT: mov z2.d, z1.d
+; CHECK-SME2-NEXT: mov z4.d, z1.d
+; CHECK-SME2-NEXT: mov z6.d, z1.d
+; CHECK-SME2-NEXT: uqadd z17.d, z1.d, z0.d
+; CHECK-SME2-NEXT: incd z1.d, all, mul #8
+; CHECK-SME2-NEXT: incd z2.d
+; CHECK-SME2-NEXT: incd z4.d, all, mul #2
+; CHECK-SME2-NEXT: incd z6.d, all, mul #4
+; CHECK-SME2-NEXT: cmphi p1.d, p0/z, z3.d, z17.d
+; CHECK-SME2-NEXT: uqadd z1.d, z1.d, z0.d
+; CHECK-SME2-NEXT: mov z5.d, z2.d
+; CHECK-SME2-NEXT: uqadd z18.d, z2.d, z0.d
+; CHECK-SME2-NEXT: mov z7.d, z2.d
+; CHECK-SME2-NEXT: mov z16.d, z4.d
+; CHECK-SME2-NEXT: uqadd z19.d, z4.d, z0.d
+; CHECK-SME2-NEXT: uqadd z20.d, z6.d, z0.d
+; CHECK-SME2-NEXT: incd z2.d, all, mul #8
+; CHECK-SME2-NEXT: incd z4.d, all, mul #8
+; CHECK-SME2-NEXT: incd z6.d, all, mul #8
+; CHECK-SME2-NEXT: incd z5.d, all, mul #2
+; CHECK-SME2-NEXT: incd z7.d, all, mul #4
+; CHECK-SME2-NEXT: cmphi p2.d, p0/z, z3.d, z18.d
+; CHECK-SME2-NEXT: incd z16.d, all, mul #4
+; CHECK-SME2-NEXT: cmphi p3.d, p0/z, z3.d, z19.d
+; CHECK-SME2-NEXT: cmphi p5.d, p0/z, z3.d, z20.d
+; CHECK-SME2-NEXT: uqadd z2.d, z2.d, z0.d
+; CHECK-SME2-NEXT: uqadd z4.d, z4.d, z0.d
+; CHECK-SME2-NEXT: uqadd z6.d, z6.d, z0.d
+; CHECK-SME2-NEXT: mov z18.d, z5.d
+; CHECK-SME2-NEXT: uqadd z17.d, z5.d, z0.d
+; CHECK-SME2-NEXT: uqadd z19.d, z7.d, z0.d
+; CHECK-SME2-NEXT: incd z5.d, all, mul #8
+; CHECK-SME2-NEXT: incd z7.d, all, mul #8
+; CHECK-SME2-NEXT: uzp1 p1.s, p1.s, p2.s
+; CHECK-SME2-NEXT: incd z18.d, all, mul #4
+; CHECK-SME2-NEXT: cmphi p8.d, p0/z, z3.d, z2.d
+; CHECK-SME2-NEXT: cmphi p4.d, p0/z, z3.d, z17.d
+; CHECK-SME2-NEXT: uqadd z17.d, z16.d, z0.d
+; CHECK-SME2-NEXT: incd z16.d, all, mul #8
+; CHECK-SME2-NEXT: uqadd z5.d, z5.d, z0.d
+; CHECK-SME2-NEXT: uqadd z7.d, z7.d, z0.d
+; CHECK-SME2-NEXT: cmphi p6.d, p0/z, z3.d, z19.d
+; CHECK-SME2-NEXT: uqadd z20.d, z18.d, z0.d
+; CHECK-SME2-NEXT: incd z18.d, all, mul #8
+; CHECK-SME2-NEXT: uzp1 p3.s, p3.s, p4.s
+; CHECK-SME2-NEXT: uqadd z16.d, z16.d, z0.d
+; CHECK-SME2-NEXT: cmphi p7.d, p0/z, z3.d, z17.d
+; CHECK-SME2-NEXT: cmphi p4.d, p0/z, z3.d, z1.d
+; CHECK-SME2-NEXT: uzp1 p5.s, p5.s, p6.s
+; CHECK-SME2-NEXT: cmphi p6.d, p0/z, z3.d, z4.d
+; CHECK-SME2-NEXT: cmphi p9.d, p0/z, z3.d, z5.d
+; CHECK-SME2-NEXT: cmphi p10.d, p0/z, z3.d, z7.d
+; CHECK-SME2-NEXT: uqadd z0.d, z18.d, z0.d
+; CHECK-SME2-NEXT: cmphi p2.d, p0/z, z3.d, z20.d
+; CHECK-SME2-NEXT: uzp1 p4.s, p4.s, p8.s
+; CHECK-SME2-NEXT: cmphi p8.d, p0/z, z3.d, z16.d
+; CHECK-SME2-NEXT: uzp1 p6.s, p6.s, p9.s
+; CHECK-SME2-NEXT: uzp1 p1.h, p1.h, p3.h
+; CHECK-SME2-NEXT: uzp1 p2.s, p7.s, p2.s
+; CHECK-SME2-NEXT: cmphi p7.d, p0/z, z3.d, z6.d
+; CHECK-SME2-NEXT: cmphi p0.d, p0/z, z3.d, z0.d
+; CHECK-SME2-NEXT: uzp1 p7.s, p7.s, p10.s
+; CHECK-SME2-NEXT: uzp1 p0.s, p8.s, p0.s
+; CHECK-SME2-NEXT: uzp1 p3.h, p4.h, p6.h
+; CHECK-SME2-NEXT: uzp1 p2.h, p5.h, p2.h
+; CHECK-SME2-NEXT: uzp1 p4.h, p7.h, p0.h
+; CHECK-SME2-NEXT: uzp1 p0.b, p1.b, p2.b
+; CHECK-SME2-NEXT: uzp1 p1.b, p3.b, p4.b
+; CHECK-SME2-NEXT: bl use
+; CHECK-SME2-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SME2-NEXT: ret
+ %r = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 %i, i64 %n)
+ %v0 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1.i64(<vscale x 32 x i1> %r, i64 0)
+ %v1 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1.i64(<vscale x 32 x i1> %r, i64 16)
+ call void @use(<vscale x 16 x i1> %v0, <vscale x 16 x i1> %v1)
+ ret void
+}
+
+define void @test_boring_case_2x2bit_mask(i64 %i, i64 %n) #0 {
+; CHECK-SVE-LABEL: test_boring_case_2x2bit_mask:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SVE-NEXT: whilelo p1.s, x0, x1
+; CHECK-SVE-NEXT: punpklo p0.h, p1.b
+; CHECK-SVE-NEXT: punpkhi p1.h, p1.b
+; CHECK-SVE-NEXT: bl use
+; CHECK-SVE-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-SVE2p1-LABEL: test_boring_case_2x2bit_mask:
+; CHECK-SVE2p1: // %bb.0:
+; CHECK-SVE2p1-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SVE2p1-NEXT: whilelo { p0.d, p1.d }, x0, x1
+; CHECK-SVE2p1-NEXT: bl use
+; CHECK-SVE2p1-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SVE2p1-NEXT: ret
+;
+; CHECK-SME2-LABEL: test_boring_case_2x2bit_mask:
+; CHECK-SME2: // %bb.0:
+; CHECK-SME2-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SME2-NEXT: whilelo { p0.d, p1.d }, x0, x1
+; CHECK-SME2-NEXT: bl use
+; CHECK-SME2-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SME2-NEXT: ret
+ %r = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %i, i64 %n)
+ %v0 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1.i64(<vscale x 4 x i1> %r, i64 0)
+ %v1 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1.i64(<vscale x 4 x i1> %r, i64 2)
+ call void @use(<vscale x 2 x i1> %v0, <vscale x 2 x i1> %v1)
+ ret void
+}
+
+declare void @use(...)
+
+attributes #0 = { nounwind }
>From c4dbea52ea72cc6feeec2734b8965068466b8928 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Thu, 18 Apr 2024 15:09:53 +0100
Subject: [PATCH 2/2] [fixup] Don't enable the transformation for SME and
simplify some code and tests
---
.../Target/AArch64/AArch64ISelLowering.cpp | 14 +-
llvm/test/CodeGen/AArch64/active_lane_mask.ll | 1 -
.../AArch64/get-active-lane-mask-extract.ll | 329 +-----------------
3 files changed, 19 insertions(+), 325 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6634367b72b6d55..1b1d8a35bc06f09 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1847,8 +1847,8 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
EVT OpVT) const {
- // Only SVE/SME has a 1:1 mapping from intrinsic -> instruction (whilelo).
- if (!Subtarget->hasSVEorSME())
+ // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
+ if (!Subtarget->hasSVE())
return true;
// We can only support legal predicate result types. We can use the SVE
@@ -20487,7 +20487,7 @@ static SDValue tryCombineWhileLo(SDNode *N,
if (DCI.isBeforeLegalize())
return SDValue();
- if (!Subtarget->hasSVE2p1() && !Subtarget->hasSME2())
+ if (!Subtarget->hasSVE2p1())
return SDValue();
if (!N->hasNUsesOfValue(2, 0))
@@ -20501,13 +20501,13 @@ static SDValue tryCombineWhileLo(SDNode *N,
SDNode *Lo = *It++;
SDNode *Hi = *It;
- uint64_t OffLo, OffHi;
if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
- !isIntImmediate(Lo->getOperand(1).getNode(), OffLo) ||
- Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
- !isIntImmediate(Hi->getOperand(1).getNode(), OffHi))
+ Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR)
return SDValue();
+ uint64_t OffLo = Lo->getConstantOperandVal(1);
+ uint64_t OffHi = Hi->getConstantOperandVal(1);
+
if (OffLo > OffHi) {
std::swap(Lo, Hi);
std::swap(OffLo, OffHi);
diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index 6a509b5f3afcae4..a65c5d66677946a 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s
; == Scalable ==
diff --git a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
index de022014a656d42..df789f9502c8302 100644
--- a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
+++ b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
@@ -1,7 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefix CHECK-SVE
; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s -check-prefix CHECK-SVE2p1
-; RUN: llc -mattr=+sme2 < %s | FileCheck %s -check-prefix CHECK-SME2
target triple = "aarch64-linux"
; Test combining of getActiveLaneMask with a pair of extract_vector operations.
@@ -9,384 +8,80 @@ target triple = "aarch64-linux"
define void @test_2x8bit_mask_with_32bit_index_and_trip_count(i32 %i, i32 %n) #0 {
; CHECK-SVE-LABEL: test_2x8bit_mask_with_32bit_index_and_trip_count:
; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-SVE-NEXT: whilelo p1.b, w0, w1
; CHECK-SVE-NEXT: punpklo p0.h, p1.b
; CHECK-SVE-NEXT: punpkhi p1.h, p1.b
-; CHECK-SVE-NEXT: bl use
-; CHECK-SVE-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SVE-NEXT: ret
+; CHECK-SVE-NEXT: b use
;
; CHECK-SVE2p1-LABEL: test_2x8bit_mask_with_32bit_index_and_trip_count:
; CHECK-SVE2p1: // %bb.0:
-; CHECK-SVE2p1-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-SVE2p1-NEXT: mov w8, w1
; CHECK-SVE2p1-NEXT: mov w9, w0
; CHECK-SVE2p1-NEXT: whilelo { p0.h, p1.h }, x9, x8
-; CHECK-SVE2p1-NEXT: bl use
-; CHECK-SVE2p1-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SVE2p1-NEXT: ret
-;
-; CHECK-SME2-LABEL: test_2x8bit_mask_with_32bit_index_and_trip_count:
-; CHECK-SME2: // %bb.0:
-; CHECK-SME2-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SME2-NEXT: mov w8, w1
-; CHECK-SME2-NEXT: mov w9, w0
-; CHECK-SME2-NEXT: whilelo { p0.h, p1.h }, x9, x8
-; CHECK-SME2-NEXT: bl use
-; CHECK-SME2-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SME2-NEXT: ret
+; CHECK-SVE2p1-NEXT: b use
%r = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 %i, i32 %n)
%v0 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 0)
%v1 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 8)
- call void @use(<vscale x 8 x i1> %v0, <vscale x 8 x i1> %v1)
+ tail call void @use(<vscale x 8 x i1> %v0, <vscale x 8 x i1> %v1)
ret void
}
define void @test_2x8bit_mask_with_64bit_index_and_trip_count(i64 %i, i64 %n) #0 {
; CHECK-SVE-LABEL: test_2x8bit_mask_with_64bit_index_and_trip_count:
; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-SVE-NEXT: whilelo p1.b, x0, x1
; CHECK-SVE-NEXT: punpklo p0.h, p1.b
; CHECK-SVE-NEXT: punpkhi p1.h, p1.b
-; CHECK-SVE-NEXT: bl use
-; CHECK-SVE-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SVE-NEXT: ret
+; CHECK-SVE-NEXT: b use
;
; CHECK-SVE2p1-LABEL: test_2x8bit_mask_with_64bit_index_and_trip_count:
; CHECK-SVE2p1: // %bb.0:
-; CHECK-SVE2p1-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-SVE2p1-NEXT: whilelo { p0.h, p1.h }, x0, x1
-; CHECK-SVE2p1-NEXT: bl use
-; CHECK-SVE2p1-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SVE2p1-NEXT: ret
-;
-; CHECK-SME2-LABEL: test_2x8bit_mask_with_64bit_index_and_trip_count:
-; CHECK-SME2: // %bb.0:
-; CHECK-SME2-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SME2-NEXT: whilelo { p0.h, p1.h }, x0, x1
-; CHECK-SME2-NEXT: bl use
-; CHECK-SME2-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SME2-NEXT: ret
+; CHECK-SVE2p1-NEXT: b use
%r = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %i, i64 %n)
%v0 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 0)
%v1 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 8)
- call void @use(<vscale x 8 x i1> %v0, <vscale x 8 x i1> %v1)
+ tail call void @use(<vscale x 8 x i1> %v0, <vscale x 8 x i1> %v1)
ret void
}
define void @test_edge_case_2x1bit_mask(i64 %i, i64 %n) #0 {
; CHECK-SVE-LABEL: test_edge_case_2x1bit_mask:
; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-SVE-NEXT: whilelo p1.d, x0, x1
; CHECK-SVE-NEXT: punpklo p0.h, p1.b
; CHECK-SVE-NEXT: punpkhi p1.h, p1.b
-; CHECK-SVE-NEXT: bl use
-; CHECK-SVE-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SVE-NEXT: ret
+; CHECK-SVE-NEXT: b use
;
; CHECK-SVE2p1-LABEL: test_edge_case_2x1bit_mask:
; CHECK-SVE2p1: // %bb.0:
-; CHECK-SVE2p1-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-SVE2p1-NEXT: whilelo p1.d, x0, x1
; CHECK-SVE2p1-NEXT: punpklo p0.h, p1.b
; CHECK-SVE2p1-NEXT: punpkhi p1.h, p1.b
-; CHECK-SVE2p1-NEXT: bl use
-; CHECK-SVE2p1-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SVE2p1-NEXT: ret
-;
-; CHECK-SME2-LABEL: test_edge_case_2x1bit_mask:
-; CHECK-SME2: // %bb.0:
-; CHECK-SME2-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SME2-NEXT: whilelo p1.d, x0, x1
-; CHECK-SME2-NEXT: punpklo p0.h, p1.b
-; CHECK-SME2-NEXT: punpkhi p1.h, p1.b
-; CHECK-SME2-NEXT: bl use
-; CHECK-SME2-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SME2-NEXT: ret
+; CHECK-SVE2p1-NEXT: b use
%r = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %i, i64 %n)
%v0 = call <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv2i1.i64(<vscale x 2 x i1> %r, i64 0)
%v1 = call <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv2i1.i64(<vscale x 2 x i1> %r, i64 1)
- call void @use(<vscale x 1 x i1> %v0, <vscale x 1 x i1> %v1)
- ret void
-}
-
-define void @test_edge_case_2x16bit_mask(i64 %i, i64 %n) #0 {
-; CHECK-SVE-LABEL: test_edge_case_2x16bit_mask:
-; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SVE-NEXT: index z1.d, #0, #1
-; CHECK-SVE-NEXT: mov z0.d, x0
-; CHECK-SVE-NEXT: ptrue p0.d
-; CHECK-SVE-NEXT: mov z3.d, x1
-; CHECK-SVE-NEXT: mov z2.d, z1.d
-; CHECK-SVE-NEXT: mov z4.d, z1.d
-; CHECK-SVE-NEXT: mov z6.d, z1.d
-; CHECK-SVE-NEXT: uqadd z17.d, z1.d, z0.d
-; CHECK-SVE-NEXT: incd z1.d, all, mul #8
-; CHECK-SVE-NEXT: incd z2.d
-; CHECK-SVE-NEXT: incd z4.d, all, mul #2
-; CHECK-SVE-NEXT: incd z6.d, all, mul #4
-; CHECK-SVE-NEXT: cmphi p1.d, p0/z, z3.d, z17.d
-; CHECK-SVE-NEXT: uqadd z1.d, z1.d, z0.d
-; CHECK-SVE-NEXT: mov z5.d, z2.d
-; CHECK-SVE-NEXT: uqadd z18.d, z2.d, z0.d
-; CHECK-SVE-NEXT: mov z7.d, z2.d
-; CHECK-SVE-NEXT: mov z16.d, z4.d
-; CHECK-SVE-NEXT: uqadd z19.d, z4.d, z0.d
-; CHECK-SVE-NEXT: uqadd z20.d, z6.d, z0.d
-; CHECK-SVE-NEXT: incd z2.d, all, mul #8
-; CHECK-SVE-NEXT: incd z4.d, all, mul #8
-; CHECK-SVE-NEXT: incd z6.d, all, mul #8
-; CHECK-SVE-NEXT: incd z5.d, all, mul #2
-; CHECK-SVE-NEXT: incd z7.d, all, mul #4
-; CHECK-SVE-NEXT: cmphi p2.d, p0/z, z3.d, z18.d
-; CHECK-SVE-NEXT: incd z16.d, all, mul #4
-; CHECK-SVE-NEXT: cmphi p3.d, p0/z, z3.d, z19.d
-; CHECK-SVE-NEXT: cmphi p5.d, p0/z, z3.d, z20.d
-; CHECK-SVE-NEXT: uqadd z2.d, z2.d, z0.d
-; CHECK-SVE-NEXT: uqadd z4.d, z4.d, z0.d
-; CHECK-SVE-NEXT: uqadd z6.d, z6.d, z0.d
-; CHECK-SVE-NEXT: mov z18.d, z5.d
-; CHECK-SVE-NEXT: uqadd z17.d, z5.d, z0.d
-; CHECK-SVE-NEXT: uqadd z19.d, z7.d, z0.d
-; CHECK-SVE-NEXT: incd z5.d, all, mul #8
-; CHECK-SVE-NEXT: incd z7.d, all, mul #8
-; CHECK-SVE-NEXT: uzp1 p1.s, p1.s, p2.s
-; CHECK-SVE-NEXT: incd z18.d, all, mul #4
-; CHECK-SVE-NEXT: cmphi p8.d, p0/z, z3.d, z2.d
-; CHECK-SVE-NEXT: cmphi p4.d, p0/z, z3.d, z17.d
-; CHECK-SVE-NEXT: uqadd z17.d, z16.d, z0.d
-; CHECK-SVE-NEXT: incd z16.d, all, mul #8
-; CHECK-SVE-NEXT: uqadd z5.d, z5.d, z0.d
-; CHECK-SVE-NEXT: uqadd z7.d, z7.d, z0.d
-; CHECK-SVE-NEXT: cmphi p6.d, p0/z, z3.d, z19.d
-; CHECK-SVE-NEXT: uqadd z20.d, z18.d, z0.d
-; CHECK-SVE-NEXT: incd z18.d, all, mul #8
-; CHECK-SVE-NEXT: uzp1 p3.s, p3.s, p4.s
-; CHECK-SVE-NEXT: uqadd z16.d, z16.d, z0.d
-; CHECK-SVE-NEXT: cmphi p7.d, p0/z, z3.d, z17.d
-; CHECK-SVE-NEXT: cmphi p4.d, p0/z, z3.d, z1.d
-; CHECK-SVE-NEXT: uzp1 p5.s, p5.s, p6.s
-; CHECK-SVE-NEXT: cmphi p6.d, p0/z, z3.d, z4.d
-; CHECK-SVE-NEXT: cmphi p9.d, p0/z, z3.d, z5.d
-; CHECK-SVE-NEXT: cmphi p10.d, p0/z, z3.d, z7.d
-; CHECK-SVE-NEXT: uqadd z0.d, z18.d, z0.d
-; CHECK-SVE-NEXT: cmphi p2.d, p0/z, z3.d, z20.d
-; CHECK-SVE-NEXT: uzp1 p4.s, p4.s, p8.s
-; CHECK-SVE-NEXT: cmphi p8.d, p0/z, z3.d, z16.d
-; CHECK-SVE-NEXT: uzp1 p6.s, p6.s, p9.s
-; CHECK-SVE-NEXT: uzp1 p1.h, p1.h, p3.h
-; CHECK-SVE-NEXT: uzp1 p2.s, p7.s, p2.s
-; CHECK-SVE-NEXT: cmphi p7.d, p0/z, z3.d, z6.d
-; CHECK-SVE-NEXT: cmphi p0.d, p0/z, z3.d, z0.d
-; CHECK-SVE-NEXT: uzp1 p7.s, p7.s, p10.s
-; CHECK-SVE-NEXT: uzp1 p0.s, p8.s, p0.s
-; CHECK-SVE-NEXT: uzp1 p3.h, p4.h, p6.h
-; CHECK-SVE-NEXT: uzp1 p2.h, p5.h, p2.h
-; CHECK-SVE-NEXT: uzp1 p4.h, p7.h, p0.h
-; CHECK-SVE-NEXT: uzp1 p0.b, p1.b, p2.b
-; CHECK-SVE-NEXT: uzp1 p1.b, p3.b, p4.b
-; CHECK-SVE-NEXT: bl use
-; CHECK-SVE-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SVE-NEXT: ret
-;
-; CHECK-SVE2p1-LABEL: test_edge_case_2x16bit_mask:
-; CHECK-SVE2p1: // %bb.0:
-; CHECK-SVE2p1-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SVE2p1-NEXT: index z1.d, #0, #1
-; CHECK-SVE2p1-NEXT: mov z0.d, x0
-; CHECK-SVE2p1-NEXT: ptrue p0.d
-; CHECK-SVE2p1-NEXT: mov z3.d, x1
-; CHECK-SVE2p1-NEXT: mov z2.d, z1.d
-; CHECK-SVE2p1-NEXT: mov z4.d, z1.d
-; CHECK-SVE2p1-NEXT: mov z6.d, z1.d
-; CHECK-SVE2p1-NEXT: uqadd z17.d, z1.d, z0.d
-; CHECK-SVE2p1-NEXT: incd z1.d, all, mul #8
-; CHECK-SVE2p1-NEXT: incd z2.d
-; CHECK-SVE2p1-NEXT: incd z4.d, all, mul #2
-; CHECK-SVE2p1-NEXT: incd z6.d, all, mul #4
-; CHECK-SVE2p1-NEXT: cmphi p1.d, p0/z, z3.d, z17.d
-; CHECK-SVE2p1-NEXT: uqadd z1.d, z1.d, z0.d
-; CHECK-SVE2p1-NEXT: mov z5.d, z2.d
-; CHECK-SVE2p1-NEXT: uqadd z18.d, z2.d, z0.d
-; CHECK-SVE2p1-NEXT: mov z7.d, z2.d
-; CHECK-SVE2p1-NEXT: mov z16.d, z4.d
-; CHECK-SVE2p1-NEXT: uqadd z19.d, z4.d, z0.d
-; CHECK-SVE2p1-NEXT: uqadd z20.d, z6.d, z0.d
-; CHECK-SVE2p1-NEXT: incd z2.d, all, mul #8
-; CHECK-SVE2p1-NEXT: incd z4.d, all, mul #8
-; CHECK-SVE2p1-NEXT: incd z6.d, all, mul #8
-; CHECK-SVE2p1-NEXT: incd z5.d, all, mul #2
-; CHECK-SVE2p1-NEXT: incd z7.d, all, mul #4
-; CHECK-SVE2p1-NEXT: cmphi p2.d, p0/z, z3.d, z18.d
-; CHECK-SVE2p1-NEXT: incd z16.d, all, mul #4
-; CHECK-SVE2p1-NEXT: cmphi p3.d, p0/z, z3.d, z19.d
-; CHECK-SVE2p1-NEXT: cmphi p5.d, p0/z, z3.d, z20.d
-; CHECK-SVE2p1-NEXT: uqadd z2.d, z2.d, z0.d
-; CHECK-SVE2p1-NEXT: uqadd z4.d, z4.d, z0.d
-; CHECK-SVE2p1-NEXT: uqadd z6.d, z6.d, z0.d
-; CHECK-SVE2p1-NEXT: mov z18.d, z5.d
-; CHECK-SVE2p1-NEXT: uqadd z17.d, z5.d, z0.d
-; CHECK-SVE2p1-NEXT: uqadd z19.d, z7.d, z0.d
-; CHECK-SVE2p1-NEXT: incd z5.d, all, mul #8
-; CHECK-SVE2p1-NEXT: incd z7.d, all, mul #8
-; CHECK-SVE2p1-NEXT: uzp1 p1.s, p1.s, p2.s
-; CHECK-SVE2p1-NEXT: incd z18.d, all, mul #4
-; CHECK-SVE2p1-NEXT: cmphi p8.d, p0/z, z3.d, z2.d
-; CHECK-SVE2p1-NEXT: cmphi p4.d, p0/z, z3.d, z17.d
-; CHECK-SVE2p1-NEXT: uqadd z17.d, z16.d, z0.d
-; CHECK-SVE2p1-NEXT: incd z16.d, all, mul #8
-; CHECK-SVE2p1-NEXT: uqadd z5.d, z5.d, z0.d
-; CHECK-SVE2p1-NEXT: uqadd z7.d, z7.d, z0.d
-; CHECK-SVE2p1-NEXT: cmphi p6.d, p0/z, z3.d, z19.d
-; CHECK-SVE2p1-NEXT: uqadd z20.d, z18.d, z0.d
-; CHECK-SVE2p1-NEXT: incd z18.d, all, mul #8
-; CHECK-SVE2p1-NEXT: uzp1 p3.s, p3.s, p4.s
-; CHECK-SVE2p1-NEXT: uqadd z16.d, z16.d, z0.d
-; CHECK-SVE2p1-NEXT: cmphi p7.d, p0/z, z3.d, z17.d
-; CHECK-SVE2p1-NEXT: cmphi p4.d, p0/z, z3.d, z1.d
-; CHECK-SVE2p1-NEXT: uzp1 p5.s, p5.s, p6.s
-; CHECK-SVE2p1-NEXT: cmphi p6.d, p0/z, z3.d, z4.d
-; CHECK-SVE2p1-NEXT: cmphi p9.d, p0/z, z3.d, z5.d
-; CHECK-SVE2p1-NEXT: cmphi p10.d, p0/z, z3.d, z7.d
-; CHECK-SVE2p1-NEXT: uqadd z0.d, z18.d, z0.d
-; CHECK-SVE2p1-NEXT: cmphi p2.d, p0/z, z3.d, z20.d
-; CHECK-SVE2p1-NEXT: uzp1 p4.s, p4.s, p8.s
-; CHECK-SVE2p1-NEXT: cmphi p8.d, p0/z, z3.d, z16.d
-; CHECK-SVE2p1-NEXT: uzp1 p6.s, p6.s, p9.s
-; CHECK-SVE2p1-NEXT: uzp1 p1.h, p1.h, p3.h
-; CHECK-SVE2p1-NEXT: uzp1 p2.s, p7.s, p2.s
-; CHECK-SVE2p1-NEXT: cmphi p7.d, p0/z, z3.d, z6.d
-; CHECK-SVE2p1-NEXT: cmphi p0.d, p0/z, z3.d, z0.d
-; CHECK-SVE2p1-NEXT: uzp1 p7.s, p7.s, p10.s
-; CHECK-SVE2p1-NEXT: uzp1 p0.s, p8.s, p0.s
-; CHECK-SVE2p1-NEXT: uzp1 p3.h, p4.h, p6.h
-; CHECK-SVE2p1-NEXT: uzp1 p2.h, p5.h, p2.h
-; CHECK-SVE2p1-NEXT: uzp1 p4.h, p7.h, p0.h
-; CHECK-SVE2p1-NEXT: uzp1 p0.b, p1.b, p2.b
-; CHECK-SVE2p1-NEXT: uzp1 p1.b, p3.b, p4.b
-; CHECK-SVE2p1-NEXT: bl use
-; CHECK-SVE2p1-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SVE2p1-NEXT: ret
-;
-; CHECK-SME2-LABEL: test_edge_case_2x16bit_mask:
-; CHECK-SME2: // %bb.0:
-; CHECK-SME2-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SME2-NEXT: index z1.d, #0, #1
-; CHECK-SME2-NEXT: mov z0.d, x0
-; CHECK-SME2-NEXT: ptrue p0.d
-; CHECK-SME2-NEXT: mov z3.d, x1
-; CHECK-SME2-NEXT: mov z2.d, z1.d
-; CHECK-SME2-NEXT: mov z4.d, z1.d
-; CHECK-SME2-NEXT: mov z6.d, z1.d
-; CHECK-SME2-NEXT: uqadd z17.d, z1.d, z0.d
-; CHECK-SME2-NEXT: incd z1.d, all, mul #8
-; CHECK-SME2-NEXT: incd z2.d
-; CHECK-SME2-NEXT: incd z4.d, all, mul #2
-; CHECK-SME2-NEXT: incd z6.d, all, mul #4
-; CHECK-SME2-NEXT: cmphi p1.d, p0/z, z3.d, z17.d
-; CHECK-SME2-NEXT: uqadd z1.d, z1.d, z0.d
-; CHECK-SME2-NEXT: mov z5.d, z2.d
-; CHECK-SME2-NEXT: uqadd z18.d, z2.d, z0.d
-; CHECK-SME2-NEXT: mov z7.d, z2.d
-; CHECK-SME2-NEXT: mov z16.d, z4.d
-; CHECK-SME2-NEXT: uqadd z19.d, z4.d, z0.d
-; CHECK-SME2-NEXT: uqadd z20.d, z6.d, z0.d
-; CHECK-SME2-NEXT: incd z2.d, all, mul #8
-; CHECK-SME2-NEXT: incd z4.d, all, mul #8
-; CHECK-SME2-NEXT: incd z6.d, all, mul #8
-; CHECK-SME2-NEXT: incd z5.d, all, mul #2
-; CHECK-SME2-NEXT: incd z7.d, all, mul #4
-; CHECK-SME2-NEXT: cmphi p2.d, p0/z, z3.d, z18.d
-; CHECK-SME2-NEXT: incd z16.d, all, mul #4
-; CHECK-SME2-NEXT: cmphi p3.d, p0/z, z3.d, z19.d
-; CHECK-SME2-NEXT: cmphi p5.d, p0/z, z3.d, z20.d
-; CHECK-SME2-NEXT: uqadd z2.d, z2.d, z0.d
-; CHECK-SME2-NEXT: uqadd z4.d, z4.d, z0.d
-; CHECK-SME2-NEXT: uqadd z6.d, z6.d, z0.d
-; CHECK-SME2-NEXT: mov z18.d, z5.d
-; CHECK-SME2-NEXT: uqadd z17.d, z5.d, z0.d
-; CHECK-SME2-NEXT: uqadd z19.d, z7.d, z0.d
-; CHECK-SME2-NEXT: incd z5.d, all, mul #8
-; CHECK-SME2-NEXT: incd z7.d, all, mul #8
-; CHECK-SME2-NEXT: uzp1 p1.s, p1.s, p2.s
-; CHECK-SME2-NEXT: incd z18.d, all, mul #4
-; CHECK-SME2-NEXT: cmphi p8.d, p0/z, z3.d, z2.d
-; CHECK-SME2-NEXT: cmphi p4.d, p0/z, z3.d, z17.d
-; CHECK-SME2-NEXT: uqadd z17.d, z16.d, z0.d
-; CHECK-SME2-NEXT: incd z16.d, all, mul #8
-; CHECK-SME2-NEXT: uqadd z5.d, z5.d, z0.d
-; CHECK-SME2-NEXT: uqadd z7.d, z7.d, z0.d
-; CHECK-SME2-NEXT: cmphi p6.d, p0/z, z3.d, z19.d
-; CHECK-SME2-NEXT: uqadd z20.d, z18.d, z0.d
-; CHECK-SME2-NEXT: incd z18.d, all, mul #8
-; CHECK-SME2-NEXT: uzp1 p3.s, p3.s, p4.s
-; CHECK-SME2-NEXT: uqadd z16.d, z16.d, z0.d
-; CHECK-SME2-NEXT: cmphi p7.d, p0/z, z3.d, z17.d
-; CHECK-SME2-NEXT: cmphi p4.d, p0/z, z3.d, z1.d
-; CHECK-SME2-NEXT: uzp1 p5.s, p5.s, p6.s
-; CHECK-SME2-NEXT: cmphi p6.d, p0/z, z3.d, z4.d
-; CHECK-SME2-NEXT: cmphi p9.d, p0/z, z3.d, z5.d
-; CHECK-SME2-NEXT: cmphi p10.d, p0/z, z3.d, z7.d
-; CHECK-SME2-NEXT: uqadd z0.d, z18.d, z0.d
-; CHECK-SME2-NEXT: cmphi p2.d, p0/z, z3.d, z20.d
-; CHECK-SME2-NEXT: uzp1 p4.s, p4.s, p8.s
-; CHECK-SME2-NEXT: cmphi p8.d, p0/z, z3.d, z16.d
-; CHECK-SME2-NEXT: uzp1 p6.s, p6.s, p9.s
-; CHECK-SME2-NEXT: uzp1 p1.h, p1.h, p3.h
-; CHECK-SME2-NEXT: uzp1 p2.s, p7.s, p2.s
-; CHECK-SME2-NEXT: cmphi p7.d, p0/z, z3.d, z6.d
-; CHECK-SME2-NEXT: cmphi p0.d, p0/z, z3.d, z0.d
-; CHECK-SME2-NEXT: uzp1 p7.s, p7.s, p10.s
-; CHECK-SME2-NEXT: uzp1 p0.s, p8.s, p0.s
-; CHECK-SME2-NEXT: uzp1 p3.h, p4.h, p6.h
-; CHECK-SME2-NEXT: uzp1 p2.h, p5.h, p2.h
-; CHECK-SME2-NEXT: uzp1 p4.h, p7.h, p0.h
-; CHECK-SME2-NEXT: uzp1 p0.b, p1.b, p2.b
-; CHECK-SME2-NEXT: uzp1 p1.b, p3.b, p4.b
-; CHECK-SME2-NEXT: bl use
-; CHECK-SME2-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SME2-NEXT: ret
- %r = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 %i, i64 %n)
- %v0 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1.i64(<vscale x 32 x i1> %r, i64 0)
- %v1 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1.i64(<vscale x 32 x i1> %r, i64 16)
- call void @use(<vscale x 16 x i1> %v0, <vscale x 16 x i1> %v1)
+ tail call void @use(<vscale x 1 x i1> %v0, <vscale x 1 x i1> %v1)
ret void
}
define void @test_boring_case_2x2bit_mask(i64 %i, i64 %n) #0 {
; CHECK-SVE-LABEL: test_boring_case_2x2bit_mask:
; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-SVE-NEXT: whilelo p1.s, x0, x1
; CHECK-SVE-NEXT: punpklo p0.h, p1.b
; CHECK-SVE-NEXT: punpkhi p1.h, p1.b
-; CHECK-SVE-NEXT: bl use
-; CHECK-SVE-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SVE-NEXT: ret
+; CHECK-SVE-NEXT: b use
;
; CHECK-SVE2p1-LABEL: test_boring_case_2x2bit_mask:
; CHECK-SVE2p1: // %bb.0:
-; CHECK-SVE2p1-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-SVE2p1-NEXT: whilelo { p0.d, p1.d }, x0, x1
-; CHECK-SVE2p1-NEXT: bl use
-; CHECK-SVE2p1-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SVE2p1-NEXT: ret
-;
-; CHECK-SME2-LABEL: test_boring_case_2x2bit_mask:
-; CHECK-SME2: // %bb.0:
-; CHECK-SME2-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SME2-NEXT: whilelo { p0.d, p1.d }, x0, x1
-; CHECK-SME2-NEXT: bl use
-; CHECK-SME2-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SME2-NEXT: ret
+; CHECK-SVE2p1-NEXT: b use
%r = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %i, i64 %n)
%v0 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1.i64(<vscale x 4 x i1> %r, i64 0)
%v1 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1.i64(<vscale x 4 x i1> %r, i64 2)
- call void @use(<vscale x 2 x i1> %v0, <vscale x 2 x i1> %v1)
+ tail call void @use(<vscale x 2 x i1> %v0, <vscale x 2 x i1> %v1)
ret void
}
More information about the llvm-commits mailing list