[llvm] 8b58494 - [AArch64] Improve codegen for get.active.lane.mask when SVE is available
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 10 08:02:50 PST 2022
Author: David Sherwood
Date: 2022-02-10T16:02:44Z
New Revision: 8b58494cea784f413a27418823ae2f97fde96371
URL: https://github.com/llvm/llvm-project/commit/8b58494cea784f413a27418823ae2f97fde96371
DIFF: https://github.com/llvm/llvm-project/commit/8b58494cea784f413a27418823ae2f97fde96371.diff
LOG: [AArch64] Improve codegen for get.active.lane.mask when SVE is available
When lowering the get.active.lane.mask intrinsic with a fixed-width
predicate vector result, we can actually make use of the SVE whilelo
instruction when SVE is enabled. We do this by choosing a sensible
scalable VT for the whilelo result and promoting it to an integer
vector, i.e. nxv16i1 -> nxv16i8. We can then extract a v16i8
subvector and truncate back to the original return type, i.e. v16i1.
This leads to a significant improvement in code quality.
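For illustration, a minimal IR reproducer (the function name here is
chosen for the example), compiled with SVE enabled, e.g.
llc -mtriple=aarch64 -mattr=+sve:

  declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)

  define <16 x i1> @lane_mask(i32 %index, i32 %TC) {
    %mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %TC)
    ret <16 x i1> %mask
  }

now lowers to a single whilelo plus a predicated splat, rather than a
long chain of uqadd/cmhi/uzp1 instructions (see the updated tests
below):

  whilelo p0.b, w0, w1
  mov     z0.b, p0/z, #-1
  ret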
Differential Revision: https://reviews.llvm.org/D116664
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/active_lane_mask.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5dbdba448af1..02918e19476d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1514,9 +1514,11 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
if (!Subtarget->hasSVE())
return true;
- // We can only support legal predicate result types.
+ // We can only support legal predicate result types. We can use the SVE
+ // whilelo instruction for generating fixed-width predicates too.
if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
- ResVT != MVT::nxv16i1)
+ ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
+ ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
return true;
// The whilelo instruction only works with i32 or i64 scalar inputs.
@@ -15390,6 +15392,39 @@ static SDValue performIntrinsicCombine(SDNode *N,
switch (IID) {
default:
break;
+ case Intrinsic::get_active_lane_mask: {
+ SDValue Res = SDValue();
+ EVT VT = N->getValueType(0);
+ if (VT.isFixedLengthVector()) {
+ // We can use the SVE whilelo instruction to lower this intrinsic by
+ // creating the appropriate sequence of scalable vector operations and
+ // then extracting a fixed-width subvector from the scalable vector.
+
+ SDLoc DL(N);
+ SDValue ID =
+ DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
+
+ EVT WhileVT = EVT::getVectorVT(
+ *DAG.getContext(), MVT::i1,
+ ElementCount::getScalable(VT.getVectorNumElements()));
+
+ // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
+ EVT PromVT = getPromotedVTForPredicate(WhileVT);
+
+ // Get the fixed-width equivalent of PromVT for extraction.
+ EVT ExtVT =
+ EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
+ VT.getVectorElementCount());
+
+ Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
+ N->getOperand(1), N->getOperand(2));
+ Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
+ DAG.getConstant(0, DL, MVT::i64));
+ Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
+ }
+ return Res;
+ }
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index 45f030dc4714..7b3bcacc4f3d 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -324,27 +324,9 @@ define <vscale x 32 x i1> @lane_mask_nxv32i1_i8(i8 %index, i8 %TC) {
define <16 x i1> @lane_mask_v16i1_i32(i32 %index, i32 %TC) {
; CHECK-LABEL: lane_mask_v16i1_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI15_0
-; CHECK-NEXT: adrp x9, .LCPI15_3
-; CHECK-NEXT: adrp x10, .LCPI15_2
-; CHECK-NEXT: dup v2.4s, w0
-; CHECK-NEXT: dup v5.4s, w1
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI15_0]
-; CHECK-NEXT: adrp x8, .LCPI15_1
-; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI15_3]
-; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI15_2]
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI15_1]
-; CHECK-NEXT: uqadd v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: uqadd v3.4s, v2.4s, v3.4s
-; CHECK-NEXT: uqadd v4.4s, v2.4s, v4.4s
-; CHECK-NEXT: uqadd v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: cmhi v1.4s, v5.4s, v1.4s
-; CHECK-NEXT: cmhi v2.4s, v5.4s, v3.4s
-; CHECK-NEXT: cmhi v3.4s, v5.4s, v4.4s
-; CHECK-NEXT: cmhi v0.4s, v5.4s, v0.4s
-; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h
-; CHECK-NEXT: uzp1 v0.8h, v0.8h, v3.8h
-; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: whilelo p0.b, w0, w1
+; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %TC)
ret <16 x i1> %active.lane.mask
@@ -353,17 +335,8 @@ define <16 x i1> @lane_mask_v16i1_i32(i32 %index, i32 %TC) {
define <8 x i1> @lane_mask_v8i1_i32(i32 %index, i32 %TC) {
; CHECK-LABEL: lane_mask_v8i1_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI16_1
-; CHECK-NEXT: adrp x9, .LCPI16_0
-; CHECK-NEXT: dup v2.4s, w0
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI16_1]
-; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI16_0]
-; CHECK-NEXT: uqadd v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: uqadd v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: dup v2.4s, w1
-; CHECK-NEXT: cmhi v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: cmhi v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: whilelo p0.h, w0, w1
+; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %TC)
@@ -373,12 +346,8 @@ define <8 x i1> @lane_mask_v8i1_i32(i32 %index, i32 %TC) {
define <4 x i1> @lane_mask_v4i1_i32(i32 %index, i32 %TC) {
; CHECK-LABEL: lane_mask_v4i1_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI17_0
-; CHECK-NEXT: dup v1.4s, w0
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI17_0]
-; CHECK-NEXT: uqadd v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: dup v1.4s, w1
-; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: whilelo p0.s, w0, w1
+; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %TC)
@@ -388,12 +357,9 @@ define <4 x i1> @lane_mask_v4i1_i32(i32 %index, i32 %TC) {
define <2 x i1> @lane_mask_v2i1_i32(i32 %index, i32 %TC) {
; CHECK-LABEL: lane_mask_v2i1_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI18_0
-; CHECK-NEXT: dup v0.2s, w0
-; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI18_0]
-; CHECK-NEXT: uqadd v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: dup v1.2s, w1
-; CHECK-NEXT: cmhi v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: whilelo p0.d, w0, w1
+; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: xtn v0.2s, v0.2d
; CHECK-NEXT: ret
%active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 %index, i32 %TC)
ret <2 x i1> %active.lane.mask
@@ -402,47 +368,9 @@ define <2 x i1> @lane_mask_v2i1_i32(i32 %index, i32 %TC) {
define <16 x i1> @lane_mask_v16i1_i64(i64 %index, i64 %TC) {
; CHECK-LABEL: lane_mask_v16i1_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI19_0
-; CHECK-NEXT: adrp x9, .LCPI19_1
-; CHECK-NEXT: adrp x10, .LCPI19_2
-; CHECK-NEXT: dup v1.2d, x0
-; CHECK-NEXT: dup v17.2d, x1
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI19_0]
-; CHECK-NEXT: adrp x8, .LCPI19_3
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI19_1]
-; CHECK-NEXT: adrp x9, .LCPI19_4
-; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI19_2]
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI19_3]
-; CHECK-NEXT: adrp x8, .LCPI19_5
-; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI19_4]
-; CHECK-NEXT: adrp x9, .LCPI19_7
-; CHECK-NEXT: uqadd v0.2d, v1.2d, v0.2d
-; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI19_5]
-; CHECK-NEXT: adrp x8, .LCPI19_6
-; CHECK-NEXT: ldr q7, [x9, :lo12:.LCPI19_7]
-; CHECK-NEXT: uqadd v2.2d, v1.2d, v2.2d
-; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI19_6]
-; CHECK-NEXT: uqadd v3.2d, v1.2d, v3.2d
-; CHECK-NEXT: uqadd v4.2d, v1.2d, v4.2d
-; CHECK-NEXT: uqadd v6.2d, v1.2d, v6.2d
-; CHECK-NEXT: uqadd v7.2d, v1.2d, v7.2d
-; CHECK-NEXT: uqadd v16.2d, v1.2d, v16.2d
-; CHECK-NEXT: uqadd v1.2d, v1.2d, v5.2d
-; CHECK-NEXT: cmhi v6.2d, v17.2d, v6.2d
-; CHECK-NEXT: cmhi v5.2d, v17.2d, v7.2d
-; CHECK-NEXT: cmhi v7.2d, v17.2d, v16.2d
-; CHECK-NEXT: cmhi v1.2d, v17.2d, v1.2d
-; CHECK-NEXT: cmhi v4.2d, v17.2d, v4.2d
-; CHECK-NEXT: cmhi v3.2d, v17.2d, v3.2d
-; CHECK-NEXT: cmhi v2.2d, v17.2d, v2.2d
-; CHECK-NEXT: cmhi v0.2d, v17.2d, v0.2d
-; CHECK-NEXT: uzp1 v5.4s, v7.4s, v5.4s
-; CHECK-NEXT: uzp1 v1.4s, v1.4s, v6.4s
-; CHECK-NEXT: uzp1 v3.4s, v3.4s, v4.4s
-; CHECK-NEXT: uzp1 v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: uzp1 v1.8h, v1.8h, v5.8h
-; CHECK-NEXT: uzp1 v0.8h, v0.8h, v3.8h
-; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: whilelo p0.b, x0, x1
+; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 %index, i64 %TC)
ret <16 x i1> %active.lane.mask
@@ -451,27 +379,8 @@ define <16 x i1> @lane_mask_v16i1_i64(i64 %index, i64 %TC) {
define <8 x i1> @lane_mask_v8i1_i64(i64 %index, i64 %TC) {
; CHECK-LABEL: lane_mask_v8i1_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI20_0
-; CHECK-NEXT: adrp x9, .LCPI20_3
-; CHECK-NEXT: adrp x10, .LCPI20_2
-; CHECK-NEXT: dup v2.2d, x0
-; CHECK-NEXT: dup v5.2d, x1
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI20_0]
-; CHECK-NEXT: adrp x8, .LCPI20_1
-; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI20_3]
-; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI20_2]
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_1]
-; CHECK-NEXT: uqadd v1.2d, v2.2d, v1.2d
-; CHECK-NEXT: uqadd v3.2d, v2.2d, v3.2d
-; CHECK-NEXT: uqadd v4.2d, v2.2d, v4.2d
-; CHECK-NEXT: uqadd v0.2d, v2.2d, v0.2d
-; CHECK-NEXT: cmhi v1.2d, v5.2d, v1.2d
-; CHECK-NEXT: cmhi v2.2d, v5.2d, v3.2d
-; CHECK-NEXT: cmhi v3.2d, v5.2d, v4.2d
-; CHECK-NEXT: cmhi v0.2d, v5.2d, v0.2d
-; CHECK-NEXT: uzp1 v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: uzp1 v0.4s, v0.4s, v3.4s
-; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: whilelo p0.h, x0, x1
+; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 %index, i64 %TC)
@@ -481,17 +390,8 @@ define <8 x i1> @lane_mask_v8i1_i64(i64 %index, i64 %TC) {
define <4 x i1> @lane_mask_v4i1_i64(i64 %index, i64 %TC) {
; CHECK-LABEL: lane_mask_v4i1_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI21_1
-; CHECK-NEXT: adrp x9, .LCPI21_0
-; CHECK-NEXT: dup v2.2d, x0
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI21_1]
-; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI21_0]
-; CHECK-NEXT: uqadd v0.2d, v2.2d, v0.2d
-; CHECK-NEXT: uqadd v1.2d, v2.2d, v1.2d
-; CHECK-NEXT: dup v2.2d, x1
-; CHECK-NEXT: cmhi v0.2d, v2.2d, v0.2d
-; CHECK-NEXT: cmhi v1.2d, v2.2d, v1.2d
-; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: whilelo p0.s, x0, x1
+; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %index, i64 %TC)
@@ -501,12 +401,8 @@ define <4 x i1> @lane_mask_v4i1_i64(i64 %index, i64 %TC) {
define <2 x i1> @lane_mask_v2i1_i64(i64 %index, i64 %TC) {
; CHECK-LABEL: lane_mask_v2i1_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI22_0
-; CHECK-NEXT: dup v1.2d, x0
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI22_0]
-; CHECK-NEXT: uqadd v0.2d, v1.2d, v0.2d
-; CHECK-NEXT: dup v1.2d, x1
-; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: whilelo p0.d, x0, x1
+; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: xtn v0.2s, v0.2d
; CHECK-NEXT: ret
%active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 %index, i64 %TC)