[llvm] [AArch64] Extend efficient lowering of experimental.cttz.elts (PR #92114)
Hari Limaye via llvm-commits
llvm-commits at lists.llvm.org
Mon May 20 03:36:55 PDT 2024
https://github.com/hazzlim updated https://github.com/llvm/llvm-project/pull/92114
>From 4e18140713cff4b5371c8ecb9c476d2b08d56259 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Thu, 9 May 2024 10:06:03 +0000
Subject: [PATCH 1/2] [AArch64] Extend efficient lowering of
experimental.cttz.elts
This patch extends support for more efficient lowering of the
experimental.cttz.elts intrinsic to fixed-width vector types, by first
creating an SVE predicate register mask from the fixed-width vector.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 22 ++-
.../Analysis/CostModel/AArch64/cttz_elts.ll | 32 ++--
.../AArch64/intrinsic-cttz-elts-sve.ll | 137 ++++++++++++++++++
3 files changed, 171 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 33cc8ffaf85d5..fc2d1a18652ab 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1871,9 +1871,11 @@ bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
if (!Subtarget->hasSVEorSME())
return true;
- // We can only use the BRKB + CNTP sequence with legal predicate types.
+ // We can only use the BRKB + CNTP sequence with legal predicate types. We can
+ // also support fixed-width predicates.
return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
- VT != MVT::nxv2i1;
+ VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
+ VT != MVT::v4i1 && VT != MVT::v2i1;
}
void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
@@ -5838,9 +5840,21 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return SDValue();
}
case Intrinsic::experimental_cttz_elts: {
- SDValue NewCttzElts =
- DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
+ SDValue CttzOp = Op.getOperand(1);
+ EVT VT = CttzOp.getValueType();
+
+ if (!VT.isScalableVector()) {
+ // Retrieve original fixed-width vector from ISD::TRUNCATE Node.
+ assert(CttzOp.getOpcode() == ISD::TRUNCATE && "Expected ISD::TRUNCATE!");
+ SDValue FixedWidthVec = CttzOp.getOperand(0);
+ // We can use SVE instructions to lower this intrinsic by first creating
+ // an SVE predicate register mask from the fixed-width vector.
+ CttzOp = convertFixedMaskToScalableVector(FixedWidthVec, DAG);
+ }
+
+ SDValue NewCttzElts =
+ DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
}
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
index cc1532ee33dcf..e1a9ee114d261 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
@@ -13,15 +13,15 @@ define void @foo_no_vscale_range() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i64.v32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
@@ -33,15 +33,15 @@ define void @foo_no_vscale_range() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i64.v32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
index 211237542a15b..d9ea72d44934a 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
@@ -359,6 +359,143 @@ define i32 @add_i32_ctz_nxv16i1_poison(<vscale x 16 x i1> %a, i32 %b) {
ret i32 %add
}
+; FIXED-WIDTH VECTOR TYPES
+
+define i32 @ctz_v16i1(<16 x i1> %a) {
+; CHECK-LABEL: ctz_v16i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_v16i1_poison(<16 x i1> %a) {
+; CHECK-LABEL: ctz_v16i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+define i64 @add_i64_ctz_v16i1_poison(<16 x i1> %a, i64 %b) {
+; CHECK-LABEL: add_i64_ctz_v16i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: incp x0, p0.b
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> %a, i1 1)
+ %add = add i64 %res, %b
+ ret i64 %add
+}
+
+define i32 @ctz_v8i1(<8 x i1> %a) {
+; CHECK-LABEL: ctz_v8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_v8i1_poison(<8 x i1> %a) {
+; CHECK-LABEL: ctz_v8i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+define i32 @ctz_v4i1(<4 x i1> %a) {
+; CHECK-LABEL: ctz_v4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.h
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_v4i1_poison(<4 x i1> %a) {
+; CHECK-LABEL: ctz_v4i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.h
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+define i32 @ctz_v2i1(<2 x i1> %a) {
+; CHECK-LABEL: ctz_v2i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.s
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_v2i1_poison(<2 x i1> %a) {
+; CHECK-LABEL: ctz_v2i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.s
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 1)
+ ret i32 %res
+}
+
declare i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1>, i1)
declare i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1>, i1)
declare i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1>, i1)
>From 656b49e326807709c9bd5df2a03298e48bfa9238 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Sun, 19 May 2024 00:59:08 +0000
Subject: [PATCH 2/2] Properly sign extend fixed-width i1 vectors to legal type
Note: this produces a sub-optimal lowering, as the sign_extend is not
optimised away. Optimising it away is left to a follow-up patch.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 11 ++++----
.../AArch64/intrinsic-cttz-elts-sve.ll | 27 ++++++++++++-------
2 files changed, 23 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index fc2d1a18652ab..1d49198b93328 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5842,15 +5842,14 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::experimental_cttz_elts: {
SDValue CttzOp = Op.getOperand(1);
EVT VT = CttzOp.getValueType();
+ assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
- if (!VT.isScalableVector()) {
- // Retrieve original fixed-width vector from ISD::TRUNCATE Node.
- assert(CttzOp.getOpcode() == ISD::TRUNCATE && "Expected ISD::TRUNCATE!");
- SDValue FixedWidthVec = CttzOp.getOperand(0);
-
+ if (VT.isFixedLengthVector()) {
// We can use SVE instructions to lower this intrinsic by first creating
// an SVE predicate register mask from the fixed-width vector.
- CttzOp = convertFixedMaskToScalableVector(FixedWidthVec, DAG);
+ EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, NewVT, CttzOp);
+ CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
}
SDValue NewCttzElts =
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
index d9ea72d44934a..9c72afd84fa7c 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
@@ -364,9 +364,10 @@ define i32 @add_i32_ctz_nxv16i1_poison(<vscale x 16 x i1> %a, i32 %b) {
define i32 @ctz_v16i1(<16 x i1> %a) {
; CHECK-LABEL: ctz_v16i1:
; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: cntp x0, p0, p0.b
@@ -379,9 +380,10 @@ define i32 @ctz_v16i1(<16 x i1> %a) {
define i32 @ctz_v16i1_poison(<16 x i1> %a) {
; CHECK-LABEL: ctz_v16i1_poison:
; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: cntp x0, p0, p0.b
@@ -394,9 +396,10 @@ define i32 @ctz_v16i1_poison(<16 x i1> %a) {
define i64 @add_i64_ctz_v16i1_poison(<16 x i1> %a, i64 %b) {
; CHECK-LABEL: add_i64_ctz_v16i1_poison:
; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: incp x0, p0.b
@@ -409,9 +412,10 @@ define i64 @add_i64_ctz_v16i1_poison(<16 x i1> %a, i64 %b) {
define i32 @ctz_v8i1(<8 x i1> %a) {
; CHECK-LABEL: ctz_v8i1:
; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.8b, v0.8b, #7
; CHECK-NEXT: ptrue p0.b, vl8
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: cntp x0, p0, p0.b
@@ -424,9 +428,10 @@ define i32 @ctz_v8i1(<8 x i1> %a) {
define i32 @ctz_v8i1_poison(<8 x i1> %a) {
; CHECK-LABEL: ctz_v8i1_poison:
; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.8b, v0.8b, #7
; CHECK-NEXT: ptrue p0.b, vl8
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: cntp x0, p0, p0.b
@@ -439,9 +444,10 @@ define i32 @ctz_v8i1_poison(<8 x i1> %a) {
define i32 @ctz_v4i1(<4 x i1> %a) {
; CHECK-LABEL: ctz_v4i1:
; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.4h, v0.4h, #15
; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: cntp x0, p0, p0.h
@@ -454,9 +460,10 @@ define i32 @ctz_v4i1(<4 x i1> %a) {
define i32 @ctz_v4i1_poison(<4 x i1> %a) {
; CHECK-LABEL: ctz_v4i1_poison:
; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.4h, v0.4h, #15
; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: cntp x0, p0, p0.h
@@ -469,9 +476,10 @@ define i32 @ctz_v4i1_poison(<4 x i1> %a) {
define i32 @ctz_v2i1(<2 x i1> %a) {
; CHECK-LABEL: ctz_v2i1:
; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.2s, v0.2s, #31
; CHECK-NEXT: ptrue p0.s, vl2
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: cntp x0, p0, p0.s
@@ -484,9 +492,10 @@ define i32 @ctz_v2i1(<2 x i1> %a) {
define i32 @ctz_v2i1_poison(<2 x i1> %a) {
; CHECK-LABEL: ctz_v2i1_poison:
; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.2s, v0.2s, #31
; CHECK-NEXT: ptrue p0.s, vl2
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: cntp x0, p0, p0.s
More information about the llvm-commits mailing list