[llvm] e0ad56b - [AArch64] Add lowering for `@llvm.experimental.vector.compress` (#101015)
Author: Lawrence Benson
Date: 2024-08-13T11:04:17+02:00
New Revision: e0ad56b7359be7fddf858c8be7b118f5be906c33
URL: https://github.com/llvm/llvm-project/commit/e0ad56b7359be7fddf858c8be7b118f5be906c33
DIFF: https://github.com/llvm/llvm-project/commit/e0ad56b7359be7fddf858c8be7b118f5be906c33.diff
LOG: [AArch64] Add lowering for `@llvm.experimental.vector.compress` (#101015)
This is a follow-up to #92289 that adds custom lowering of the new
`@llvm.experimental.vector.compress` intrinsic on AArch64 with SVE
instructions.
Vector types with two or four elements (both fixed-length NEON and scalable SVE) can be lowered directly to the SVE `compact` instruction; other types are legalized to these forms or fall back to the generic expansion.
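As a minimal illustration (mirroring test_compress_nxv4i32 in the new test file below; the function name here is just a placeholder), a compress of a <vscale x 4 x i32> with an undef passthru now selects to a single `compact` when SVE is available:

define <vscale x 4 x i32> @compress_example(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
  ; With -mattr=+sve this lowers to: compact z0.s, p0, z0.s
  %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
  ret <vscale x 4 x i32> %out
}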
Added:
llvm/test/CodeGen/AArch64/sve-vector-compress.ll
Modified:
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5a21ad7ac7e2cd..ddb7c8c54bbfe4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -2412,11 +2412,64 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo,
SDValue &Hi) {
// This is not "trivial", as there is a dependency between the two subvectors.
// Depending on the number of 1s in the mask, the elements from the Hi vector
- // need to be moved to the Lo vector. So we just perform this as one "big"
- // operation and then extract the Lo and Hi vectors from that. This gets rid
- // of VECTOR_COMPRESS and all other operands can be legalized later.
- SDValue Compressed = TLI.expandVECTOR_COMPRESS(N, DAG);
- std::tie(Lo, Hi) = DAG.SplitVector(Compressed, SDLoc(N));
+ // need to be moved to the Lo vector. Passthru values make this even harder.
+ // We try to use VECTOR_COMPRESS if the target has custom lowering with
+ // smaller types and passthru is undef, as it is most likely faster than the
+ // fully expanded path. Otherwise, just do the full expansion as one "big"
+ // operation and then extract the Lo and Hi vectors from that. This gets
+ // rid of VECTOR_COMPRESS and all other operands can be legalized later.
+ SDLoc DL(N);
+ EVT VecVT = N->getValueType(0);
+
+ auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
+ bool HasCustomLowering = false;
+ EVT CheckVT = LoVT;
+ while (CheckVT.getVectorMinNumElements() > 1) {
+ // TLI.isOperationLegalOrCustom requires a legal type, but we could have a
+ // custom lowering for illegal types. So we do the checks separately.
+ if (TLI.isOperationLegal(ISD::VECTOR_COMPRESS, CheckVT) ||
+ TLI.isOperationCustom(ISD::VECTOR_COMPRESS, CheckVT)) {
+ HasCustomLowering = true;
+ break;
+ }
+ CheckVT = CheckVT.getHalfNumVectorElementsVT(*DAG.getContext());
+ }
+
+ SDValue Passthru = N->getOperand(2);
+ if (!HasCustomLowering || !Passthru.isUndef()) {
+ SDValue Compressed = TLI.expandVECTOR_COMPRESS(N, DAG);
+ std::tie(Lo, Hi) = DAG.SplitVector(Compressed, DL, LoVT, HiVT);
+ return;
+ }
+
+ // Try to VECTOR_COMPRESS smaller vectors and combine via a stack store+load.
+ SDValue LoMask, HiMask;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+ std::tie(LoMask, HiMask) = SplitMask(N->getOperand(1));
+
+ SDValue UndefPassthru = DAG.getUNDEF(LoVT);
+ Lo = DAG.getNode(ISD::VECTOR_COMPRESS, DL, LoVT, Lo, LoMask, UndefPassthru);
+ Hi = DAG.getNode(ISD::VECTOR_COMPRESS, DL, HiVT, Hi, HiMask, UndefPassthru);
+
+ SDValue StackPtr = DAG.CreateStackTemporary(
+ VecVT.getStoreSize(), DAG.getReducedAlign(VecVT, /*UseABI=*/false));
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(
+ MF, cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex());
+
+ // We store LoVec and then insert HiVec starting at offset=|1s| in LoMask.
+ SDValue WideMask =
+ DAG.getNode(ISD::ZERO_EXTEND, DL, LoMask.getValueType(), LoMask);
+ SDValue Offset = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, WideMask);
+ Offset = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Offset);
+
+ SDValue Chain = DAG.getEntryNode();
+ Chain = DAG.getStore(Chain, DL, Lo, StackPtr, PtrInfo);
+ Chain = DAG.getStore(Chain, DL, Hi, Offset,
+ MachinePointerInfo::getUnknownStack(MF));
+
+ SDValue Compressed = DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo);
+ std::tie(Lo, Hi) = DAG.SplitVector(Compressed, DL);
}
void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
@@ -5790,7 +5843,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_COMPRESS(SDNode *N) {
TLI.getTypeToTransformTo(*DAG.getContext(), Vec.getValueType());
EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
Mask.getValueType().getVectorElementType(),
- WideVecVT.getVectorNumElements());
+ WideVecVT.getVectorElementCount());
SDValue WideVec = ModifyToType(Vec, WideVecVT);
SDValue WideMask = ModifyToType(Mask, WideMaskVT, /*FillWithZeroes=*/true);
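As a sketch of the new split path (the generated code is checked in test_compress_large in the new test file below; the function name here is illustrative), a <vscale x 8 x i32> compress is now legalized by compressing each half with an undef passthru and combining the halves through a stack temporary, storing the Hi half at an element offset equal to the number of set lanes in the Lo mask:

define <vscale x 8 x i32> @compress_split_example(<vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask) {
  ; Legalizes to two compact instructions plus a cntp/st1w/ld1w sequence over a
  ; stack slot instead of the generic full expansion.
  %out = call <vscale x 8 x i32> @llvm.experimental.vector.compress(<vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask, <vscale x 8 x i32> undef)
  ret <vscale x 8 x i32> %out
}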
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bb7aea2c85e67a..314e7134dcd01a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1775,6 +1775,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
MVT::v2f32, MVT::v4f32, MVT::v2f64})
setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
+ // We can lower types that have <vscale x {2|4}> elements to compact.
+ for (auto VT :
+ {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
+ MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
+ setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
+
+ // If we have SVE, we can use SVE logic for legal (or smaller than legal)
+ // NEON vectors in the lowest bits of the SVE register.
+ for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
+ MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
+ setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
+
// Histcnt is SVE2 only
if (Subtarget->hasSVE2()) {
setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::Other,
@@ -6619,6 +6631,104 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
return DAG.getMergeValues({Ext, Chain}, DL);
}
+SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue Mask = Op.getOperand(1);
+ SDValue Passthru = Op.getOperand(2);
+ EVT VecVT = Vec.getValueType();
+ EVT MaskVT = Mask.getValueType();
+ EVT ElmtVT = VecVT.getVectorElementType();
+ const bool IsFixedLength = VecVT.isFixedLengthVector();
+ const bool HasPassthru = !Passthru.isUndef();
+ unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
+ EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
+
+ assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
+
+ if (!Subtarget->isSVEAvailable())
+ return SDValue();
+
+ if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
+ return SDValue();
+
+ // Only <vscale x {4|2} x {i32|i64}> supported for compact.
+ if (MinElmts != 2 && MinElmts != 4)
+ return SDValue();
+
+ // We can use the SVE register containing the NEON vector in its lowest bits.
+ if (IsFixedLength) {
+ EVT ScalableVecVT =
+ MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
+ EVT ScalableMaskVT = MVT::getScalableVectorVT(
+ MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
+
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
+ DAG.getUNDEF(ScalableVecVT), Vec,
+ DAG.getConstant(0, DL, MVT::i64));
+ Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
+ DAG.getUNDEF(ScalableMaskVT), Mask,
+ DAG.getConstant(0, DL, MVT::i64));
+ Mask = DAG.getNode(ISD::TRUNCATE, DL,
+ ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
+ Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
+ DAG.getUNDEF(ScalableVecVT), Passthru,
+ DAG.getConstant(0, DL, MVT::i64));
+
+ VecVT = Vec.getValueType();
+ MaskVT = Mask.getValueType();
+ }
+
+ // Get legal type for compact instruction
+ EVT ContainerVT = getSVEContainerType(VecVT);
+ EVT CastVT = VecVT.changeVectorElementTypeToInteger();
+
+ // Convert to i32 or i64 for smaller types, as these are the only supported
+ // sizes for compact.
+ if (ContainerVT != VecVT) {
+ Vec = DAG.getBitcast(CastVT, Vec);
+ Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
+ }
+
+ SDValue Compressed = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(),
+ DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec);
+
+ // compact fills with 0s, so if our passthru is all 0s, do nothing here.
+ if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
+ SDValue Offset = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
+ DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask);
+
+ SDValue IndexMask = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
+ DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
+ DAG.getConstant(0, DL, MVT::i64), Offset);
+
+ Compressed =
+ DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
+ }
+
+ // Extracting from a legal SVE type before truncating produces better code.
+ if (IsFixedLength) {
+ Compressed = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL,
+ FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
+ Compressed, DAG.getConstant(0, DL, MVT::i64));
+ CastVT = FixedVecVT.changeVectorElementTypeToInteger();
+ VecVT = FixedVecVT;
+ }
+
+ // If we changed the element type before, we need to convert it back.
+ if (ContainerVT != VecVT) {
+ Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
+ Compressed = DAG.getBitcast(VecVT, Compressed);
+ }
+
+ return Compressed;
+}
+
// Generate SUBS and CSEL for integer abs.
SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
@@ -6999,6 +7109,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::VSCALE:
return LowerVSCALE(Op, DAG);
+ case ISD::VECTOR_COMPRESS:
+ return LowerVECTOR_COMPRESS(Op, DAG);
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
@@ -26563,6 +26675,10 @@ void AArch64TargetLowering::ReplaceNodeResults(
case ISD::VECREDUCE_UMIN:
Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
return;
+ case ISD::VECTOR_COMPRESS:
+ if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
+ Results.push_back(Res);
+ return;
case ISD::ADD:
case ISD::FADD:
ReplaceAddWithADDP(N, Results, DAG, Subtarget);
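To sketch the passthru handling (checked in test_compress_nxv4i32_with_passthru in the new test file below; the function name here is illustrative): `compact` zero-fills the trailing lanes, so a non-trivial passthru is applied by counting the active lanes with cntp, building a prefix predicate with whilelo, and selecting between the compacted result and the passthru:

define <vscale x 4 x i32> @compress_passthru_example(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) {
  ; Lowers to: cntp, compact, whilelo, sel (see the new test file).
  %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru)
  ret <vscale x 4 x i32> %out
}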
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index ace682fb89379d..2fa9c49019326d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1075,6 +1075,8 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECTOR_COMPRESS(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
new file mode 100644
index 00000000000000..84c15e4fbc33c7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
@@ -0,0 +1,276 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 2 x i8> @test_compress_nxv2i8(<vscale x 2 x i8> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i8> @llvm.experimental.vector.compress(<vscale x 2 x i8> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+ ret <vscale x 2 x i8> %out
+}
+
+define <vscale x 2 x i16> @test_compress_nxv2i16(<vscale x 2 x i16> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i16> @llvm.experimental.vector.compress(<vscale x 2 x i16> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+ ret <vscale x 2 x i16> %out
+}
+
+define <vscale x 2 x i32> @test_compress_nxv2i32(<vscale x 2 x i32> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i32> @llvm.experimental.vector.compress(<vscale x 2 x i32> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+ ret <vscale x 2 x i32> %out
+}
+
+define <vscale x 2 x i64> @test_compress_nxv2i64(<vscale x 2 x i64> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.experimental.vector.compress(<vscale x 2 x i64> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+ ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x float> @test_compress_nxv2f32(<vscale x 2 x float> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x float> @llvm.experimental.vector.compress(<vscale x 2 x float> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+ ret <vscale x 2 x float> %out
+}
+
+define <vscale x 2 x double> @test_compress_nxv2f64(<vscale x 2 x double> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x double> @llvm.experimental.vector.compress(<vscale x 2 x double> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+ ret <vscale x 2 x double> %out
+}
+
+define <vscale x 4 x i8> @test_compress_nxv4i8(<vscale x 4 x i8> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i8> @llvm.experimental.vector.compress(<vscale x 4 x i8> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+ ret <vscale x 4 x i8> %out
+}
+
+define <vscale x 4 x i16> @test_compress_nxv4i16(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i16> @llvm.experimental.vector.compress(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+ ret <vscale x 4 x i16> %out
+}
+
+define <vscale x 4 x i32> @test_compress_nxv4i32(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x float> @test_compress_nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x float> @llvm.experimental.vector.compress(<vscale x 4 x float> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+ ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x i4> @test_compress_illegal_element_type(<vscale x 4 x i4> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_illegal_element_type:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i4> @llvm.experimental.vector.compress(<vscale x 4 x i4> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i4> undef)
+ ret <vscale x 4 x i4> %out
+}
+
+define <vscale x 8 x i32> @test_compress_large(<vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: test_compress_large:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: cnth x9
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: sub x9, x9, #1
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: compact z0.s, p2, z0.s
+; CHECK-NEXT: cntp x8, p1, p2.s
+; CHECK-NEXT: compact z1.s, p0, z1.s
+; CHECK-NEXT: st1w { z0.s }, p1, [sp]
+; CHECK-NEXT: mov w8, w8
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: csel x8, x8, x9, lo
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: st1w { z1.s }, p1, [x9, x8, lsl #2]
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [sp]
+; CHECK-NEXT: ld1w { z1.s }, p1/z, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i32> @llvm.experimental.vector.compress(<vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask, <vscale x 8 x i32> undef)
+ ret <vscale x 8 x i32> %out
+}
+
+; We pass a placeholder value for the const_mask* tests to check that they are converted to a no-op by simply copying
+; the second vector input register to the ret register or doing nothing.
+define <vscale x 4 x i32> @test_compress_const_splat1_mask(<vscale x 4 x i32> %ignore, <vscale x 4 x i32> %vec) {
+; CHECK-LABEL: test_compress_const_splat1_mask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> splat (i1 -1), <vscale x 4 x i32> undef)
+ ret <vscale x 4 x i32> %out
+}
+define <vscale x 4 x i32> @test_compress_const_splat0_mask(<vscale x 4 x i32> %ignore, <vscale x 4 x i32> %vec) {
+; CHECK-LABEL: test_compress_const_splat0_mask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> splat (i1 0), <vscale x 4 x i32> undef)
+ ret <vscale x 4 x i32> %out
+}
+define <vscale x 4 x i32> @test_compress_undef_mask(<vscale x 4 x i32> %ignore, <vscale x 4 x i32> %vec) {
+; CHECK-LABEL: test_compress_undef_mask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
+ ret <vscale x 4 x i32> %out
+}
+
+define <4 x i32> @test_compress_v4i32_with_sve(<4 x i32> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compress_v4i32_with_sve:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: shl v1.4s, v1.4s, #31
+; CHECK-NEXT: cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT: and z1.s, z1.s, #0x1
+; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+ %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> undef)
+ ret <4 x i32> %out
+}
+
+define <1 x i32> @test_compress_v1i32_with_sve(<1 x i32> %vec, <1 x i1> %mask) {
+; CHECK-LABEL: test_compress_v1i32_with_sve:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: sbfx w8, w0, #0, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-NEXT: mov v1.s[0], w8
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-NEXT: and z1.d, z1.d, #0x1
+; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: ret
+ %out = call <1 x i32> @llvm.experimental.vector.compress(<1 x i32> %vec, <1 x i1> %mask, <1 x i32> undef)
+ ret <1 x i32> %out
+}
+
+define <4 x double> @test_compress_v4f64_with_sve(<4 x double> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compress_v4f64_with_sve:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: ushll v3.2d, v2.2s, #0
+; CHECK-NEXT: ushll2 v4.2d, v2.4s, #0
+; CHECK-NEXT: fmov x8, d2
+; CHECK-NEXT: shl v3.2d, v3.2d, #63
+; CHECK-NEXT: shl v4.2d, v4.2d, #63
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: eor w8, w8, w9
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: cmlt v3.2d, v3.2d, #0
+; CHECK-NEXT: cmlt v4.2d, v4.2d, #0
+; CHECK-NEXT: and x8, x8, #0x3
+; CHECK-NEXT: lsl x8, x8, #3
+; CHECK-NEXT: and z3.d, z3.d, #0x1
+; CHECK-NEXT: and z4.d, z4.d, #0x1
+; CHECK-NEXT: cmpne p1.d, p0/z, z3.d, #0
+; CHECK-NEXT: cmpne p0.d, p0/z, z4.d, #0
+; CHECK-NEXT: compact z0.d, p1, z0.d
+; CHECK-NEXT: compact z1.d, p0, z1.d
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: str q1, [x9, x8]
+; CHECK-NEXT: ldp q0, q1, [sp], #32
+; CHECK-NEXT: ret
+ %out = call <4 x double> @llvm.experimental.vector.compress(<4 x double> %vec, <4 x i1> %mask, <4 x double> undef)
+ ret <4 x double> %out
+}
+
+define <2 x i16> @test_compress_v2i16_with_sve(<2 x i16> %vec, <2 x i1> %mask) {
+; CHECK-LABEL: test_compress_v2i16_with_sve:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-NEXT: and z1.d, z1.d, #0x1
+; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: ret
+ %out = call <2 x i16> @llvm.experimental.vector.compress(<2 x i16> %vec, <2 x i1> %mask, <2 x i16> undef)
+ ret <2 x i16> %out
+}
+
+
+define <vscale x 4 x i32> @test_compress_nxv4i32_with_passthru(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) {
+; CHECK-LABEL: test_compress_nxv4i32_with_passthru:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @test_compress_nxv4i32_with_zero_passthru(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i32_with_zero_passthru:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> splat(i32 0))
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @test_compress_nxv4i32_with_const_passthru(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i32_with_const_passthru:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: mov z1.s, #5 // =0x5
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> splat(i32 5))
+ ret <vscale x 4 x i32> %out
+}