[llvm] 135ddf1 - [AArch64][SVE] Add basic support for `@llvm.masked.compressstore` (#168350)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 28 02:17:41 PST 2025
Author: Benjamin Maxwell
Date: 2025-11-28T10:17:36Z
New Revision: 135ddf1e8efef7c8ac9b01caa660210e3ca12327
URL: https://github.com/llvm/llvm-project/commit/135ddf1e8efef7c8ac9b01caa660210e3ca12327
DIFF: https://github.com/llvm/llvm-project/commit/135ddf1e8efef7c8ac9b01caa660210e3ca12327.diff
LOG: [AArch64][SVE] Add basic support for `@llvm.masked.compressstore` (#168350)
This patch adds SVE support for the `masked.compressstore` intrinsic via
the existing `VECTOR_COMPRESS` lowering: the store value is compressed with
`VECTOR_COMPRESS`, the active mask lanes are counted with `VECREDUCE_ADD`,
and the compressed store mask is rebuilt with `GET_ACTIVE_LANE_MASK`.
Currently, only `nxv4[i32|f32]` and `nxv2[i64|f64]` are directly
supported; other types are promoted to these where possible.
This is done in preparation for loop-vectorizer (LV) support of this
intrinsic, which is currently being worked on in #140723.
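For reference, a rough C++ sketch of the instruction sequence this lowering
produces for an `nxv4f32` compress-store, written with SVE ACLE intrinsics
(an illustration only, not code from this patch; the helper name and the use
of `<arm_sve.h>` are assumptions, and it requires an SVE-enabled compile,
e.g. `-march=armv8-a+sve`):

```cpp
#include <arm_sve.h>  // SVE ACLE intrinsics

// Mirrors the cntp + compact + whilelo + st1w sequence in the new tests.
void compressstore_f32(float *p, svfloat32_t vec, svbool_t mask) {
  uint64_t n = svcntp_b32(mask, mask);            // count active lanes (cntp)
  svfloat32_t packed = svcompact_f32(mask, vec);  // move active lanes to the front (compact)
  svbool_t store_mask = svwhilelt_b32_u64(0, n);  // predicate covering the first n lanes (whilelo)
  svst1_f32(store_mask, p, packed);               // contiguous masked store (st1w)
}
```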
Added:
llvm/test/CodeGen/AArch64/sve-masked-compressstore-sve2p2.ll
llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
Modified:
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5684e0e4c26c4..521d8f07434e6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10607,23 +10607,26 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask,
assert(DataVT.getVectorElementCount() == MaskVT.getVectorElementCount() &&
"Incompatible types of Data and Mask");
if (IsCompressedMemory) {
- if (DataVT.isScalableVector())
- report_fatal_error(
- "Cannot currently handle compressed memory with scalable vectors");
// Incrementing the pointer according to number of '1's in the mask.
- EVT MaskIntVT = EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits());
- SDValue MaskInIntReg = DAG.getBitcast(MaskIntVT, Mask);
- if (MaskIntVT.getSizeInBits() < 32) {
- MaskInIntReg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskInIntReg);
- MaskIntVT = MVT::i32;
+ if (DataVT.isScalableVector()) {
+ EVT MaskExtVT = MaskVT.changeElementType(MVT::i32);
+ SDValue MaskExt = DAG.getNode(ISD::ZERO_EXTEND, DL, MaskExtVT, Mask);
+ Increment = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, MaskExt);
+ } else {
+ EVT MaskIntVT =
+ EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits());
+ SDValue MaskInIntReg = DAG.getBitcast(MaskIntVT, Mask);
+ if (MaskIntVT.getSizeInBits() < 32) {
+ MaskInIntReg =
+ DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskInIntReg);
+ MaskIntVT = MVT::i32;
+ }
+ Increment = DAG.getNode(ISD::CTPOP, DL, MaskIntVT, MaskInIntReg);
}
-
- // Count '1's with POPCNT.
- Increment = DAG.getNode(ISD::CTPOP, DL, MaskIntVT, MaskInIntReg);
- Increment = DAG.getZExtOrTrunc(Increment, DL, AddrVT);
// Scale is an element size in bytes.
SDValue Scale = DAG.getConstant(DataVT.getScalarSizeInBits() / 8, DL,
AddrVT);
+ Increment = DAG.getZExtOrTrunc(Increment, DL, AddrVT);
Increment = DAG.getNode(ISD::MUL, DL, AddrVT, Increment, Scale);
} else if (DataVT.isScalableVector()) {
Increment = DAG.getVScale(DL, AddrVT,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a1400f4cecdac..3661467b4fb67 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1987,10 +1987,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
// We can lower types that have <vscale x {2|4}> elements to compact.
- for (auto VT :
- {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
- MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
+ for (auto VT : {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64,
+ MVT::nxv2f32, MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16,
+ MVT::nxv4i32, MVT::nxv4f32}) {
setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
+ // Use a custom lowering for masked stores that could be a supported
+ // compressing store. Note: These types still use the normal (Legal)
+ // lowering for non-compressing masked stores.
+ setOperationAction(ISD::MSTORE, VT, Custom);
+ }
// If we have SVE, we can use SVE logic for legal (or smaller than legal)
// NEON vectors in the lowest bits of the SVE register.
@@ -7936,7 +7941,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::STORE:
return LowerSTORE(Op, DAG);
case ISD::MSTORE:
- return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
+ return LowerMSTORE(Op, DAG);
case ISD::MGATHER:
return LowerMGATHER(Op, DAG);
case ISD::MSCATTER:
@@ -30439,6 +30444,43 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
Store->isTruncatingStore());
}
+SDValue AArch64TargetLowering::LowerMSTORE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ auto *Store = cast<MaskedStoreSDNode>(Op);
+ EVT VT = Store->getValue().getValueType();
+ if (VT.isFixedLengthVector())
+ return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
+
+ if (!Store->isCompressingStore())
+ return SDValue();
+
+ EVT MaskVT = Store->getMask().getValueType();
+ EVT MaskExtVT = getPromotedVTForPredicate(MaskVT);
+ EVT MaskReduceVT = MaskExtVT.getScalarType();
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+
+ SDValue MaskExt =
+ DAG.getNode(ISD::ZERO_EXTEND, DL, MaskExtVT, Store->getMask());
+ SDValue CntActive =
+ DAG.getNode(ISD::VECREDUCE_ADD, DL, MaskReduceVT, MaskExt);
+ if (MaskReduceVT != MVT::i64)
+ CntActive = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CntActive);
+
+ SDValue CompressedValue =
+ DAG.getNode(ISD::VECTOR_COMPRESS, DL, VT, Store->getValue(),
+ Store->getMask(), DAG.getPOISON(VT));
+ SDValue CompressedMask =
+ DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT, Zero, CntActive);
+
+ return DAG.getMaskedStore(Store->getChain(), DL, CompressedValue,
+ Store->getBasePtr(), Store->getOffset(),
+ CompressedMask, Store->getMemoryVT(),
+ Store->getMemOperand(), Store->getAddressingMode(),
+ Store->isTruncatingStore(),
+ /*isCompressing=*/false);
+}
+
SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
SDValue Op, SelectionDAG &DAG) const {
auto *Store = cast<MaskedStoreSDNode>(Op);
@@ -30453,7 +30495,8 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
return DAG.getMaskedStore(
Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
Mask, Store->getMemoryVT(), Store->getMemOperand(),
- Store->getAddressingMode(), Store->isTruncatingStore());
+ Store->getAddressingMode(), Store->isTruncatingStore(),
+ Store->isCompressingStore());
}
SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index ca08eb40c956a..32aa913181a21 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -761,6 +761,7 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index b4d8649b31d6d..da93a2b13fc11 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -645,29 +645,34 @@ def nontrunc_masked_store :
(masked_st node:$val, node:$ptr, undef, node:$pred), [{
return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
cast<MaskedStoreSDNode>(N)->isUnindexed() &&
- !cast<MaskedStoreSDNode>(N)->isNonTemporal();
+ !cast<MaskedStoreSDNode>(N)->isNonTemporal() &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
// truncating masked store fragments.
def trunc_masked_store :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
(masked_st node:$val, node:$ptr, undef, node:$pred), [{
return cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
- cast<MaskedStoreSDNode>(N)->isUnindexed();
+ cast<MaskedStoreSDNode>(N)->isUnindexed() &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
def trunc_masked_store_i8 :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
(trunc_masked_store node:$val, node:$ptr, node:$pred), [{
- return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8 &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
def trunc_masked_store_i16 :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
(trunc_masked_store node:$val, node:$ptr, node:$pred), [{
- return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16 &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
def trunc_masked_store_i32 :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
(trunc_masked_store node:$val, node:$ptr, node:$pred), [{
- return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32 &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
def non_temporal_store :
@@ -675,7 +680,8 @@ def non_temporal_store :
(masked_st node:$val, node:$ptr, undef, node:$pred), [{
return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
cast<MaskedStoreSDNode>(N)->isUnindexed() &&
- cast<MaskedStoreSDNode>(N)->isNonTemporal();
+ cast<MaskedStoreSDNode>(N)->isNonTemporal() &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
multiclass masked_gather_scatter<PatFrags GatherScatterOp> {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 52fc28a98449b..fe3bb5e7981d2 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -334,6 +334,23 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
return isLegalMaskedLoadStore(DataType, Alignment);
}
+ bool isElementTypeLegalForCompressStore(Type *Ty) const {
+ return Ty->isFloatTy() || Ty->isDoubleTy() || Ty->isIntegerTy(32) ||
+ Ty->isIntegerTy(64);
+ }
+
+ bool isLegalMaskedCompressStore(Type *DataType,
+ Align Alignment) const override {
+ if (!ST->isSVEAvailable())
+ return false;
+
+ if (isa<FixedVectorType>(DataType) &&
+ DataType->getPrimitiveSizeInBits() < 128)
+ return false;
+
+ return isElementTypeLegalForCompressStore(DataType->getScalarType());
+ }
+
bool isLegalMaskedGatherScatter(Type *DataType) const {
if (!ST->isSVEAvailable())
return false;
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-compressstore-sve2p2.ll b/llvm/test/CodeGen/AArch64/sve-masked-compressstore-sve2p2.ll
new file mode 100644
index 0000000000000..92ecc3c83e2c5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-compressstore-sve2p2.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=aarch64 -mattr=+sve2p2 < %s
+
+;; These masked.compressstore operations could be natively supported with +sve2p2
+;; (or by promoting to 32/64 bit elements + a truncstore), but currently are not
+;; supported.
+
+; XFAIL: *
+
+define void @test_compressstore_nxv8i16(ptr %p, <vscale x 8 x i16> %vec, <vscale x 8 x i1> %mask) {
+ tail call void @llvm.masked.compressstore.nxv8i16(<vscale x 8 x i16> %vec, ptr align 2 %p, <vscale x 8 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_nxv16i8(ptr %p, <vscale x 16 x i8> %vec, <vscale x 16 x i1> %mask) {
+ tail call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> %vec, ptr align 1 %p, <vscale x 16 x i1> %mask)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll b/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
new file mode 100644
index 0000000000000..c698658afc8c4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
@@ -0,0 +1,280 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-BASE
+; RUN: llc -mtriple=aarch64 -aarch64-sve-vector-bits-min=256 -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-VL256
+
+;; Full SVE vectors (supported with +sve)
+
+define void @test_compressstore_nxv4i32(ptr %p, <vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv4i32(<vscale x 4 x i32> %vec, ptr align 4 %p, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_nxv2i64(ptr %p, <vscale x 2 x i64> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> %vec, ptr align 8 %p, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_nxv4f32(ptr %p, <vscale x 4 x float> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv4f32(<vscale x 4 x float> %vec, ptr align 4 %p, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_nxv2f64(ptr %p, <vscale x 2 x double> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv2f64(<vscale x 2 x double> %vec, ptr align 8 %p, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+;; SVE vectors that will be split
+
+define void @test_compressstore_nxv8i32(ptr %p, <vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: cntp x8, p1, p1.s
+; CHECK-NEXT: compact z1.s, p1, z1.s
+; CHECK-NEXT: cntp x9, p0, p0.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: whilelo p1.s, xzr, x9
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
+; CHECK-NEXT: st1w { z0.s }, p1, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv8i32(<vscale x 8 x i32> %vec, ptr align 4 %p, <vscale x 8 x i1> %mask)
+ ret void
+}
+
+;; Unpacked SVE vector types
+
+define void @test_compressstore_nxv2f32(ptr %p, <vscale x 2 x float> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1w { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv2f32(<vscale x 2 x float> %vec, ptr align 4 %p, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+;; SVE vector types promoted to 32/64-bit (non-exhaustive)
+
+define void @test_compressstore_nxv2i8(ptr %p, <vscale x 2 x i8> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1b { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> %vec, ptr align 1 %p, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_nxv4i16(ptr %p, <vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: st1h { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv4i16(<vscale x 4 x i16> %vec, ptr align 2 %p, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+;; NEON vector types (promoted to SVE)
+
+define void @test_compressstore_v2f64(ptr %p, <2 x double> %vec, <2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: shl v1.2d, v1.2d, #63
+; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.v2f64(<2 x double> %vec, ptr align 8 %p, <2 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_v4i32(ptr %p, <4 x i32> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: shl v1.4s, v1.4s, #31
+; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %vec, ptr align 4 %p, <4 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_v2i64(ptr %p, <2 x i64> %vec, <2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: shl v1.2d, v1.2d, #63
+; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %vec, ptr align 8 %p, <2 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_v8i32(ptr %p, <8 x i32> %vec, <8 x i1> %mask) {
+; CHECK-BASE-LABEL: test_compressstore_v8i32:
+; CHECK-BASE: // %bb.0:
+; CHECK-BASE-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-BASE-NEXT: zip1 v3.8b, v2.8b, v0.8b
+; CHECK-BASE-NEXT: adrp x8, .LCPI11_0
+; CHECK-BASE-NEXT: zip2 v2.8b, v2.8b, v0.8b
+; CHECK-BASE-NEXT: ldr d5, [x8, :lo12:.LCPI11_0]
+; CHECK-BASE-NEXT: ptrue p0.s
+; CHECK-BASE-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-BASE-NEXT: ptrue p1.s, vl4
+; CHECK-BASE-NEXT: shl v4.4h, v3.4h, #15
+; CHECK-BASE-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-BASE-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-BASE-NEXT: cmlt v4.4h, v4.4h, #0
+; CHECK-BASE-NEXT: shl v2.4s, v2.4s, #31
+; CHECK-BASE-NEXT: shl v3.4s, v3.4s, #31
+; CHECK-BASE-NEXT: and v4.8b, v4.8b, v5.8b
+; CHECK-BASE-NEXT: addv h4, v4.4h
+; CHECK-BASE-NEXT: fmov w8, s4
+; CHECK-BASE-NEXT: and w8, w8, #0xf
+; CHECK-BASE-NEXT: fmov s4, w8
+; CHECK-BASE-NEXT: cnt z4.s, p0/m, z4.s
+; CHECK-BASE-NEXT: cmpne p0.s, p1/z, z2.s, #0
+; CHECK-BASE-NEXT: cmpne p1.s, p1/z, z3.s, #0
+; CHECK-BASE-NEXT: cntp x8, p0, p0.s
+; CHECK-BASE-NEXT: compact z1.s, p0, z1.s
+; CHECK-BASE-NEXT: compact z0.s, p1, z0.s
+; CHECK-BASE-NEXT: cntp x9, p1, p1.s
+; CHECK-BASE-NEXT: fmov w10, s4
+; CHECK-BASE-NEXT: whilelo p0.s, xzr, x8
+; CHECK-BASE-NEXT: whilelo p1.s, xzr, x9
+; CHECK-BASE-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2]
+; CHECK-BASE-NEXT: st1w { z0.s }, p1, [x0]
+; CHECK-BASE-NEXT: ret
+;
+; CHECK-VL256-LABEL: test_compressstore_v8i32:
+; CHECK-VL256: // %bb.0:
+; CHECK-VL256-NEXT: // kill: def $d2 killed $d2 def $z2
+; CHECK-VL256-NEXT: ptrue p0.s, vl8
+; CHECK-VL256-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-VL256-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-VL256-NEXT: uunpklo z2.h, z2.b
+; CHECK-VL256-NEXT: ptrue p1.s, vl4
+; CHECK-VL256-NEXT: splice z0.s, p1, z0.s, z1.s
+; CHECK-VL256-NEXT: uunpklo z2.s, z2.h
+; CHECK-VL256-NEXT: lsl z2.s, z2.s, #31
+; CHECK-VL256-NEXT: asr z2.s, z2.s, #31
+; CHECK-VL256-NEXT: cmpne p0.s, p0/z, z2.s, #0
+; CHECK-VL256-NEXT: cntp x8, p0, p0.s
+; CHECK-VL256-NEXT: compact z0.s, p0, z0.s
+; CHECK-VL256-NEXT: whilelo p0.s, xzr, x8
+; CHECK-VL256-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-VL256-NEXT: ret
+ tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %vec, ptr align 4 %p, <8 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_v4i64(ptr %p, <4 x i64> %vec, <4 x i1> %mask) {
+; CHECK-BASE-LABEL: test_compressstore_v4i64:
+; CHECK-BASE: // %bb.0:
+; CHECK-BASE-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-BASE-NEXT: index z4.s, #1, #1
+; CHECK-BASE-NEXT: ptrue p0.s
+; CHECK-BASE-NEXT: ptrue p1.d, vl2
+; CHECK-BASE-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-BASE-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-BASE-NEXT: shl v3.2s, v2.2s, #31
+; CHECK-BASE-NEXT: cmlt v3.2s, v3.2s, #0
+; CHECK-BASE-NEXT: and v3.8b, v3.8b, v4.8b
+; CHECK-BASE-NEXT: ushll2 v4.2d, v2.4s, #0
+; CHECK-BASE-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-BASE-NEXT: addp v3.2s, v3.2s, v3.2s
+; CHECK-BASE-NEXT: shl v2.2d, v2.2d, #63
+; CHECK-BASE-NEXT: fmov w8, s3
+; CHECK-BASE-NEXT: shl v3.2d, v4.2d, #63
+; CHECK-BASE-NEXT: and w8, w8, #0x3
+; CHECK-BASE-NEXT: fmov s4, w8
+; CHECK-BASE-NEXT: cnt z4.s, p0/m, z4.s
+; CHECK-BASE-NEXT: cmpne p0.d, p1/z, z3.d, #0
+; CHECK-BASE-NEXT: cmpne p1.d, p1/z, z2.d, #0
+; CHECK-BASE-NEXT: cntp x8, p0, p0.d
+; CHECK-BASE-NEXT: compact z1.d, p0, z1.d
+; CHECK-BASE-NEXT: compact z0.d, p1, z0.d
+; CHECK-BASE-NEXT: cntp x9, p1, p1.d
+; CHECK-BASE-NEXT: fmov w10, s4
+; CHECK-BASE-NEXT: whilelo p0.d, xzr, x8
+; CHECK-BASE-NEXT: whilelo p1.d, xzr, x9
+; CHECK-BASE-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3]
+; CHECK-BASE-NEXT: st1d { z0.d }, p1, [x0]
+; CHECK-BASE-NEXT: ret
+;
+; CHECK-VL256-LABEL: test_compressstore_v4i64:
+; CHECK-VL256: // %bb.0:
+; CHECK-VL256-NEXT: // kill: def $d2 killed $d2 def $z2
+; CHECK-VL256-NEXT: ptrue p0.d, vl4
+; CHECK-VL256-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-VL256-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-VL256-NEXT: uunpklo z2.s, z2.h
+; CHECK-VL256-NEXT: ptrue p1.d, vl2
+; CHECK-VL256-NEXT: splice z0.d, p1, z0.d, z1.d
+; CHECK-VL256-NEXT: uunpklo z2.d, z2.s
+; CHECK-VL256-NEXT: lsl z2.d, z2.d, #63
+; CHECK-VL256-NEXT: asr z2.d, z2.d, #63
+; CHECK-VL256-NEXT: cmpne p0.d, p0/z, z2.d, #0
+; CHECK-VL256-NEXT: cntp x8, p0, p0.d
+; CHECK-VL256-NEXT: compact z0.d, p0, z0.d
+; CHECK-VL256-NEXT: whilelo p0.d, xzr, x8
+; CHECK-VL256-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-VL256-NEXT: ret
+ tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %vec, ptr align 8 %p, <4 x i1> %mask)
+ ret void
+}
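A note on the `TargetLowering::IncrementMemoryAddress` change above: for
scalable vectors, the pointer increment for a compressed store is now the
number of active mask lanes (`VECREDUCE_ADD` of the zero-extended mask)
multiplied by the element size in bytes. A scalar C++ analogue of that
computation (illustrative only; the function name is made up and this is not
LLVM code):

```cpp
#include <cstddef>
#include <cstdint>

// Byte increment to apply to the base pointer after a compressed store:
// (number of set mask lanes) * (element size in bytes).
uint64_t compressedStoreIncrement(const bool *mask, size_t numLanes,
                                  uint64_t elemSizeInBytes) {
  uint64_t activeLanes = 0;
  for (size_t i = 0; i < numLanes; ++i)
    activeLanes += mask[i] ? 1 : 0;    // VECREDUCE_ADD of the zero-extended mask
  return activeLanes * elemSizeInBytes;  // ISD::MUL by the Scale constant
}
```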