[llvm] 135ddf1 - [AArch64][SVE] Add basic support for `@llvm.masked.compressstore` (#168350)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 28 02:17:41 PST 2025
Author: Benjamin Maxwell
Date: 2025-11-28T10:17:36Z
New Revision: 135ddf1e8efef7c8ac9b01caa660210e3ca12327
URL: https://github.com/llvm/llvm-project/commit/135ddf1e8efef7c8ac9b01caa660210e3ca12327
DIFF: https://github.com/llvm/llvm-project/commit/135ddf1e8efef7c8ac9b01caa660210e3ca12327.diff
LOG: [AArch64][SVE] Add basic support for `@llvm.masked.compressstore` (#168350)
This patch adds SVE support for the `masked.compressstore` intrinsic via
the existing `VECTOR_COMPRESS` lowering: the store value is compressed with
`VECTOR_COMPRESS`, the active mask lanes are counted with `VECREDUCE_ADD`,
and the compressed store mask is rebuilt with `GET_ACTIVE_LANE_MASK`.
Currently, only `nxv4[i32|f32]` and `nxv2[i64|f64]` are directly
supported; other types are promoted to these where possible.
This is done in preparation for loop-vectorizer (LV) support of this
intrinsic, which is currently being worked on in #140723.
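For reference, a rough C++ sketch of the instruction sequence this lowering
produces for an `nxv4f32` compress-store, written with SVE ACLE intrinsics
(an illustration only, not code from this patch; the helper name and the use
of `<arm_sve.h>` are assumptions, and it requires an SVE-enabled compile,
e.g. `-march=armv8-a+sve`):

```cpp
#include <arm_sve.h>  // SVE ACLE intrinsics

// Mirrors the cntp + compact + whilelo + st1w sequence in the new tests.
void compressstore_f32(float *p, svfloat32_t vec, svbool_t mask) {
  uint64_t n = svcntp_b32(mask, mask);            // count active lanes (cntp)
  svfloat32_t packed = svcompact_f32(mask, vec);  // move active lanes to the front (compact)
  svbool_t store_mask = svwhilelt_b32_u64(0, n);  // predicate covering the first n lanes (whilelo)
  svst1_f32(store_mask, p, packed);               // contiguous masked store (st1w)
}
```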
Added:
llvm/test/CodeGen/AArch64/sve-masked-compressstore-sve2p2.ll
llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
Modified:
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5684e0e4c26c4..521d8f07434e6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10607,23 +10607,26 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask,
assert(DataVT.getVectorElementCount() == MaskVT.getVectorElementCount() &&
"Incompatible types of Data and Mask");
if (IsCompressedMemory) {
- if (DataVT.isScalableVector())
- report_fatal_error(
- "Cannot currently handle compressed memory with scalable vectors");
// Incrementing the pointer according to number of '1's in the mask.
- EVT MaskIntVT = EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits());
- SDValue MaskInIntReg = DAG.getBitcast(MaskIntVT, Mask);
- if (MaskIntVT.getSizeInBits() < 32) {
- MaskInIntReg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskInIntReg);
- MaskIntVT = MVT::i32;
+ if (DataVT.isScalableVector()) {
+ EVT MaskExtVT = MaskVT.changeElementType(MVT::i32);
+ SDValue MaskExt = DAG.getNode(ISD::ZERO_EXTEND, DL, MaskExtVT, Mask);
+ Increment = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, MaskExt);
+ } else {
+ EVT MaskIntVT =
+ EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits());
+ SDValue MaskInIntReg = DAG.getBitcast(MaskIntVT, Mask);
+ if (MaskIntVT.getSizeInBits() < 32) {
+ MaskInIntReg =
+ DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskInIntReg);
+ MaskIntVT = MVT::i32;
+ }
+ Increment = DAG.getNode(ISD::CTPOP, DL, MaskIntVT, MaskInIntReg);
}
-
- // Count '1's with POPCNT.
- Increment = DAG.getNode(ISD::CTPOP, DL, MaskIntVT, MaskInIntReg);
- Increment = DAG.getZExtOrTrunc(Increment, DL, AddrVT);
// Scale is an element size in bytes.
SDValue Scale = DAG.getConstant(DataVT.getScalarSizeInBits() / 8, DL,
AddrVT);
+ Increment = DAG.getZExtOrTrunc(Increment, DL, AddrVT);
Increment = DAG.getNode(ISD::MUL, DL, AddrVT, Increment, Scale);
} else if (DataVT.isScalableVector()) {
Increment = DAG.getVScale(DL, AddrVT,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a1400f4cecdac..3661467b4fb67 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1987,10 +1987,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
// We can lower types that have <vscale x {2|4}> elements to compact.
- for (auto VT :
- {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
- MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
+ for (auto VT : {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64,
+ MVT::nxv2f32, MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16,
+ MVT::nxv4i32, MVT::nxv4f32}) {
setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
+ // Use a custom lowering for masked stores that could be a supported
+ // compressing store. Note: These types still use the normal (Legal)
+ // lowering for non-compressing masked stores.
+ setOperationAction(ISD::MSTORE, VT, Custom);
+ }
// If we have SVE, we can use SVE logic for legal (or smaller than legal)
// NEON vectors in the lowest bits of the SVE register.
@@ -7936,7 +7941,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::STORE:
return LowerSTORE(Op, DAG);
case ISD::MSTORE:
- return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
+ return LowerMSTORE(Op, DAG);
case ISD::MGATHER:
return LowerMGATHER(Op, DAG);
case ISD::MSCATTER:
@@ -30439,6 +30444,43 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
Store->isTruncatingStore());
}
+SDValue AArch64TargetLowering::LowerMSTORE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ auto *Store = cast<MaskedStoreSDNode>(Op);
+ EVT VT = Store->getValue().getValueType();
+ if (VT.isFixedLengthVector())
+ return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
+
+ if (!Store->isCompressingStore())
+ return SDValue();
+
+ EVT MaskVT = Store->getMask().getValueType();
+ EVT MaskExtVT = getPromotedVTForPredicate(MaskVT);
+ EVT MaskReduceVT = MaskExtVT.getScalarType();
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+
+ SDValue MaskExt =
+ DAG.getNode(ISD::ZERO_EXTEND, DL, MaskExtVT, Store->getMask());
+ SDValue CntActive =
+ DAG.getNode(ISD::VECREDUCE_ADD, DL, MaskReduceVT, MaskExt);
+ if (MaskReduceVT != MVT::i64)
+ CntActive = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CntActive);
+
+ SDValue CompressedValue =
+ DAG.getNode(ISD::VECTOR_COMPRESS, DL, VT, Store->getValue(),
+ Store->getMask(), DAG.getPOISON(VT));
+ SDValue CompressedMask =
+ DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT, Zero, CntActive);
+
+ return DAG.getMaskedStore(Store->getChain(), DL, CompressedValue,
+ Store->getBasePtr(), Store->getOffset(),
+ CompressedMask, Store->getMemoryVT(),
+ Store->getMemOperand(), Store->getAddressingMode(),
+ Store->isTruncatingStore(),
+ /*isCompressing=*/false);
+}
+
SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
SDValue Op, SelectionDAG &DAG) const {
auto *Store = cast<MaskedStoreSDNode>(Op);
@@ -30453,7 +30495,8 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
return DAG.getMaskedStore(
Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
Mask, Store->getMemoryVT(), Store->getMemOperand(),
- Store->getAddressingMode(), Store->isTruncatingStore());
+ Store->getAddressingMode(), Store->isTruncatingStore(),
+ Store->isCompressingStore());
}
SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index ca08eb40c956a..32aa913181a21 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -761,6 +761,7 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index b4d8649b31d6d..da93a2b13fc11 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -645,29 +645,34 @@ def nontrunc_masked_store :
(masked_st node:$val, node:$ptr, undef, node:$pred), [{
return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
cast<MaskedStoreSDNode>(N)->isUnindexed() &&
- !cast<MaskedStoreSDNode>(N)->isNonTemporal();
+ !cast<MaskedStoreSDNode>(N)->isNonTemporal() &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
// truncating masked store fragments.
def trunc_masked_store :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
(masked_st node:$val, node:$ptr, undef, node:$pred), [{
return cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
- cast<MaskedStoreSDNode>(N)->isUnindexed();
+ cast<MaskedStoreSDNode>(N)->isUnindexed() &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
def trunc_masked_store_i8 :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
(trunc_masked_store node:$val, node:$ptr, node:$pred), [{
- return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8 &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
def trunc_masked_store_i16 :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
(trunc_masked_store node:$val, node:$ptr, node:$pred), [{
- return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16 &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
def trunc_masked_store_i32 :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
(trunc_masked_store node:$val, node:$ptr, node:$pred), [{
- return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32 &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
def non_temporal_store :
@@ -675,7 +680,8 @@ def non_temporal_store :
(masked_st node:$val, node:$ptr, undef, node:$pred), [{
return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
cast<MaskedStoreSDNode>(N)->isUnindexed() &&
- cast<MaskedStoreSDNode>(N)->isNonTemporal();
+ cast<MaskedStoreSDNode>(N)->isNonTemporal() &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
multiclass masked_gather_scatter<PatFrags GatherScatterOp> {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 52fc28a98449b..fe3bb5e7981d2 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -334,6 +334,23 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
return isLegalMaskedLoadStore(DataType, Alignment);
}
+ bool isElementTypeLegalForCompressStore(Type *Ty) const {
+ return Ty->isFloatTy() || Ty->isDoubleTy() || Ty->isIntegerTy(32) ||
+ Ty->isIntegerTy(64);
+ }
+
+ bool isLegalMaskedCompressStore(Type *DataType,
+ Align Alignment) const override {
+ if (!ST->isSVEAvailable())
+ return false;
+
+ if (isa<FixedVectorType>(DataType) &&
+ DataType->getPrimitiveSizeInBits() < 128)
+ return false;
+
+ return isElementTypeLegalForCompressStore(DataType->getScalarType());
+ }
+
bool isLegalMaskedGatherScatter(Type *DataType) const {
if (!ST->isSVEAvailable())
return false;
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-compressstore-sve2p2.ll b/llvm/test/CodeGen/AArch64/sve-masked-compressstore-sve2p2.ll
new file mode 100644
index 0000000000000..92ecc3c83e2c5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-compressstore-sve2p2.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=aarch64 -mattr=+sve2p2 < %s
+
+;; These masked.compressstore operations could be natively supported with +sve2p2
+;; (or by promoting to 32/64 bit elements + a truncstore), but currently are not
+;; supported.
+
+; XFAIL: *
+
+define void @test_compressstore_nxv8i16(ptr %p, <vscale x 8 x i16> %vec, <vscale x 8 x i1> %mask) {
+ tail call void @llvm.masked.compressstore.nxv8i16(<vscale x 8 x i16> %vec, ptr align 2 %p, <vscale x 8 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_nxv16i8(ptr %p, <vscale x 16 x i8> %vec, <vscale x 16 x i1> %mask) {
+ tail call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> %vec, ptr align 1 %p, <vscale x 16 x i1> %mask)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll b/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
new file mode 100644
index 0000000000000..c698658afc8c4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
@@ -0,0 +1,280 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-BASE
+; RUN: llc -mtriple=aarch64 -aarch64-sve-vector-bits-min=256 -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-VL256
+
+;; Full SVE vectors (supported with +sve)
+
+define void @test_compressstore_nxv4i32(ptr %p, <vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv4i32(<vscale x 4 x i32> %vec, ptr align 4 %p, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_nxv2i64(ptr %p, <vscale x 2 x i64> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> %vec, ptr align 8 %p, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_nxv4f32(ptr %p, <vscale x 4 x float> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv4f32(<vscale x 4 x float> %vec, ptr align 4 %p, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_nxv2f64(ptr %p, <vscale x 2 x double> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv2f64(<vscale x 2 x double> %vec, ptr align 8 %p, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+;; SVE vectors that will be split
+
+define void @test_compressstore_nxv8i32(ptr %p, <vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: cntp x8, p1, p1.s
+; CHECK-NEXT: compact z1.s, p1, z1.s
+; CHECK-NEXT: cntp x9, p0, p0.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: whilelo p1.s, xzr, x9
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
+; CHECK-NEXT: st1w { z0.s }, p1, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv8i32(<vscale x 8 x i32> %vec, ptr align 4 %p, <vscale x 8 x i1> %mask)
+ ret void
+}
+
+;; Unpacked SVE vector types
+
+define void @test_compressstore_nxv2f32(ptr %p, <vscale x 2 x float> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1w { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv2f32(<vscale x 2 x float> %vec, ptr align 4 %p, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+;; SVE vector types promoted to 32/64-bit (non-exhaustive)
+
+define void @test_compressstore_nxv2i8(ptr %p, <vscale x 2 x i8> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1b { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> %vec, ptr align 1 %p, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_nxv4i16(ptr %p, <vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: st1h { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv4i16(<vscale x 4 x i16> %vec, ptr align 2 %p, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+;; NEON vector types (promoted to SVE)
+
+define void @test_compressstore_v2f64(ptr %p, <2 x double> %vec, <2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: shl v1.2d, v1.2d, #63
+; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.v2f64(<2 x double> %vec, ptr align 8 %p, <2 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_v4i32(ptr %p, <4 x i32> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: shl v1.4s, v1.4s, #31
+; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %vec, ptr align 4 %p, <4 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_v2i64(ptr %p, <2 x i64> %vec, <2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: shl v1.2d, v1.2d, #63
+; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %vec, ptr align 8 %p, <2 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_v8i32(ptr %p, <8 x i32> %vec, <8 x i1> %mask) {
+; CHECK-BASE-LABEL: test_compressstore_v8i32:
+; CHECK-BASE: // %bb.0:
+; CHECK-BASE-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-BASE-NEXT: zip1 v3.8b, v2.8b, v0.8b
+; CHECK-BASE-NEXT: adrp x8, .LCPI11_0
+; CHECK-BASE-NEXT: zip2 v2.8b, v2.8b, v0.8b
+; CHECK-BASE-NEXT: ldr d5, [x8, :lo12:.LCPI11_0]
+; CHECK-BASE-NEXT: ptrue p0.s
+; CHECK-BASE-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-BASE-NEXT: ptrue p1.s, vl4
+; CHECK-BASE-NEXT: shl v4.4h, v3.4h, #15
+; CHECK-BASE-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-BASE-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-BASE-NEXT: cmlt v4.4h, v4.4h, #0
+; CHECK-BASE-NEXT: shl v2.4s, v2.4s, #31
+; CHECK-BASE-NEXT: shl v3.4s, v3.4s, #31
+; CHECK-BASE-NEXT: and v4.8b, v4.8b, v5.8b
+; CHECK-BASE-NEXT: addv h4, v4.4h
+; CHECK-BASE-NEXT: fmov w8, s4
+; CHECK-BASE-NEXT: and w8, w8, #0xf
+; CHECK-BASE-NEXT: fmov s4, w8
+; CHECK-BASE-NEXT: cnt z4.s, p0/m, z4.s
+; CHECK-BASE-NEXT: cmpne p0.s, p1/z, z2.s, #0
+; CHECK-BASE-NEXT: cmpne p1.s, p1/z, z3.s, #0
+; CHECK-BASE-NEXT: cntp x8, p0, p0.s
+; CHECK-BASE-NEXT: compact z1.s, p0, z1.s
+; CHECK-BASE-NEXT: compact z0.s, p1, z0.s
+; CHECK-BASE-NEXT: cntp x9, p1, p1.s
+; CHECK-BASE-NEXT: fmov w10, s4
+; CHECK-BASE-NEXT: whilelo p0.s, xzr, x8
+; CHECK-BASE-NEXT: whilelo p1.s, xzr, x9
+; CHECK-BASE-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2]
+; CHECK-BASE-NEXT: st1w { z0.s }, p1, [x0]
+; CHECK-BASE-NEXT: ret
+;
+; CHECK-VL256-LABEL: test_compressstore_v8i32:
+; CHECK-VL256: // %bb.0:
+; CHECK-VL256-NEXT: // kill: def $d2 killed $d2 def $z2
+; CHECK-VL256-NEXT: ptrue p0.s, vl8
+; CHECK-VL256-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-VL256-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-VL256-NEXT: uunpklo z2.h, z2.b
+; CHECK-VL256-NEXT: ptrue p1.s, vl4
+; CHECK-VL256-NEXT: splice z0.s, p1, z0.s, z1.s
+; CHECK-VL256-NEXT: uunpklo z2.s, z2.h
+; CHECK-VL256-NEXT: lsl z2.s, z2.s, #31
+; CHECK-VL256-NEXT: asr z2.s, z2.s, #31
+; CHECK-VL256-NEXT: cmpne p0.s, p0/z, z2.s, #0
+; CHECK-VL256-NEXT: cntp x8, p0, p0.s
+; CHECK-VL256-NEXT: compact z0.s, p0, z0.s
+; CHECK-VL256-NEXT: whilelo p0.s, xzr, x8
+; CHECK-VL256-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-VL256-NEXT: ret
+ tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %vec, ptr align 4 %p, <8 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_v4i64(ptr %p, <4 x i64> %vec, <4 x i1> %mask) {
+; CHECK-BASE-LABEL: test_compressstore_v4i64:
+; CHECK-BASE: // %bb.0:
+; CHECK-BASE-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-BASE-NEXT: index z4.s, #1, #1
+; CHECK-BASE-NEXT: ptrue p0.s
+; CHECK-BASE-NEXT: ptrue p1.d, vl2
+; CHECK-BASE-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-BASE-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-BASE-NEXT: shl v3.2s, v2.2s, #31
+; CHECK-BASE-NEXT: cmlt v3.2s, v3.2s, #0
+; CHECK-BASE-NEXT: and v3.8b, v3.8b, v4.8b
+; CHECK-BASE-NEXT: ushll2 v4.2d, v2.4s, #0
+; CHECK-BASE-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-BASE-NEXT: addp v3.2s, v3.2s, v3.2s
+; CHECK-BASE-NEXT: shl v2.2d, v2.2d, #63
+; CHECK-BASE-NEXT: fmov w8, s3
+; CHECK-BASE-NEXT: shl v3.2d, v4.2d, #63
+; CHECK-BASE-NEXT: and w8, w8, #0x3
+; CHECK-BASE-NEXT: fmov s4, w8
+; CHECK-BASE-NEXT: cnt z4.s, p0/m, z4.s
+; CHECK-BASE-NEXT: cmpne p0.d, p1/z, z3.d, #0
+; CHECK-BASE-NEXT: cmpne p1.d, p1/z, z2.d, #0
+; CHECK-BASE-NEXT: cntp x8, p0, p0.d
+; CHECK-BASE-NEXT: compact z1.d, p0, z1.d
+; CHECK-BASE-NEXT: compact z0.d, p1, z0.d
+; CHECK-BASE-NEXT: cntp x9, p1, p1.d
+; CHECK-BASE-NEXT: fmov w10, s4
+; CHECK-BASE-NEXT: whilelo p0.d, xzr, x8
+; CHECK-BASE-NEXT: whilelo p1.d, xzr, x9
+; CHECK-BASE-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3]
+; CHECK-BASE-NEXT: st1d { z0.d }, p1, [x0]
+; CHECK-BASE-NEXT: ret
+;
+; CHECK-VL256-LABEL: test_compressstore_v4i64:
+; CHECK-VL256: // %bb.0:
+; CHECK-VL256-NEXT: // kill: def $d2 killed $d2 def $z2
+; CHECK-VL256-NEXT: ptrue p0.d, vl4
+; CHECK-VL256-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-VL256-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-VL256-NEXT: uunpklo z2.s, z2.h
+; CHECK-VL256-NEXT: ptrue p1.d, vl2
+; CHECK-VL256-NEXT: splice z0.d, p1, z0.d, z1.d
+; CHECK-VL256-NEXT: uunpklo z2.d, z2.s
+; CHECK-VL256-NEXT: lsl z2.d, z2.d, #63
+; CHECK-VL256-NEXT: asr z2.d, z2.d, #63
+; CHECK-VL256-NEXT: cmpne p0.d, p0/z, z2.d, #0
+; CHECK-VL256-NEXT: cntp x8, p0, p0.d
+; CHECK-VL256-NEXT: compact z0.d, p0, z0.d
+; CHECK-VL256-NEXT: whilelo p0.d, xzr, x8
+; CHECK-VL256-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-VL256-NEXT: ret
+ tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %vec, ptr align 8 %p, <4 x i1> %mask)
+ ret void
+}
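A note on the `TargetLowering::IncrementMemoryAddress` change above: for
scalable vectors, the pointer increment for a compressed store is now the
number of active mask lanes (`VECREDUCE_ADD` of the zero-extended mask)
multiplied by the element size in bytes. A scalar C++ analogue of that
computation (illustrative only; the function name is made up and this is not
LLVM code):

```cpp
#include <cstddef>
#include <cstdint>

// Byte increment to apply to the base pointer after a compressed store:
// (number of set mask lanes) * (element size in bytes).
uint64_t compressedStoreIncrement(const bool *mask, size_t numLanes,
                                  uint64_t elemSizeInBytes) {
  uint64_t activeLanes = 0;
  for (size_t i = 0; i < numLanes; ++i)
    activeLanes += mask[i] ? 1 : 0;    // VECREDUCE_ADD of the zero-extended mask
  return activeLanes * elemSizeInBytes;  // ISD::MUL by the Scale constant
}
```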