[llvm] [AArch64] Add lowering for `@llvm.experimental.vector.compress` (PR #101015)

Lawrence Benson via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 29 07:11:07 PDT 2024


https://github.com/lawben created https://github.com/llvm/llvm-project/pull/101015

This is a follow-up to #92289 that adds custom lowering of the new `@llvm.experimental.vector.compress` intrinsic on AArch64 with SVE instructions.

Vectors with 2 or 4 (possibly scalable) elements can be lowered directly to SVE's `compact` instruction. For the other legal i8/i16 cases, we lower to a compressing store using SVE's `st1b`/`st1h` and reload the result from a stack slot.

TODO: I still need to run this on an SVE machine. 
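
For context, the lowering has to implement the same semantics as the scalar loop below. This is only an illustrative sketch (the templated `compress` helper is hypothetical, not part of LLVM): lanes whose mask bit is set are packed into the low lanes of the result, and the remaining lanes keep the passthru value.

```cpp
#include <array>
#include <cstddef>

// Scalar model of @llvm.experimental.vector.compress: selected lanes are
// packed to the front of the result; the tail comes from the passthru.
template <typename T, std::size_t N>
std::array<T, N> compress(const std::array<T, N> &Vec,
                          const std::array<bool, N> &Mask,
                          const std::array<T, N> &Passthru) {
  std::array<T, N> Out = Passthru;
  std::size_t OutIdx = 0;
  for (std::size_t I = 0; I < N; ++I)
    if (Mask[I])
      Out[OutIdx++] = Vec[I];
  return Out;
}
```

With an undef passthru the tail lanes are unspecified, which is what allows `compact` (which zero-fills the tail) to be used directly.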

From 24780e80a2f1e9ae1d7979dcd15f2894040f702c Mon Sep 17 00:00:00 2001
From: Lawrence Benson <github at lawben.com>
Date: Mon, 29 Jul 2024 15:22:15 +0200
Subject: [PATCH 1/5] Add AArch64 lowering for
 @llvm.experimental.vector.compress

---
 .../SelectionDAG/LegalizeVectorTypes.cpp      |  62 +++++++-
 .../Target/AArch64/AArch64ISelLowering.cpp    | 148 ++++++++++++++++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   2 +
 3 files changed, 206 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5672b611234b8..f8981255f8dd6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -2408,11 +2408,61 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo,
                                                    SDValue &Hi) {
   // This is not "trivial", as there is a dependency between the two subvectors.
   // Depending on the number of 1s in the mask, the elements from the Hi vector
-  // need to be moved to the Lo vector. So we just perform this as one "big"
-  // operation and then extract the Lo and Hi vectors from that. This gets rid
-  // of VECTOR_COMPRESS and all other operands can be legalized later.
-  SDValue Compressed = TLI.expandVECTOR_COMPRESS(N, DAG);
-  std::tie(Lo, Hi) = DAG.SplitVector(Compressed, SDLoc(N));
+  // need to be moved to the Lo vector. Passthru values make this even harder.
+  // We try to use MASKED_COMPRESS if the target has custom lowering with
+  // smaller types and passthru is undef, as it is most likely faster than the
+  // fully expand path. Otherwise, just do the full expansion as one "big"
+  // operation and then extract the Lo and Hi vectors from that. This gets
+  // rid of MASKED_COMPRESS and all other operands can be legalized later.
+  SDLoc DL(N);
+  EVT VecVT = N->getValueType(0);
+
+  auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
+  bool HasLegalOrCustom = false;
+  EVT CheckVT = LoVT;
+  while (CheckVT.getVectorMinNumElements() > 1) {
+    if (TLI.isOperationLegalOrCustom(ISD::VECTOR_COMPRESS, CheckVT)) {
+      HasLegalOrCustom = true;
+      break;
+    }
+    CheckVT = CheckVT.getHalfNumVectorElementsVT(*DAG.getContext());
+  }
+
+  SDValue Passthru = N->getOperand(2);
+  if (!HasLegalOrCustom || !Passthru.isUndef()) {
+    SDValue Compressed = TLI.expandVECTOR_COMPRESS(N, DAG);
+    std::tie(Lo, Hi) = DAG.SplitVector(Compressed, DL, LoVT, HiVT);
+    return;
+  }
+
+  // Try to VECTOR_COMPRESS smaller vectors and combine via a stack store+load.
+  SDValue LoMask, HiMask;
+  std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+  std::tie(LoMask, HiMask) = SplitMask(N->getOperand(1));
+
+  SDValue UndefPassthru = DAG.getUNDEF(LoVT);
+  Lo = DAG.getNode(ISD::VECTOR_COMPRESS, DL, LoVT, Lo, LoMask, UndefPassthru);
+  Hi = DAG.getNode(ISD::VECTOR_COMPRESS, DL, HiVT, Hi, HiMask, UndefPassthru);
+
+  SDValue StackPtr = DAG.CreateStackTemporary(
+      VecVT.getStoreSize(), DAG.getReducedAlign(VecVT, /*UseABI=*/false));
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(
+      MF, cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex());
+
+  // We store LoVec and then insert HiVec starting at offset=|1s| in LoMask.
+  SDValue WideMask =
+      DAG.getNode(ISD::ZERO_EXTEND, DL, LoMask.getValueType(), LoMask);
+  SDValue Offset = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, WideMask);
+  Offset = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Offset);
+
+  SDValue Chain = DAG.getEntryNode();
+  Chain = DAG.getStore(Chain, DL, Lo, StackPtr, PtrInfo);
+  Chain = DAG.getStore(Chain, DL, Hi, Offset,
+                       MachinePointerInfo::getUnknownStack(MF));
+
+  SDValue Compressed = DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo);
+  std::tie(Lo, Hi) = DAG.SplitVector(Compressed, DL);
 }
 
 void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
@@ -5784,7 +5834,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_COMPRESS(SDNode *N) {
       TLI.getTypeToTransformTo(*DAG.getContext(), Vec.getValueType());
   EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
                                     Mask.getValueType().getVectorElementType(),
-                                    WideVecVT.getVectorNumElements());
+                                    WideVecVT.getVectorElementCount());
 
   SDValue WideVec = ModifyToType(Vec, WideVecVT);
   SDValue WideMask = ModifyToType(Mask, WideMaskVT, /*FillWithZeroes=*/true);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1e9da9b819bdd..8836674999a0f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1535,6 +1535,23 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       }
     }
 
+    // We can lower types that have <vscale x {2|4}> elements to svcompact and
+    // legal i8/i16 types via a compressing store.
+    for (auto VT :
+         {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
+          MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32,
+          MVT::nxv8i8, MVT::nxv8i16, MVT::nxv16i8})
+      setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
+
+    // If we have SVE, we can use SVE logic for legal (or smaller than legal)
+    // NEON vectors in the lowest bits of the SVE register.
+    if (Subtarget->hasSVE())
+      for (auto VT : {MVT::v1i8, MVT::v1i16, MVT::v1i32, MVT::v1i64, MVT::v1f32,
+                      MVT::v1f64, MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64,
+                      MVT::v2f32, MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32,
+                      MVT::v4f32, MVT::v8i8, MVT::v8i16, MVT::v8i16, MVT::v16i8})
+        setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
+
     // NEON doesn't support masked loads/stores, but SME and SVE do.
     for (auto VT :
          {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
@@ -6615,6 +6632,131 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
   return DAG.getMergeValues({Ext, Chain}, DL);
 }
 
+SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue Mask = Op.getOperand(1);
+  SDValue Passthru = Op.getOperand(2);
+  EVT VecVT = Vec.getValueType();
+  EVT MaskVT = Mask.getValueType();
+  EVT ElmtVT = VecVT.getVectorElementType();
+  const bool IsFixedLength = VecVT.isFixedLengthVector();
+  const bool HasPassthru = !Passthru.isUndef();
+  unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
+  EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
+
+  assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
+
+  if (!Subtarget->hasSVE())
+    return SDValue();
+
+  if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
+    return SDValue();
+
+  // We can use the SVE register containing the NEON vector in its lowest bits.
+  if (IsFixedLength) {
+    EVT ScalableVecVT =
+        MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
+    EVT ScalableMaskVT = MVT::getScalableVectorVT(
+        MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
+
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
+                      DAG.getUNDEF(ScalableVecVT), Vec,
+                      DAG.getConstant(0, DL, MVT::i64));
+    Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
+                       DAG.getUNDEF(ScalableMaskVT), Mask,
+                       DAG.getConstant(0, DL, MVT::i64));
+    Mask = DAG.getNode(ISD::TRUNCATE, DL,
+                       ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
+    Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
+                           DAG.getUNDEF(ScalableVecVT), Passthru,
+                           DAG.getConstant(0, DL, MVT::i64));
+
+    VecVT = Vec.getValueType();
+    MaskVT = Mask.getValueType();
+  }
+
+  // Special case where we can't use svcompact but can do a compressing store
+  // and then reload the vector.
+  if (VecVT == MVT::nxv8i8 || VecVT == MVT::nxv16i8 || VecVT == MVT::nxv8i16) {
+    SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+    int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+    MachinePointerInfo PtrInfo =
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+
+    MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+        PtrInfo, MachineMemOperand::Flags::MOStore,
+        LocationSize::precise(VecVT.getStoreSize()),
+        DAG.getReducedAlign(VecVT, /*UseABI=*/false));
+
+    SDValue Chain = DAG.getEntryNode();
+    if (HasPassthru)
+      Chain = DAG.getStore(Chain, DL, Passthru, StackPtr, PtrInfo);
+
+    Chain = DAG.getMaskedStore(Chain, DL, Vec, StackPtr, DAG.getUNDEF(MVT::i64),
+                               Mask, VecVT, MMO, ISD::UNINDEXED, /*IsTruncating=*/false, /*IsCompressing=*/true);
+
+    SDValue Compressed = DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo);
+
+    if (IsFixedLength)
+      Compressed = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FixedVecVT,
+                               Compressed, DAG.getConstant(0, DL, MVT::i64));
+
+    return Compressed;
+  }
+
+  // Only <vscale x {2|4} x {i32|i64}> supported for svcompact.
+  if (MinElmts != 2 && MinElmts != 4)
+    return SDValue();
+
+  // Get legal type for svcompact instruction
+  EVT ContainerVT = getSVEContainerType(VecVT);
+  EVT CastVT = VecVT.changeVectorElementTypeToInteger();
+
+  // Convert to i32 or i64 for smaller types, as these are the only supported
+  // sizes for svcompact.
+  if (ContainerVT != VecVT) {
+    Vec = DAG.getBitcast(CastVT, Vec);
+    Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
+  }
+
+  SDValue Compressed = DAG.getNode(
+      ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(),
+      DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec);
+
+  // svcompact fills with 0s, so if our passthru is all 0s, do nothing here.
+  if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
+    SDValue Offset = DAG.getNode(
+        ISD::ZERO_EXTEND, DL, MaskVT.changeVectorElementType(MVT::i32), Mask);
+    Offset = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, Offset);
+    Compressed =
+        DAG.getNode(ISD::VP_MERGE, DL, VecVT,
+                    DAG.getSplatVector(MaskVT, DL,
+                                       DAG.getAllOnesConstant(
+                                           DL, MaskVT.getVectorElementType())),
+                    Compressed, Passthru, Offset);
+  }
+
+  // Extracting from a legal SVE type before truncating produces better code.
+  if (IsFixedLength) {
+    Compressed = DAG.getNode(
+        ISD::EXTRACT_SUBVECTOR, DL,
+        FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
+        Compressed, DAG.getConstant(0, DL, MVT::i64));
+    CastVT = FixedVecVT.changeVectorElementTypeToInteger();
+    VecVT = FixedVecVT;
+  }
+
+  // If we changed the element type before, we need to convert it back.
+  if (ContainerVT != VecVT) {
+    Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
+    Compressed = DAG.getBitcast(VecVT, Compressed);
+  }
+
+  return Compressed;
+}
+
 // Generate SUBS and CSEL for integer abs.
 SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
   MVT VT = Op.getSimpleValueType();
@@ -6995,6 +7137,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::VSCALE:
     return LowerVSCALE(Op, DAG);
+  case ISD::VECTOR_COMPRESS:
+    return LowerVECTOR_COMPRESS(Op, DAG);
   case ISD::ANY_EXTEND:
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
@@ -26214,6 +26358,10 @@ void AArch64TargetLowering::ReplaceNodeResults(
   case ISD::VECREDUCE_UMIN:
     Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
     return;
+  case ISD::VECTOR_COMPRESS:
+    if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
+      Results.push_back(Res);
+    return;
   case ISD::ADD:
   case ISD::FADD:
     ReplaceAddWithADDP(N, Results, DAG, Subtarget);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 81e15185f985d..517b1ba1fd400 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1073,6 +1073,8 @@ class AArch64TargetLowering : public TargetLowering {
 
   SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue LowerVECTOR_COMPRESS(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;

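A side note on the split legalization in PATCH 1/5 above: when the passthru is undef, the two halves are compressed independently and then concatenated through a stack temporary, with the compressed Hi half stored at an element offset equal to the number of set bits in the Lo mask. A scalar sketch of that idea, with hypothetical helper names:

```cpp
#include <algorithm>
#include <array>
#include <cstddef>

// Compress one half; lanes past the packed elements are unspecified
// (value-initialized here just to keep the sketch deterministic).
template <typename T, std::size_t N>
std::array<T, N> compressHalf(const std::array<T, N> &V,
                              const std::array<bool, N> &M) {
  std::array<T, N> Out{};
  std::size_t Idx = 0;
  for (std::size_t I = 0; I < N; ++I)
    if (M[I])
      Out[Idx++] = V[I];
  return Out;
}

// Scalar model of SplitVecRes_VECTOR_COMPRESS (undef-passthru path): store
// the compressed Lo half at offset 0, then store the compressed Hi half at
// offset = |set bits in LoMask|, overwriting Lo's unspecified tail. The wide
// result is then reloaded and re-split by the legalizer.
template <typename T, std::size_t N>
std::array<T, 2 * N> splitCompress(const std::array<T, N> &Lo,
                                   const std::array<bool, N> &LoMask,
                                   const std::array<T, N> &Hi,
                                   const std::array<bool, N> &HiMask) {
  std::array<T, N> CLo = compressHalf(Lo, LoMask);
  std::array<T, N> CHi = compressHalf(Hi, HiMask);
  std::array<T, 2 * N> Stack{};
  std::copy(CLo.begin(), CLo.end(), Stack.begin());
  std::size_t Offset =
      static_cast<std::size_t>(std::count(LoMask.begin(), LoMask.end(), true));
  std::copy(CHi.begin(), CHi.end(), Stack.begin() + Offset);
  return Stack;
}
```
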
From a01b7789dc74fc61094f40a33a2ac505348ec463 Mon Sep 17 00:00:00 2001
From: Lawrence Benson <github at lawben.com>
Date: Mon, 29 Jul 2024 15:27:44 +0200
Subject: [PATCH 2/5] Fix typo

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index f8981255f8dd6..b42a54a56cfed 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -2409,11 +2409,11 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo,
   // This is not "trivial", as there is a dependency between the two subvectors.
   // Depending on the number of 1s in the mask, the elements from the Hi vector
   // need to be moved to the Lo vector. Passthru values make this even harder.
-  // We try to use MASKED_COMPRESS if the target has custom lowering with
+  // We try to use VECTOR_COMPRESS if the target has custom lowering with
   // smaller types and passthru is undef, as it is most likely faster than the
   // fully expand path. Otherwise, just do the full expansion as one "big"
   // operation and then extract the Lo and Hi vectors from that. This gets
-  // rid of MASKED_COMPRESS and all other operands can be legalized later.
+  // rid of VECTOR_COMPRESS and all other operands can be legalized later.
   SDLoc DL(N);
   EVT VecVT = N->getValueType(0);
 

From 7c11026f1adb53c86f0590b1d022e8d7c35cd239 Mon Sep 17 00:00:00 2001
From: Lawrence Benson <github at lawben.com>
Date: Mon, 29 Jul 2024 15:30:02 +0200
Subject: [PATCH 3/5] Fix formatting

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8836674999a0f..b76641e9df8bf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1546,10 +1546,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     // If we have SVE, we can use SVE logic for legal (or smaller than legal)
     // NEON vectors in the lowest bits of the SVE register.
     if (Subtarget->hasSVE())
-      for (auto VT : {MVT::v1i8, MVT::v1i16, MVT::v1i32, MVT::v1i64, MVT::v1f32,
-                      MVT::v1f64, MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64,
-                      MVT::v2f32, MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32,
-                      MVT::v4f32, MVT::v8i8, MVT::v8i16, MVT::v8i16, MVT::v16i8})
+      for (auto VT :
+           {MVT::v1i8,  MVT::v1i16, MVT::v1i32, MVT::v1i64, MVT::v1f32,
+            MVT::v1f64, MVT::v2i8,  MVT::v2i16, MVT::v2i32, MVT::v2i64,
+            MVT::v2f32, MVT::v2f64, MVT::v4i8,  MVT::v4i16, MVT::v4i32,
+            MVT::v4f32, MVT::v8i8,  MVT::v8i16, MVT::v8i16, MVT::v16i8})
         setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
 
     // NEON doesn't support masked loads/stores, but SME and SVE do.
@@ -6695,7 +6696,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
       Chain = DAG.getStore(Chain, DL, Passthru, StackPtr, PtrInfo);
 
     Chain = DAG.getMaskedStore(Chain, DL, Vec, StackPtr, DAG.getUNDEF(MVT::i64),
-                               Mask, VecVT, MMO, ISD::UNINDEXED, /*IsTruncating=*/false, /*IsCompressing=*/true);
+                               Mask, VecVT, MMO, ISD::UNINDEXED,
+                               /*IsTruncating=*/false, /*IsCompressing=*/true);
 
     SDValue Compressed = DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo);
 

From 76a15b21e889af3e8db1ad065f73c0a8731ba90b Mon Sep 17 00:00:00 2001
From: Lawrence Benson <github at lawben.com>
Date: Mon, 29 Jul 2024 15:54:39 +0200
Subject: [PATCH 4/5] Add tests

---
 .../CodeGen/AArch64/sve-vector-compress.ll    | 500 ++++++++++++++++++
 1 file changed, 500 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-vector-compress.ll

diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
new file mode 100644
index 0000000000000..ea9a77c11c53a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
@@ -0,0 +1,500 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 2 x i8> @test_compress_nxv2i8(<vscale x 2 x i8> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x i8> @llvm.experimental.vector.compress(<vscale x 2 x i8> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+    ret <vscale x 2 x i8> %out
+}
+
+define <vscale x 2 x i16> @test_compress_nxv2i16(<vscale x 2 x i16> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x i16> @llvm.experimental.vector.compress(<vscale x 2 x i16> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+    ret <vscale x 2 x i16> %out
+}
+
+define <vscale x 2 x i32> @test_compress_nxv2i32(<vscale x 2 x i32> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x i32> @llvm.experimental.vector.compress(<vscale x 2 x i32> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+    ret <vscale x 2 x i32> %out
+}
+
+define <vscale x 2 x i64> @test_compress_nxv2i64(<vscale x 2 x i64> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x i64> @llvm.experimental.vector.compress(<vscale x 2 x i64> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+    ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x float> @test_compress_nxv2f32(<vscale x 2 x float> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x float> @llvm.experimental.vector.compress(<vscale x 2 x float> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+    ret <vscale x 2 x float> %out
+}
+
+define <vscale x 2 x double> @test_compress_nxv2f64(<vscale x 2 x double> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x double> @llvm.experimental.vector.compress(<vscale x 2 x double> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+    ret <vscale x 2 x double> %out
+}
+
+define <vscale x 4 x i8> @test_compress_nxv4i8(<vscale x 4 x i8> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i8> @llvm.experimental.vector.compress(<vscale x 4 x i8> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+    ret <vscale x 4 x i8> %out
+}
+
+define <vscale x 4 x i16> @test_compress_nxv4i16(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i16> @llvm.experimental.vector.compress(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+    ret <vscale x 4 x i16> %out
+}
+
+define <vscale x 4 x i32> @test_compress_nxv4i32(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+    ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x float> @test_compress_nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x float> @llvm.experimental.vector.compress(<vscale x 4 x float> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+    ret <vscale x 4 x float> %out
+}
+
+define <vscale x 8 x i8> @test_compress_nxv8i8(<vscale x 8 x i8> %vec, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1b { z0.h }, p1/z, [sp, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+    %out = call <vscale x 8 x i8> @llvm.experimental.vector.compress(<vscale x 8 x i8> %vec, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
+    ret <vscale x 8 x i8> %out
+}
+
+define <vscale x 8 x i16> @test_compress_nxv8i16(<vscale x 8 x i16> %vec, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    ld1h { z0.h }, p1/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+    %out = call <vscale x 8 x i16> @llvm.experimental.vector.compress(<vscale x 8 x i16> %vec, <vscale x 8 x i1> %mask, <vscale x 8 x i16> undef)
+    ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 16 x i8> @test_compress_nxv16i8(<vscale x 16 x i8> %vec, <vscale x 16 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+    %out = call <vscale x 16 x i8> @llvm.experimental.vector.compress(<vscale x 16 x i8> %vec, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
+    ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 4 x i4> @test_compress_illegal_element_type(<vscale x 4 x i4> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_illegal_element_type:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i4> @llvm.experimental.vector.compress(<vscale x 4 x i4> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i4> undef)
+    ret <vscale x 4 x i4> %out
+}
+
+define <vscale x 8 x i32> @test_compress_large(<vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: test_compress_large:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    cnth x9
+; CHECK-NEXT:    ptrue p2.s
+; CHECK-NEXT:    sub x9, x9, #1
+; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    st1w { z0.s }, p1, [sp]
+; CHECK-NEXT:    cntp x8, p2, p1.s
+; CHECK-NEXT:    and x8, x8, #0xffffffff
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1w { z1.s }, p0, [x9, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z0.s }, p2/z, [sp]
+; CHECK-NEXT:    ld1w { z1.s }, p2/z, [sp, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+    %out = call <vscale x 8 x i32> @llvm.experimental.vector.compress(<vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask, <vscale x 8 x i32> undef)
+    ret <vscale x 8 x i32> %out
+}
+
+define <vscale x 64 x i8> @test_compress_very_large(<vscale x 64 x i8> %vec, <vscale x 64 x i1> %mask) {
+; CHECK-LABEL: test_compress_very_large:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-8
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p4.b
+; CHECK-NEXT:    rdvl x10, #2
+; CHECK-NEXT:    mov x11, sp
+; CHECK-NEXT:    sub x10, x10, #1
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    cntp x8, p4, p0.b
+; CHECK-NEXT:    cntp x9, p4, p2.b
+; CHECK-NEXT:    eor p0.b, p4/z, p0.b, p1.b
+; CHECK-NEXT:    and x8, x8, #0xffffffff
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    and x9, x9, #0xffffffff
+; CHECK-NEXT:    csel x8, x8, x10, lo
+; CHECK-NEXT:    cmp x9, x10
+; CHECK-NEXT:    st1b { z1.b }, p1, [x11, x8]
+; CHECK-NEXT:    addvl x8, sp, #2
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    st1b { z2.b }, p2, [sp, #2, mul vl]
+; CHECK-NEXT:    addvl x10, sp, #4
+; CHECK-NEXT:    st1b { z3.b }, p3, [x8, x9]
+; CHECK-NEXT:    cntp x8, p4, p0.b
+; CHECK-NEXT:    rdvl x9, #4
+; CHECK-NEXT:    ld1b { z0.b }, p4/z, [sp, #1, mul vl]
+; CHECK-NEXT:    sub x9, x9, #1
+; CHECK-NEXT:    st1b { z0.b }, p4, [sp, #5, mul vl]
+; CHECK-NEXT:    ld1b { z0.b }, p4/z, [sp]
+; CHECK-NEXT:    and x8, x8, #0xffffffff
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    st1b { z0.b }, p4, [sp, #4, mul vl]
+; CHECK-NEXT:    ld1b { z0.b }, p4/z, [sp, #2, mul vl]
+; CHECK-NEXT:    st1b { z0.b }, p4, [x10, x8]
+; CHECK-NEXT:    add x8, x10, x8
+; CHECK-NEXT:    ld1b { z0.b }, p4/z, [sp, #3, mul vl]
+; CHECK-NEXT:    st1b { z0.b }, p4, [x8, #1, mul vl]
+; CHECK-NEXT:    ld1b { z0.b }, p4/z, [sp, #4, mul vl]
+; CHECK-NEXT:    ld1b { z1.b }, p4/z, [sp, #5, mul vl]
+; CHECK-NEXT:    ld1b { z2.b }, p4/z, [sp, #6, mul vl]
+; CHECK-NEXT:    ld1b { z3.b }, p4/z, [sp, #7, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #8
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+    %out = call <vscale x 64 x i8> @llvm.experimental.vector.compress(<vscale x 64 x i8> %vec, <vscale x 64 x i1> %mask, <vscale x 64 x i8> undef)
+    ret <vscale x 64 x i8> %out
+}
+
+
+; We pass a placeholder value for the const_mask* tests to check that they are converted to a no-op by simply copying
+; the second vector input register to the ret register or doing nothing.
+define <vscale x 4 x i32> @test_compress_const_splat1_mask(<vscale x 4 x i32> %ignore, <vscale x 4 x i32> %vec) {
+; CHECK-LABEL: test_compress_const_splat1_mask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> splat (i1 -1), <vscale x 4 x i32> undef)
+    ret <vscale x 4 x i32> %out
+}
+define <vscale x 4 x i32> @test_compress_const_splat0_mask(<vscale x 4 x i32> %ignore, <vscale x 4 x i32> %vec) {
+; CHECK-LABEL: test_compress_const_splat0_mask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> splat (i1 0), <vscale x 4 x i32> undef)
+    ret <vscale x 4 x i32> %out
+}
+define <vscale x 4 x i32> @test_compress_undef_mask(<vscale x 4 x i32> %ignore, <vscale x 4 x i32> %vec) {
+; CHECK-LABEL: test_compress_undef_mask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
+    ret <vscale x 4 x i32> %out
+}
+
+define void @test_combine_compress_store_nxv16i8(<vscale x 16 x i8> %vec, <vscale x 16 x i1> %mask, ptr %ptr) {
+; CHECK-LABEL: test_combine_compress_store_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
+    %out = call <vscale x 16 x i8> @llvm.experimental.vector.compress(<vscale x 16 x i8> %vec, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
+    store <vscale x 16 x i8> %out, ptr %ptr
+    ret void
+}
+
+define void @test_combine_compress_store_nxv4i32(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, ptr %ptr) {
+; CHECK-LABEL: test_combine_compress_store_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+    store <vscale x 4 x i32> %out, ptr %ptr
+    ret void
+}
+
+
+define <4 x i32> @test_compress_v4i32_with_sve(<4 x i32> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compress_v4i32_with_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-NEXT:    cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT:    and z1.s, z1.s, #0x1
+; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> undef)
+    ret <4 x i32> %out
+}
+
+define <1 x i32> @test_compress_v1i32_with_sve(<1 x i32> %vec, <1 x i1> %mask) {
+; CHECK-LABEL: test_compress_v1i32_with_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    shl v1.2s, v1.2s, #31
+; CHECK-NEXT:    cmlt v1.2s, v1.2s, #0
+; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+    %out = call <1 x i32> @llvm.experimental.vector.compress(<1 x i32> %vec, <1 x i1> %mask, <1 x i32> undef)
+    ret <1 x i32> %out
+}
+
+define <8 x i16> @test_compress_v8i16_with_sve(<8 x i16> %vec, <8 x i1> %mask) {
+; CHECK-LABEL: test_compress_v8i16_with_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    shl v1.8h, v1.8h, #15
+; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT:    and z1.h, z1.h, #0x1
+; CHECK-NEXT:    cmpne p1.h, p0/z, z1.h, #0
+; CHECK-NEXT:    st1h { z0.h }, p1, [sp]
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+    %out = call <8 x i16> @llvm.experimental.vector.compress(<8 x i16> %vec, <8 x i1> %mask, <8 x i16> undef)
+    ret <8 x i16> %out
+}
+
+define <4 x double> @test_compress_v4f64_with_sve(<4 x double> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compress_v4f64_with_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ushll v3.2d, v2.2s, #0
+; CHECK-NEXT:    ushll2 v4.2d, v2.4s, #0
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    shl v3.2d, v3.2d, #63
+; CHECK-NEXT:    shl v4.2d, v4.2d, #63
+; CHECK-NEXT:    lsr x9, x8, #32
+; CHECK-NEXT:    eor w8, w8, w9
+; CHECK-NEXT:    cmlt v3.2d, v3.2d, #0
+; CHECK-NEXT:    cmlt v4.2d, v4.2d, #0
+; CHECK-NEXT:    and x8, x8, #0x3
+; CHECK-NEXT:    and z3.d, z3.d, #0x1
+; CHECK-NEXT:    and z4.d, z4.d, #0x1
+; CHECK-NEXT:    cmpne p1.d, p0/z, z3.d, #0
+; CHECK-NEXT:    cmpne p0.d, p0/z, z4.d, #0
+; CHECK-NEXT:    st1d { z0.d }, p1, [x10]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x10, x8, lsl #3]
+; CHECK-NEXT:    ldp q0, q1, [sp], #32
+; CHECK-NEXT:    ret
+    %out = call <4 x double> @llvm.experimental.vector.compress(<4 x double> %vec, <4 x i1> %mask, <4 x double> undef)
+    ret <4 x double> %out
+}
+
+define <2 x i16> @test_compress_v2i16_with_sve(<2 x i16> %vec, <2 x i1> %mask) {
+; CHECK-LABEL: test_compress_v2i16_with_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+    %out = call <2 x i16> @llvm.experimental.vector.compress(<2 x i16> %vec, <2 x i1> %mask, <2 x i16> undef)
+    ret <2 x i16> %out
+}
+
+define void @test_combine_compress_store_v4i32_with_sve(<4 x i32> %vec, <4 x i1> %mask, ptr %ptr) {
+; CHECK-LABEL: test_combine_compress_store_v4i32_with_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    and z1.s, z1.s, #0x1
+; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> undef)
+    store <4 x i32> %out, ptr %ptr
+    ret void
+}
+
+define void @test_combine_compress_store_v16i8_with_sve(<16 x i8> %vec, <16 x i1> %mask, ptr %ptr) {
+; CHECK-LABEL: test_combine_compress_store_v16i8_with_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    and z1.b, z1.b, #0x1
+; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
+    %out = call <16 x i8> @llvm.experimental.vector.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> undef)
+    store <16 x i8> %out, ptr %ptr
+    ret void
+}
+
+define <vscale x 4 x i32> @test_compress_nxv4i32_with_passthru(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) {
+; CHECK-LABEL: test_compress_nxv4i32_with_passthru:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.s, p0/z, #1 // =0x1
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    uaddv d2, p1, z2.s
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    index z2.s, #0, #1
+; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    cmphi p1.s, p1/z, z3.s, z2.s
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru)
+    ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @test_compress_nxv4i32_with_zero_passthru(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i32_with_zero_passthru:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> splat(i32 0))
+    ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @test_compress_nxv4i32_with_const_passthru(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i32_with_const_passthru:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    uaddv d1, p1, z1.s
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    index z1.s, #0, #1
+; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    cmphi p1.s, p1/z, z2.s, z1.s
+; CHECK-NEXT:    mov z1.s, #5 // =0x5
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> splat(i32 5))
+    ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 16 x i8> @test_compress_nxv16i8_with_passthru(<vscale x 16 x i8> %vec, <vscale x 16 x i1> %mask, <vscale x 16 x i8> %passthru) {
+; CHECK-LABEL: test_compress_nxv16i8_with_passthru:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    st1b { z1.b }, p1, [sp]
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+    %out = call <vscale x 16 x i8> @llvm.experimental.vector.compress(<vscale x 16 x i8> %vec, <vscale x 16 x i1> %mask, <vscale x 16 x i8> %passthru)
+    ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 16 x i8> @test_compress_nxv16i8_with_const_passthru(<vscale x 16 x i8> %vec, <vscale x 16 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv16i8_with_const_passthru:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    mov z1.b, #5 // =0x5
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    st1b { z1.b }, p1, [sp]
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+    %out = call <vscale x 16 x i8> @llvm.experimental.vector.compress(<vscale x 16 x i8> %vec, <vscale x 16 x i1> %mask, <vscale x 16 x i8> splat(i8 5))
+    ret <vscale x 16 x i8> %out
+}

From fb3759fc2c5c2ea84d0a3e99e11310ba0398af4e Mon Sep 17 00:00:00 2001
From: Lawrence Benson <github at lawben.com>
Date: Mon, 29 Jul 2024 16:02:38 +0200
Subject: [PATCH 5/5] Add combine for VECTOR_COMPRESS + store

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 64 +++++++++++++++++++
 .../CodeGen/AArch64/sve-vector-compress.ll    | 30 ++++-----
 2 files changed, 77 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b76641e9df8bf..6bfeb4d11ec42 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23074,6 +23074,67 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
   return Chain;
 }
 
+static SDValue combineVECTOR_COMPRESSStore(SelectionDAG &DAG,
+                                           StoreSDNode *Store,
+                                           const AArch64Subtarget *Subtarget) {
+  // If the regular store is preceded by a VECTOR_COMPRESS, we can combine them
+  // into a compressing store for scalable vectors in SVE.
+  SDValue VecOp = Store->getValue();
+  EVT VecVT = VecOp.getValueType();
+  if (VecOp.getOpcode() != ISD::VECTOR_COMPRESS || !Subtarget->hasSVE())
+    return SDValue();
+
+  bool IsFixedLength = VecVT.isFixedLengthVector();
+  if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
+    return SDValue();
+
+  SDLoc DL(Store);
+  SDValue Vec = VecOp.getOperand(0);
+  SDValue Mask = VecOp.getOperand(1);
+  SDValue Passthru = VecOp.getOperand(2);
+  EVT MemVT = Store->getMemoryVT();
+  MachineMemOperand *MMO = Store->getMemOperand();
+  SDValue Chain = Store->getChain();
+
+  // We can use the SVE register containing the NEON vector in its lowest bits.
+  if (IsFixedLength) {
+    EVT ElmtVT = VecVT.getVectorElementType();
+    unsigned NumElmts = VecVT.getVectorNumElements();
+    EVT ScalableVecVT =
+        MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), NumElmts);
+    EVT ScalableMaskVT = MVT::getScalableVectorVT(
+        Mask.getValueType().getVectorElementType().getSimpleVT(), NumElmts);
+
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
+                      DAG.getUNDEF(ScalableVecVT), Vec,
+                      DAG.getConstant(0, DL, MVT::i64));
+    Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
+                       DAG.getUNDEF(ScalableMaskVT), Mask,
+                       DAG.getConstant(0, DL, MVT::i64));
+    Mask = DAG.getNode(ISD::TRUNCATE, DL,
+                       ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
+    Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
+                           DAG.getUNDEF(ScalableVecVT), Passthru,
+                           DAG.getConstant(0, DL, MVT::i64));
+
+    MemVT = ScalableVecVT;
+    MMO->setType(LLT::scalable_vector(NumElmts, ElmtVT.getSizeInBits()));
+  }
+
+  // If the passthru is all 0s, we don't need an explicit passthru store.
+  unsigned MinElmts = VecVT.getVectorMinNumElements();
+  if (ISD::isConstantSplatVectorAllZeros(Passthru.getNode()) && (MinElmts == 2 || MinElmts == 4))
+    return SDValue();
+
+  if (!Passthru.isUndef())
+    Chain = DAG.getStore(Chain, DL, Passthru, Store->getBasePtr(), MMO);
+
+  return DAG.getMaskedStore(Chain, DL, Vec, Store->getBasePtr(),
+                            DAG.getUNDEF(MVT::i64), Mask, MemVT, MMO,
+                            ISD::UNINDEXED, Store->isTruncatingStore(),
+                            /*IsCompressing=*/true);
+}
+
 static SDValue performSTORECombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG,
@@ -23118,6 +23179,9 @@ static SDValue performSTORECombine(SDNode *N,
   if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
     return Store;
 
+  if (SDValue Store = combineVECTOR_COMPRESSStore(DAG, ST, Subtarget))
+    return Store;
+
   if (ST->isTruncatingStore()) {
     EVT StoreVT = ST->getMemoryVT();
     if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
index ea9a77c11c53a..cdebb0db47ceb 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
@@ -165,7 +165,7 @@ define <vscale x 8 x i32> @test_compress_large(<vscale x 8 x i32> %vec, <vscale
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    st1w { z0.s }, p1, [sp]
 ; CHECK-NEXT:    cntp x8, p2, p1.s
-; CHECK-NEXT:    and x8, x8, #0xffffffff
+; CHECK-NEXT:    mov w8, w8
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
@@ -196,9 +196,9 @@ define <vscale x 64 x i8> @test_compress_very_large(<vscale x 64 x i8> %vec, <vs
 ; CHECK-NEXT:    cntp x8, p4, p0.b
 ; CHECK-NEXT:    cntp x9, p4, p2.b
 ; CHECK-NEXT:    eor p0.b, p4/z, p0.b, p1.b
-; CHECK-NEXT:    and x8, x8, #0xffffffff
+; CHECK-NEXT:    mov w8, w8
 ; CHECK-NEXT:    cmp x8, x10
-; CHECK-NEXT:    and x9, x9, #0xffffffff
+; CHECK-NEXT:    mov w9, w9
 ; CHECK-NEXT:    csel x8, x8, x10, lo
 ; CHECK-NEXT:    cmp x9, x10
 ; CHECK-NEXT:    st1b { z1.b }, p1, [x11, x8]
@@ -213,7 +213,7 @@ define <vscale x 64 x i8> @test_compress_very_large(<vscale x 64 x i8> %vec, <vs
 ; CHECK-NEXT:    sub x9, x9, #1
 ; CHECK-NEXT:    st1b { z0.b }, p4, [sp, #5, mul vl]
 ; CHECK-NEXT:    ld1b { z0.b }, p4/z, [sp]
-; CHECK-NEXT:    and x8, x8, #0xffffffff
+; CHECK-NEXT:    mov w8, w8
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    st1b { z0.b }, p4, [sp, #4, mul vl]
@@ -302,11 +302,11 @@ define <4 x i32> @test_compress_v4i32_with_sve(<4 x i32> %vec, <4 x i1> %mask) {
 define <1 x i32> @test_compress_v1i32_with_sve(<1 x i32> %vec, <1 x i1> %mask) {
 ; CHECK-LABEL: test_compress_v1i32_with_sve:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    sbfx w8, w0, #0, #1
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    shl v1.2s, v1.2s, #31
-; CHECK-NEXT:    cmlt v1.2s, v1.2s, #0
+; CHECK-NEXT:    mov v1.s[0], w8
 ; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-NEXT:    and z1.d, z1.d, #0x1
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
@@ -421,12 +421,10 @@ define void @test_combine_compress_store_v16i8_with_sve(<16 x i8> %vec, <16 x i1
 define <vscale x 4 x i32> @test_compress_nxv4i32_with_passthru(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) {
 ; CHECK-LABEL: test_compress_nxv4i32_with_passthru:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.s, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    compact z0.s, p0, z0.s
-; CHECK-NEXT:    uaddv d2, p1, z2.s
-; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    cntp x8, p0, p0.s
 ; CHECK-NEXT:    index z2.s, #0, #1
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    mov z3.s, w8
 ; CHECK-NEXT:    cmphi p1.s, p1/z, z3.s, z2.s
 ; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
@@ -447,12 +445,10 @@ define <vscale x 4 x i32> @test_compress_nxv4i32_with_zero_passthru(<vscale x 4
 define <vscale x 4 x i32> @test_compress_nxv4i32_with_const_passthru(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
 ; CHECK-LABEL: test_compress_nxv4i32_with_const_passthru:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    compact z0.s, p0, z0.s
-; CHECK-NEXT:    uaddv d1, p1, z1.s
-; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    cntp x8, p0, p0.s
 ; CHECK-NEXT:    index z1.s, #0, #1
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    cmphi p1.s, p1/z, z2.s, z1.s
 ; CHECK-NEXT:    mov z1.s, #5 // =0x5

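A closing note on PATCH 5/5 above: the store combine relies on the observation that storing a compressed vector is equivalent to (optionally) storing the passthru first and then performing an ISD-level compressing masked store of the original vector, so no compressed value has to be materialized in a register. A scalar sketch of that equivalence, with hypothetical names:

```cpp
#include <array>
#include <cstddef>

// Scalar model of the combine: store(compress(Vec, Mask, Passthru)) becomes
// "store Passthru (if it is not undef), then compressing-store Vec with Mask".
template <typename T, std::size_t N>
void storeCompressed(T *Ptr, const std::array<T, N> &Vec,
                     const std::array<bool, N> &Mask,
                     const std::array<T, N> &Passthru, bool PassthruIsUndef) {
  if (!PassthruIsUndef)
    for (std::size_t I = 0; I < N; ++I) // plain store of the passthru first
      Ptr[I] = Passthru[I];
  std::size_t Idx = 0;
  for (std::size_t I = 0; I < N; ++I) // the compressing masked store
    if (Mask[I])
      Ptr[Idx++] = Vec[I];
}
```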

