[llvm] 177ce19 - [LLVM] Add `llvm.experimental.vector.compress` intrinsic (#92289)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 17 05:24:27 PDT 2024


Author: Lawrence Benson
Date: 2024-07-17T14:24:24+02:00
New Revision: 177ce1900f0de05337f744edd3f4e454f7a93b06

URL: https://github.com/llvm/llvm-project/commit/177ce1900f0de05337f744edd3f4e454f7a93b06
DIFF: https://github.com/llvm/llvm-project/commit/177ce1900f0de05337f744edd3f4e454f7a93b06.diff

LOG: [LLVM] Add `llvm.experimental.vector.compress` intrinsic (#92289)

This PR adds a new vector intrinsic `@llvm.experimental.vector.compress`
to "compress" data within a vector based on a selection mask, i.e., it
moves all selected values (i.e., where `mask[i] == 1`) to consecutive
lanes in the result vector. A `passthru` vector can be provided, from
which remaining lanes are filled.

The main reason for this is that the existing
`@llvm.masked.compressstore` has very strong constraints in that it can
only write values that were selected, resulting in guard branches for
all targets except AVX-512 (and even there the AMD implementation is
_very_ slow). More instruction sets support "compress" logic, but only
within registers. So to store the values, an additional store is needed.
But this combination is likely significantly faster on many target as it
avoids branches.

In follow up PRs, my plan is to add target-specific lowerings for x86,
SVE, and possibly RISCV. I also want to combine this with a store
instruction, as this is probably a common case and we can avoid some
memory writes in that case.

See [discussion in
forum](https://discourse.llvm.org/t/new-intrinsic-for-masked-vector-compress-without-store/78663)
for initial discussion on the design.

Added: 
    llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir
    llvm/test/CodeGen/AArch64/vector-compress.ll

Modified: 
    llvm/docs/GlobalISel/GenericOpcode.rst
    llvm/docs/LangRef.rst
    llvm/docs/ReleaseNotes.rst
    llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
    llvm/include/llvm/CodeGen/ISDOpcodes.h
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/include/llvm/IR/Intrinsics.td
    llvm/include/llvm/Support/TargetOpcodes.def
    llvm/include/llvm/Target/GenericOpcodes.td
    llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
    llvm/include/llvm/Target/TargetSelectionDAG.td
    llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
    llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
    llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
    llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
    llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/lib/CodeGen/TargetLoweringBase.cpp
    llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
    llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir

Removed: 
    


################################################################################
diff  --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst
index b05394aeee003..18a53a4815722 100644
--- a/llvm/docs/GlobalISel/GenericOpcode.rst
+++ b/llvm/docs/GlobalISel/GenericOpcode.rst
@@ -726,6 +726,13 @@ The type of the operand must be equal to or larger than the vector element
 type. If the operand is larger than the vector element type, the scalar is
 implicitly truncated to the vector element type.
 
+G_VECTOR_COMPRESS
+^^^^^^^^^^^^^^^^^
+
+Given an input vector, a mask vector, and a passthru vector, continuously place
+all selected (i.e., where mask[i] = true) input lanes in an output vector. All
+remaining lanes in the output are taken from passthru, which may be undef.
+
 Vector Reduction Operations
 ---------------------------
 

diff  --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index f2ff1f0f5852c..cd86156ec816f 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19525,6 +19525,93 @@ the follow sequence of operations:
 
 The ``mask`` operand will apply to at least the gather and scatter operations.
 
+
+.. _int_vector_compress:
+
+'``llvm.experimental.vector.compress.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+LLVM provides an intrinsic for compressing data within a vector based on a selection mask.
+Semantically, this is similar to :ref:`llvm.masked.compressstore <int_compressstore>` but with weaker assumptions
+and without storing the results to memory, i.e., the data remains in the vector.
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. A number of scalar values of integer, floating point or pointer data type are collected
+from an input vector and placed adjacently within the result vector. A mask defines which elements to collect from the vector.
+The remaining lanes are filled with values from ``passthru``.
+
+:: code-block:: llvm
+
+      declare <8 x i32> @llvm.experimental.vector.compress.v8i32(<8 x i32> <value>, <8 x i1> <mask>, <8 x i32> <passthru>)
+      declare <16 x float> @llvm.experimental.vector.compress.v16f32(<16 x float> <value>, <16 x i1> <mask>, <16 x float> undef)
+
+Overview:
+"""""""""
+
+Selects elements from input vector ``value`` according to the ``mask``.
+All selected elements are written into adjacent lanes in the result vector,
+from lower to higher.
+The mask holds an entry for each vector lane, and is used to select elements
+to be kept.
+If a ``passthru`` vector is given, all remaining lanes are filled with the
+corresponding lane's value from ``passthru``.
+The main 
diff erence to :ref:`llvm.masked.compressstore <int_compressstore>` is
+that the we do not need to guard against memory access for unselected lanes.
+This allows for branchless code and better optimization for all targets that
+do not support or have inefficient
+instructions of the explicit semantics of
+:ref:`llvm.masked.compressstore <int_compressstore>` but still have some form
+of compress operations.
+The result vector can be written with a similar effect, as all the selected
+values are at the lower positions of the vector, but without requiring
+branches to avoid writes where the mask is ``false``.
+
+Arguments:
+""""""""""
+
+The first operand is the input vector, from which elements are selected.
+The second operand is the mask, a vector of boolean values.
+The third operand is the passthru vector, from which elements are filled
+into remaining lanes.
+The mask and the input vector must have the same number of vector elements.
+The input and passthru vectors must have the same type.
+
+Semantics:
+""""""""""
+
+The ``llvm.experimental.vector.compress`` intrinsic compresses data within a vector.
+It collects elements from possibly non-adjacent lanes of a vector and places
+them contiguously in the result vector based on a selection mask, filling the
+remaining lanes with values from ``passthru``.
+This intrinsic performs the logic of the following C++ example.
+All values in ``out`` after the last selected one are undefined if
+``passthru`` is undefined.
+If all entries in the ``mask`` are 0, the ``out`` vector is ``passthru``.
+If any element of the mask is poison, all elements of the result are poison.
+Otherwise, if any element of the mask is undef, all elements of the result are undef.
+If ``passthru`` is undefined, the number of valid lanes is equal to the number
+of ``true`` entries in the mask, i.e., all lanes >= number-of-selected-values
+are undefined.
+
+.. code-block:: cpp
+
+    // Consecutively place selected values in a vector.
+    using VecT __attribute__((vector_size(N))) = int;
+    VecT compress(VecT vec, VecT mask, VecT passthru) {
+      VecT out;
+      int idx = 0;
+      for (int i = 0; i < N / sizeof(int); ++i) {
+        out[idx] = vec[i];
+        idx += static_cast<bool>(mask[i]);
+      }
+      for (; idx < N / sizeof(int); ++idx) {
+        out[idx] = passthru[idx];
+      }
+      return out;
+    }
+
+
 Matrix Intrinsics
 -----------------
 

diff  --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index fa60049f67828..827a0fd3606ec 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -79,6 +79,7 @@ Changes to the LLVM IR
   * ``llvm.instprof.mcdc.tvbitmap.update``: 3rd argument has been
     removed. The next argument has been changed from byte index to bit
     index.
+* Added ``llvm.experimental.vector.compress`` intrinsic.
 
 Changes to LLVM infrastructure
 ------------------------------

diff  --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 6e2ab8ce40338..b17bc9aa2a44e 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -412,6 +412,7 @@ class LegalizerHelper {
   LegalizeResult lowerUnmergeValues(MachineInstr &MI);
   LegalizeResult lowerExtractInsertVectorElt(MachineInstr &MI);
   LegalizeResult lowerShuffleVector(MachineInstr &MI);
+  LegalizeResult lowerVECTOR_COMPRESS(MachineInstr &MI);
   Register getDynStackAllocTargetPtr(Register SPReg, Register AllocSize,
                                      Align Alignment, LLT PtrTy);
   LegalizeResult lowerDynStackAlloc(MachineInstr &MI);

diff  --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index e6b10209b4767..daceaf98583bd 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -659,6 +659,14 @@ enum NodeType {
   /// non-constant operands.
   STEP_VECTOR,
 
+  /// VECTOR_COMPRESS(Vec, Mask, Passthru)
+  /// consecutively place vector elements based on mask
+  /// e.g., vec = {A, B, C, D} and mask = {1, 0, 1, 0}
+  ///         --> {A, C, ?, ?} where ? is undefined
+  /// If passthru is defined, ?s are replaced with elements from passthru.
+  /// If passthru is undef, ?s remain undefined.
+  VECTOR_COMPRESS,
+
   /// MULHU/MULHS - Multiply high - Multiply two integers of type iN,
   /// producing an unsigned/signed value of type i[2*N], then return the top
   /// part.

diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index ef66b82d6f414..d4a2166bf768e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5496,6 +5496,10 @@ class TargetLowering : public TargetLoweringBase {
   /// method accepts vectors as its arguments.
   SDValue expandVectorSplice(SDNode *Node, SelectionDAG &DAG) const;
 
+  /// Expand a vector VECTOR_COMPRESS into a sequence of extract element, store
+  /// temporarily, advance store position, before re-loading the final vector.
+  SDValue expandVECTOR_COMPRESS(SDNode *Node, SelectionDAG &DAG) const;
+
   /// Legalize a SETCC or VP_SETCC with given LHS and RHS and condition code CC
   /// on the current target. A VP_SETCC will additionally be given a Mask
   /// and/or EVL not equal to SDValue().

diff  --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index fc39122aa1be0..b4e758136b39f 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2398,6 +2398,11 @@ def int_masked_compressstore:
             [IntrWriteMem, IntrArgMemOnly, IntrWillReturn,
              NoCapture<ArgIndex<1>>]>;
 
+def int_experimental_vector_compress:
+    DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+              [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>],
+              [IntrNoMem, IntrWillReturn]>;
+
 // Test whether a pointer is associated with a type metadata identifier.
 def int_type_test : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty],
                               [IntrNoMem, IntrWillReturn, IntrSpeculatable]>;

diff  --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index e7f40e87ed24a..a6672f87af977 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -754,6 +754,9 @@ HANDLE_TARGET_OPCODE(G_SHUFFLE_VECTOR)
 /// Generic splatvector.
 HANDLE_TARGET_OPCODE(G_SPLAT_VECTOR)
 
+/// Generic masked compress.
+HANDLE_TARGET_OPCODE(G_VECTOR_COMPRESS)
+
 /// Generic count trailing zeroes.
 HANDLE_TARGET_OPCODE(G_CTTZ)
 

diff  --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index e1710ff2d8abf..7501048dfdd78 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -1548,6 +1548,13 @@ def G_SPLAT_VECTOR: GenericInstruction {
   let hasSideEffects = false;
 }
 
+// Generic masked compress.
+def G_VECTOR_COMPRESS: GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$vec, type1:$mask, type0:$passthru);
+  let hasSideEffects = false;
+}
+
 //------------------------------------------------------------------------------
 // Vector reductions
 //------------------------------------------------------------------------------

diff  --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
index fbe551e1be911..e9dbdef9fe9e7 100644
--- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
+++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
@@ -193,6 +193,7 @@ def : GINodeEquiv<G_VECREDUCE_UMAX, vecreduce_umax>;
 def : GINodeEquiv<G_VECREDUCE_SMIN, vecreduce_smin>;
 def : GINodeEquiv<G_VECREDUCE_SMAX, vecreduce_smax>;
 def : GINodeEquiv<G_VECREDUCE_ADD, vecreduce_add>;
+def : GINodeEquiv<G_VECTOR_COMPRESS, vector_compress>;
 
 def : GINodeEquiv<G_STRICT_FADD, strict_fadd>;
 def : GINodeEquiv<G_STRICT_FSUB, strict_fsub>;

diff  --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 133c9b113e51b..46044aab79a83 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -266,6 +266,12 @@ def SDTMaskedScatter : SDTypeProfile<0, 4, [
   SDTCisSameNumEltsAs<0, 1>, SDTCisSameNumEltsAs<0, 3>
 ]>;
 
+def SDTVectorCompress : SDTypeProfile<1, 3, [
+  SDTCisVec<0>, SDTCisSameAs<0, 1>,
+  SDTCisVec<2>, SDTCisSameNumEltsAs<1, 2>,
+  SDTCisSameAs<1, 3>
+]>;
+
 def SDTVecShuffle : SDTypeProfile<1, 2, [
   SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
 ]>;
@@ -757,6 +763,8 @@ def masked_gather : SDNode<"ISD::MGATHER", SDTMaskedGather,
 def masked_scatter : SDNode<"ISD::MSCATTER", SDTMaskedScatter,
                             [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
+def vector_compress : SDNode<"ISD::VECTOR_COMPRESS", SDTVectorCompress>;
+
 // Do not use ld, st directly. Use load, extload, sextload, zextload, store,
 // and truncst (see below).
 def ld         : SDNode<"ISD::LOAD"       , SDTLoad,

diff  --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 97be19825fcf3..72dff12423ced 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1994,6 +1994,8 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) {
       return TargetOpcode::G_VECREDUCE_UMAX;
     case Intrinsic::vector_reduce_umin:
       return TargetOpcode::G_VECREDUCE_UMIN;
+    case Intrinsic::experimental_vector_compress:
+      return TargetOpcode::G_VECTOR_COMPRESS;
     case Intrinsic::lround:
       return TargetOpcode::G_LROUND;
     case Intrinsic::llround:

diff  --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index b58c96a866883..bcc30390bc82e 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4034,6 +4034,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
     return lowerExtractInsertVectorElt(MI);
   case G_SHUFFLE_VECTOR:
     return lowerShuffleVector(MI);
+  case G_VECTOR_COMPRESS:
+    return lowerVECTOR_COMPRESS(MI);
   case G_DYN_STACKALLOC:
     return lowerDynStackAlloc(MI);
   case G_STACKSAVE:
@@ -7593,6 +7595,93 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
   return Legalized;
 }
 
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerVECTOR_COMPRESS(llvm::MachineInstr &MI) {
+  auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy, Passthru, PassthruTy] =
+      MI.getFirst4RegLLTs();
+
+  if (VecTy.isScalableVector())
+    report_fatal_error("Cannot expand masked_compress for scalable vectors.");
+
+  Align VecAlign = getStackTemporaryAlignment(VecTy);
+  MachinePointerInfo PtrInfo;
+  Register StackPtr =
+      createStackTemporary(TypeSize::getFixed(VecTy.getSizeInBytes()), VecAlign,
+                           PtrInfo)
+          .getReg(0);
+  MachinePointerInfo ValPtrInfo =
+      MachinePointerInfo::getUnknownStack(*MI.getMF());
+
+  LLT IdxTy = LLT::scalar(32);
+  LLT ValTy = VecTy.getElementType();
+  Align ValAlign = getStackTemporaryAlignment(ValTy);
+
+  auto OutPos = MIRBuilder.buildConstant(IdxTy, 0);
+
+  bool HasPassthru =
+      MRI.getVRegDef(Passthru)->getOpcode() != TargetOpcode::G_IMPLICIT_DEF;
+
+  if (HasPassthru)
+    MIRBuilder.buildStore(Passthru, StackPtr, PtrInfo, VecAlign);
+
+  Register LastWriteVal;
+  std::optional<APInt> PassthruSplatVal =
+      isConstantOrConstantSplatVector(*MRI.getVRegDef(Passthru), MRI);
+
+  if (PassthruSplatVal.has_value()) {
+    LastWriteVal =
+        MIRBuilder.buildConstant(ValTy, PassthruSplatVal.value()).getReg(0);
+  } else if (HasPassthru) {
+    auto Popcount = MIRBuilder.buildZExt(MaskTy.changeElementSize(32), Mask);
+    Popcount = MIRBuilder.buildInstr(TargetOpcode::G_VECREDUCE_ADD,
+                                     {LLT::scalar(32)}, {Popcount});
+
+    Register LastElmtPtr =
+        getVectorElementPointer(StackPtr, VecTy, Popcount.getReg(0));
+    LastWriteVal =
+        MIRBuilder.buildLoad(ValTy, LastElmtPtr, ValPtrInfo, ValAlign)
+            .getReg(0);
+  }
+
+  unsigned NumElmts = VecTy.getNumElements();
+  for (unsigned I = 0; I < NumElmts; ++I) {
+    auto Idx = MIRBuilder.buildConstant(IdxTy, I);
+    auto Val = MIRBuilder.buildExtractVectorElement(ValTy, Vec, Idx);
+    Register ElmtPtr =
+        getVectorElementPointer(StackPtr, VecTy, OutPos.getReg(0));
+    MIRBuilder.buildStore(Val, ElmtPtr, ValPtrInfo, ValAlign);
+
+    LLT MaskITy = MaskTy.getElementType();
+    auto MaskI = MIRBuilder.buildExtractVectorElement(MaskITy, Mask, Idx);
+    if (MaskITy.getSizeInBits() > 1)
+      MaskI = MIRBuilder.buildTrunc(LLT::scalar(1), MaskI);
+
+    MaskI = MIRBuilder.buildZExt(IdxTy, MaskI);
+    OutPos = MIRBuilder.buildAdd(IdxTy, OutPos, MaskI);
+
+    if (HasPassthru && I == NumElmts - 1) {
+      auto EndOfVector =
+          MIRBuilder.buildConstant(IdxTy, VecTy.getNumElements() - 1);
+      auto AllLanesSelected = MIRBuilder.buildICmp(
+          CmpInst::ICMP_UGT, LLT::scalar(1), OutPos, EndOfVector);
+      OutPos = MIRBuilder.buildInstr(TargetOpcode::G_UMIN, {IdxTy},
+                                     {OutPos, EndOfVector});
+      ElmtPtr = getVectorElementPointer(StackPtr, VecTy, OutPos.getReg(0));
+
+      LastWriteVal =
+          MIRBuilder.buildSelect(ValTy, AllLanesSelected, Val, LastWriteVal)
+              .getReg(0);
+      MIRBuilder.buildStore(LastWriteVal, ElmtPtr, ValPtrInfo, ValAlign);
+    }
+  }
+
+  // TODO: Use StackPtr's FrameIndex alignment.
+  MIRBuilder.buildLoad(Dst, StackPtr, PtrInfo, VecAlign);
+
+  MI.eraseFromParent();
+  return Legalized;
+}
+
 Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
                                                     Register AllocSize,
                                                     Align Alignment,

diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 302ad128f4f53..30203f9119af7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -537,6 +537,7 @@ namespace {
     SDValue visitVECTOR_SHUFFLE(SDNode *N);
     SDValue visitSCALAR_TO_VECTOR(SDNode *N);
     SDValue visitINSERT_SUBVECTOR(SDNode *N);
+    SDValue visitVECTOR_COMPRESS(SDNode *N);
     SDValue visitMLOAD(SDNode *N);
     SDValue visitMSTORE(SDNode *N);
     SDValue visitMGATHER(SDNode *N);
@@ -1955,6 +1956,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::MLOAD:              return visitMLOAD(N);
   case ISD::MSCATTER:           return visitMSCATTER(N);
   case ISD::MSTORE:             return visitMSTORE(N);
+  case ISD::VECTOR_COMPRESS:    return visitVECTOR_COMPRESS(N);
   case ISD::LIFETIME_END:       return visitLIFETIME_END(N);
   case ISD::FP_TO_FP16:         return visitFP_TO_FP16(N);
   case ISD::FP16_TO_FP:         return visitFP16_TO_FP(N);
@@ -10006,7 +10008,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
       return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) &&
              LHSC.getZExtValue() <= RHSC.getZExtValue();
     };
-    
+
     // fold (shl (sr[la] exact X,  C1), C2) -> (shl    X, (C2-C1)) if C1 <= C2
     // fold (shl (sr[la] exact X,  C1), C2) -> (sr[la] X, (C2-C1)) if C1 >= C2
     if (N0->getFlags().hasExact()) {
@@ -12041,6 +12043,55 @@ SDValue DAGCombiner::visitVP_STRIDED_STORE(SDNode *N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::visitVECTOR_COMPRESS(SDNode *N) {
+  SDLoc DL(N);
+  SDValue Vec = N->getOperand(0);
+  SDValue Mask = N->getOperand(1);
+  SDValue Passthru = N->getOperand(2);
+  EVT VecVT = Vec.getValueType();
+
+  bool HasPassthru = !Passthru.isUndef();
+
+  APInt SplatVal;
+  if (ISD::isConstantSplatVector(Mask.getNode(), SplatVal))
+    return TLI.isConstTrueVal(Mask) ? Vec : Passthru;
+
+  if (Vec.isUndef() || Mask.isUndef())
+    return Passthru;
+
+  // No need for potentially expensive compress if the mask is constant.
+  if (ISD::isBuildVectorOfConstantSDNodes(Mask.getNode())) {
+    SmallVector<SDValue, 16> Ops;
+    EVT ScalarVT = VecVT.getVectorElementType();
+    unsigned NumSelected = 0;
+    unsigned NumElmts = VecVT.getVectorNumElements();
+    for (unsigned I = 0; I < NumElmts; ++I) {
+      SDValue MaskI = Mask.getOperand(I);
+      // We treat undef mask entries as "false".
+      if (MaskI.isUndef())
+        continue;
+
+      if (TLI.isConstTrueVal(MaskI)) {
+        SDValue VecI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec,
+                                   DAG.getVectorIdxConstant(I, DL));
+        Ops.push_back(VecI);
+        NumSelected++;
+      }
+    }
+    for (unsigned Rest = NumSelected; Rest < NumElmts; ++Rest) {
+      SDValue Val =
+          HasPassthru
+              ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Passthru,
+                            DAG.getVectorIdxConstant(Rest, DL))
+              : DAG.getUNDEF(ScalarVT);
+      Ops.push_back(Val);
+    }
+    return DAG.getBuildVector(VecVT, DL, Ops);
+  }
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitVPGATHER(SDNode *N) {
   VPGatherSDNode *MGT = cast<VPGatherSDNode>(N);
   SDValue Mask = MGT->getMask();

diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 08321c3842450..af77b0070df0a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -87,6 +87,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
     break;
   case ISD::MGATHER:     Res = PromoteIntRes_MGATHER(cast<MaskedGatherSDNode>(N));
     break;
+  case ISD::VECTOR_COMPRESS:
+    Res = PromoteIntRes_VECTOR_COMPRESS(N);
+    break;
   case ISD::SELECT:
   case ISD::VSELECT:
   case ISD::VP_SELECT:
@@ -995,6 +998,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) {
   return Res;
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_COMPRESS(SDNode *N) {
+  SDValue Vec = GetPromotedInteger(N->getOperand(0));
+  SDValue Passthru = GetPromotedInteger(N->getOperand(2));
+  return DAG.getNode(ISD::VECTOR_COMPRESS, SDLoc(N), Vec.getValueType(), Vec,
+                     N->getOperand(1), Passthru);
+}
+
 /// Promote the overflow flag of an overflowing arithmetic node.
 SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
   // Change the return type of the boolean result while obeying
@@ -1944,6 +1954,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
                                                  OpNo); break;
   case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast<MaskedScatterSDNode>(N),
                                                   OpNo); break;
+  case ISD::VECTOR_COMPRESS:
+    Res = PromoteIntOp_VECTOR_COMPRESS(N, OpNo);
+    break;
   case ISD::VP_TRUNCATE:
   case ISD::TRUNCATE:     Res = PromoteIntOp_TRUNCATE(N); break;
   case ISD::BF16_TO_FP:
@@ -2442,6 +2455,16 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N,
                               N->getIndexType(), TruncateStore);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_COMPRESS(SDNode *N,
+                                                       unsigned OpNo) {
+  assert(OpNo == 1 && "Can only promote VECTOR_COMPRESS mask.");
+  SDValue Vec = N->getOperand(0);
+  EVT VT = Vec.getValueType();
+  SDValue Passthru = N->getOperand(2);
+  SDValue Mask = PromoteTargetBoolean(N->getOperand(1), VT);
+  return DAG.getNode(ISD::VECTOR_COMPRESS, SDLoc(N), VT, Vec, Mask, Passthru);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
   SDValue Op = GetPromotedInteger(N->getOperand(0));
   if (N->getOpcode() == ISD::VP_TRUNCATE)

diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index a5c92ee463690..d4e61c8588901 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -340,6 +340,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteIntRes_LOAD(LoadSDNode *N);
   SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N);
   SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N);
+  SDValue PromoteIntRes_VECTOR_COMPRESS(SDNode *N);
   SDValue PromoteIntRes_Overflow(SDNode *N);
   SDValue PromoteIntRes_FFREXP(SDNode *N);
   SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo);
@@ -412,6 +413,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_VECTOR_COMPRESS(SDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_FRAMERETURNADDR(SDNode *N);
   SDValue PromoteIntOp_FIX(SDNode *N);
   SDValue PromoteIntOp_ExpOp(SDNode *N);
@@ -927,6 +929,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_Gather(MemSDNode *VPGT, SDValue &Lo, SDValue &Hi,
                           bool SplitSETCC = false);
+  void SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_SPLAT(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_STEP_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -1019,6 +1022,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue WidenVecRes_LOAD(SDNode* N);
   SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N);
   SDValue WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N);
+  SDValue WidenVecRes_VECTOR_COMPRESS(SDNode *N);
   SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N);
   SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N);
   SDValue WidenVecRes_VP_GATHER(VPGatherSDNode* N);

diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 307d1fc920d48..57843f0959ac2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -455,6 +455,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FP_TO_SINT_SAT:
   case ISD::FP_TO_UINT_SAT:
   case ISD::MGATHER:
+  case ISD::VECTOR_COMPRESS:
   case ISD::SCMP:
   case ISD::UCMP:
     Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
@@ -1123,6 +1124,9 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
       return;
 
     break;
+  case ISD::VECTOR_COMPRESS:
+    Results.push_back(TLI.expandVECTOR_COMPRESS(Node, DAG));
+    return;
   }
 
   SDValue Unrolled = DAG.UnrollVectorOp(Node);

diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index ed629485c0c2b..92b62ccdc2755 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1110,6 +1110,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::VP_GATHER:
     SplitVecRes_Gather(cast<MemSDNode>(N), Lo, Hi, /*SplitSETCC*/ true);
     break;
+  case ISD::VECTOR_COMPRESS:
+    SplitVecRes_VECTOR_COMPRESS(N, Lo, Hi);
+    break;
   case ISD::SETCC:
   case ISD::VP_SETCC:
     SplitVecRes_SETCC(N, Lo, Hi);
@@ -2401,6 +2404,17 @@ void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo,
   ReplaceValueWith(SDValue(N, 1), Ch);
 }
 
+void DAGTypeLegalizer::SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo,
+                                                   SDValue &Hi) {
+  // This is not "trivial", as there is a dependency between the two subvectors.
+  // Depending on the number of 1s in the mask, the elements from the Hi vector
+  // need to be moved to the Lo vector. So we just perform this as one "big"
+  // operation and then extract the Lo and Hi vectors from that. This gets rid
+  // of VECTOR_COMPRESS and all other operands can be legalized later.
+  SDValue Compressed = TLI.expandVECTOR_COMPRESS(N, DAG);
+  std::tie(Lo, Hi) = DAG.SplitVector(Compressed, SDLoc(N));
+}
+
 void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
   assert(N->getValueType(0).isVector() &&
          N->getOperand(0).getValueType().isVector() &&
@@ -4333,6 +4347,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
     Res = WidenVecRes_VP_STRIDED_LOAD(cast<VPStridedLoadSDNode>(N));
     break;
+  case ISD::VECTOR_COMPRESS:
+    Res = WidenVecRes_VECTOR_COMPRESS(N);
+    break;
   case ISD::MLOAD:
     Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N));
     break;
@@ -5759,6 +5776,23 @@ SDValue DAGTypeLegalizer::WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N) {
   return Res;
 }
 
+SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_COMPRESS(SDNode *N) {
+  SDValue Vec = N->getOperand(0);
+  SDValue Mask = N->getOperand(1);
+  SDValue Passthru = N->getOperand(2);
+  EVT WideVecVT =
+      TLI.getTypeToTransformTo(*DAG.getContext(), Vec.getValueType());
+  EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
+                                    Mask.getValueType().getVectorElementType(),
+                                    WideVecVT.getVectorNumElements());
+
+  SDValue WideVec = ModifyToType(Vec, WideVecVT);
+  SDValue WideMask = ModifyToType(Mask, WideMaskVT, /*FillWithZeroes=*/true);
+  SDValue WidePassthru = ModifyToType(Passthru, WideVecVT);
+  return DAG.getNode(ISD::VECTOR_COMPRESS, SDLoc(N), WideVecVT, WideVec,
+                     WideMask, WidePassthru);
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
 
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0));

diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 94349ec97693f..2cd0e209f1c07 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7556,6 +7556,22 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     if (N1.getValueType() == VT)
       return N1;
     break;
+  case ISD::VECTOR_COMPRESS: {
+    EVT VecVT = N1.getValueType();
+    [[maybe_unused]] EVT MaskVT = N2.getValueType();
+    [[maybe_unused]] EVT PassthruVT = N3.getValueType();
+    assert(VT == VecVT && "Vector and result type don't match.");
+    assert(VecVT.isVector() && MaskVT.isVector() && PassthruVT.isVector() &&
+           "All inputs must be vectors.");
+    assert(VecVT == PassthruVT && "Vector and passthru types don't match.");
+    assert(VecVT.getVectorElementCount() == MaskVT.getVectorElementCount() &&
+           "Vector and mask must have same number of elements.");
+
+    if (N1.isUndef() || N2.isUndef())
+      return N3;
+
+    break;
+  }
   }
 
   // Memoize node if it doesn't produce a glue result.

diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index d5cbb733a408d..bbd4c3521d908 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8115,6 +8115,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   case Intrinsic::vector_deinterleave2:
     visitVectorDeinterleave(I);
     return;
+  case Intrinsic::experimental_vector_compress:
+    setValue(&I, DAG.getNode(ISD::VECTOR_COMPRESS, sdl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0)),
+                             getValue(I.getArgOperand(1)),
+                             getValue(I.getArgOperand(2)), Flags));
+    return;
   case Intrinsic::experimental_convergence_anchor:
   case Intrinsic::experimental_convergence_entry:
   case Intrinsic::experimental_convergence_loop:

diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index cc8de3a217f82..16fc52caebb75 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -434,6 +434,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::MSTORE:                     return "masked_store";
   case ISD::MGATHER:                    return "masked_gather";
   case ISD::MSCATTER:                   return "masked_scatter";
+  case ISD::VECTOR_COMPRESS:            return "vector_compress";
   case ISD::VAARG:                      return "vaarg";
   case ISD::VACOPY:                     return "vacopy";
   case ISD::VAEND:                      return "vaend";

diff  --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1433c8821248d..adf14bd007356 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11372,6 +11372,108 @@ SDValue TargetLowering::expandVectorSplice(SDNode *Node,
                      MachinePointerInfo::getUnknownStack(MF));
 }
 
+SDValue TargetLowering::expandVECTOR_COMPRESS(SDNode *Node,
+                                              SelectionDAG &DAG) const {
+  SDLoc DL(Node);
+  SDValue Vec = Node->getOperand(0);
+  SDValue Mask = Node->getOperand(1);
+  SDValue Passthru = Node->getOperand(2);
+
+  EVT VecVT = Vec.getValueType();
+  EVT ScalarVT = VecVT.getScalarType();
+  EVT MaskVT = Mask.getValueType();
+  EVT MaskScalarVT = MaskVT.getScalarType();
+
+  // Needs to be handled by targets that have scalable vector types.
+  if (VecVT.isScalableVector())
+    report_fatal_error("Cannot expand masked_compress for scalable vectors.");
+
+  SDValue StackPtr = DAG.CreateStackTemporary(
+      VecVT.getStoreSize(), DAG.getReducedAlign(VecVT, /*UseABI=*/false));
+  int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  MachinePointerInfo PtrInfo =
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+
+  MVT PositionVT = getVectorIdxTy(DAG.getDataLayout());
+  SDValue Chain = DAG.getEntryNode();
+  SDValue OutPos = DAG.getConstant(0, DL, PositionVT);
+
+  bool HasPassthru = !Passthru.isUndef();
+
+  // If we have a passthru vector, store it on the stack, overwrite the matching
+  // positions and then re-write the last element that was potentially
+  // overwritten even though mask[i] = false.
+  if (HasPassthru)
+    Chain = DAG.getStore(Chain, DL, Passthru, StackPtr, PtrInfo);
+
+  SDValue LastWriteVal;
+  APInt PassthruSplatVal;
+  bool IsSplatPassthru =
+      ISD::isConstantSplatVector(Passthru.getNode(), PassthruSplatVal);
+
+  if (IsSplatPassthru) {
+    // As we do not know which position we wrote to last, we cannot simply
+    // access that index from the passthru vector. So we first check if passthru
+    // is a splat vector, to use any element ...
+    LastWriteVal = DAG.getConstant(PassthruSplatVal, DL, ScalarVT);
+  } else if (HasPassthru) {
+    // ... if it is not a splat vector, we need to get the passthru value at
+    // position = popcount(mask) and re-load it from the stack before it is
+    // overwritten in the loop below.
+    SDValue Popcount = DAG.getNode(
+        ISD::TRUNCATE, DL, MaskVT.changeVectorElementType(MVT::i1), Mask);
+    Popcount = DAG.getNode(ISD::ZERO_EXTEND, DL,
+                           MaskVT.changeVectorElementType(ScalarVT), Popcount);
+    Popcount = DAG.getNode(ISD::VECREDUCE_ADD, DL, ScalarVT, Popcount);
+    SDValue LastElmtPtr =
+        getVectorElementPointer(DAG, StackPtr, VecVT, Popcount);
+    LastWriteVal = DAG.getLoad(
+        ScalarVT, DL, Chain, LastElmtPtr,
+        MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
+    Chain = LastWriteVal.getValue(1);
+  }
+
+  unsigned NumElms = VecVT.getVectorNumElements();
+  for (unsigned I = 0; I < NumElms; I++) {
+    SDValue Idx = DAG.getVectorIdxConstant(I, DL);
+
+    SDValue ValI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, Idx);
+    SDValue OutPtr = getVectorElementPointer(DAG, StackPtr, VecVT, OutPos);
+    Chain = DAG.getStore(
+        Chain, DL, ValI, OutPtr,
+        MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
+
+    // Get the mask value and add it to the current output position. This
+    // either increments by 1 if MaskI is true or adds 0 otherwise.
+    // Freeze in case we have poison/undef mask entries.
+    SDValue MaskI = DAG.getFreeze(
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskScalarVT, Mask, Idx));
+    MaskI = DAG.getFreeze(MaskI);
+    MaskI = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, MaskI);
+    MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, PositionVT, MaskI);
+    OutPos = DAG.getNode(ISD::ADD, DL, PositionVT, OutPos, MaskI);
+
+    if (HasPassthru && I == NumElms - 1) {
+      SDValue EndOfVector =
+          DAG.getConstant(VecVT.getVectorNumElements() - 1, DL, PositionVT);
+      SDValue AllLanesSelected =
+          DAG.getSetCC(DL, MVT::i1, OutPos, EndOfVector, ISD::CondCode::SETUGT);
+      OutPos = DAG.getNode(ISD::UMIN, DL, PositionVT, OutPos, EndOfVector);
+      OutPtr = getVectorElementPointer(DAG, StackPtr, VecVT, OutPos);
+
+      // Re-write the last ValI if all lanes were selected. Otherwise,
+      // overwrite the last write it with the passthru value.
+      LastWriteVal =
+          DAG.getSelect(DL, ScalarVT, AllLanesSelected, ValI, LastWriteVal);
+      Chain = DAG.getStore(
+          Chain, DL, LastWriteVal, OutPtr,
+          MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
+    }
+  }
+
+  return DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo);
+}
+
 bool TargetLowering::LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT,
                                            SDValue &LHS, SDValue &RHS,
                                            SDValue &CC, SDValue Mask,

diff  --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index bf031c00a2449..8040f1eeae810 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -758,6 +758,9 @@ void TargetLoweringBase::initActions() {
     // Named vector shuffles default to expand.
     setOperationAction(ISD::VECTOR_SPLICE, VT, Expand);
 
+    // Only some target support this vector operation. Most need to expand it.
+    setOperationAction(ISD::VECTOR_COMPRESS, VT, Expand);
+
     // VP operations default to expand.
 #define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...)                                   \
     setOperationAction(ISD::SDOPC, VT, Expand);

diff  --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index d42d5511a8242..a73c971020bd8 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1202,6 +1202,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .scalarize(1)
       .lower();
 
+  // TODO: Update this to correct handling when adding AArch64/SVE support.
+  getActionDefinitionsBuilder(G_VECTOR_COMPRESS).lower();
+
   getActionDefinitionsBuilder({G_FSHL, G_FSHR})
       .customFor({{s32, s32}, {s32, s64}, {s64, s64}})
       .lower();

diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir
new file mode 100644
index 0000000000000..cc7577473b548
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir
@@ -0,0 +1,156 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
+---
+name:            test_vector_compress_v4s32
+body:             |
+  bb.0:
+    liveins: $q0, $d1
+
+    ; CHECK-LABEL: name: test_vector_compress_v4s32
+    ; CHECK: liveins: $q0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $d1
+    ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C1]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C1]], [[C2]]
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s64)
+    ; CHECK-NEXT: G_STORE [[EVEC]](s32), [[PTR_ADD]](p0) :: (store (s32))
+    ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C1]](s64)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s16)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]]
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[AND]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C4]](s64)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C5]]
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[AND1]](s32)
+    ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C2]]
+    ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL1]](s64)
+    ; CHECK-NEXT: G_STORE [[EVEC2]](s32), [[PTR_ADD1]](p0) :: (store (s32))
+    ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C4]](s64)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s16)
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]]
+    ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[AND2]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C6]](s64)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C5]]
+    ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[AND3]](s32)
+    ; CHECK-NEXT: [[MUL2:%[0-9]+]]:_(s64) = G_MUL [[SEXT1]], [[C2]]
+    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL2]](s64)
+    ; CHECK-NEXT: G_STORE [[EVEC4]](s32), [[PTR_ADD2]](p0) :: (store (s32))
+    ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C6]](s64)
+    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s16)
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C3]]
+    ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[AND4]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[EVEC6:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C7]](s64)
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C5]]
+    ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(s64) = G_SEXT [[AND5]](s32)
+    ; CHECK-NEXT: [[MUL3:%[0-9]+]]:_(s64) = G_MUL [[SEXT2]], [[C2]]
+    ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL3]](s64)
+    ; CHECK-NEXT: G_STORE [[EVEC6]](s32), [[PTR_ADD3]](p0) :: (store (s32))
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[FRAME_INDEX]](p0) :: (load (<4 x s32>) from %stack.0)
+    ; CHECK-NEXT: $q0 = COPY [[LOAD]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s16>) = COPY $d1
+    %2:_(<4 x s32>) = G_IMPLICIT_DEF
+    %3:_(<4 x s32>) = G_VECTOR_COMPRESS %0(<4 x s32>), %1(<4 x s16>), %2(<4 x s32>)
+    $q0 = COPY %3(<4 x s32>)
+    RET_ReallyLR implicit $q0
+...
+---
+name:            test_vector_compress_v4s32_with_passthru
+body:             |
+  bb.0:
+    liveins: $q0, $d1, $q2
+
+
+    ; CHECK-LABEL: name: test_vector_compress_v4s32_with_passthru
+    ; CHECK: liveins: $q0, $d1, $q2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $d1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $q2
+    ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: G_STORE [[COPY2]](<4 x s32>), [[FRAME_INDEX]](p0) :: (store (<4 x s32>) into %stack.0)
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<4 x s32>) = G_ZEXT [[COPY1]](<4 x s16>)
+    ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[ZEXT]](<4 x s32>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[VECREDUCE_ADD]], [[C1]]
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[AND]](s32)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C2]]
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s64)
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32))
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C3]](s64)
+    ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[C3]], [[C2]]
+    ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL1]](s64)
+    ; CHECK-NEXT: G_STORE [[EVEC]](s32), [[PTR_ADD1]](p0) :: (store (s32))
+    ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C3]](s64)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s16)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C4]]
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[AND1]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C5]](s64)
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C1]]
+    ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[AND2]](s32)
+    ; CHECK-NEXT: [[MUL2:%[0-9]+]]:_(s64) = G_MUL [[SEXT1]], [[C2]]
+    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL2]](s64)
+    ; CHECK-NEXT: G_STORE [[EVEC2]](s32), [[PTR_ADD2]](p0) :: (store (s32))
+    ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C5]](s64)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s16)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C4]]
+    ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[AND3]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C6]](s64)
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C1]]
+    ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(s64) = G_SEXT [[AND4]](s32)
+    ; CHECK-NEXT: [[MUL3:%[0-9]+]]:_(s64) = G_MUL [[SEXT2]], [[C2]]
+    ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL3]](s64)
+    ; CHECK-NEXT: G_STORE [[EVEC4]](s32), [[PTR_ADD3]](p0) :: (store (s32))
+    ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C6]](s64)
+    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s16)
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C4]]
+    ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[AND5]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[EVEC6:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C7]](s64)
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C1]]
+    ; CHECK-NEXT: [[SEXT3:%[0-9]+]]:_(s64) = G_SEXT [[AND6]](s32)
+    ; CHECK-NEXT: [[MUL4:%[0-9]+]]:_(s64) = G_MUL [[SEXT3]], [[C2]]
+    ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL4]](s64)
+    ; CHECK-NEXT: G_STORE [[EVEC6]](s32), [[PTR_ADD4]](p0) :: (store (s32))
+    ; CHECK-NEXT: [[EVEC7:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C7]](s64)
+    ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC7]](s16)
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C4]]
+    ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[AND7]]
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[ADD3]](s32), [[C1]]
+    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD3]](s32), [[C1]]
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ADD3]], [[C1]]
+    ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[SELECT]], [[C1]]
+    ; CHECK-NEXT: [[SEXT4:%[0-9]+]]:_(s64) = G_SEXT [[AND8]](s32)
+    ; CHECK-NEXT: [[MUL5:%[0-9]+]]:_(s64) = G_MUL [[SEXT4]], [[C2]]
+    ; CHECK-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL5]](s64)
+    ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[EVEC6]], [[LOAD]]
+    ; CHECK-NEXT: G_STORE [[SELECT1]](s32), [[PTR_ADD5]](p0) :: (store (s32))
+    ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[FRAME_INDEX]](p0) :: (load (<4 x s32>) from %stack.0)
+    ; CHECK-NEXT: $q0 = COPY [[LOAD1]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s16>) = COPY $d1
+    %2:_(<4 x s32>) = COPY $q2
+    %3:_(<4 x s32>) = G_VECTOR_COMPRESS %0(<4 x s32>), %1(<4 x s16>), %2(<4 x s32>)
+    $q0 = COPY %3(<4 x s32>)
+    RET_ReallyLR implicit $q0
+...
+
+
+

diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 6db0b9326ca47..61ea3fb998374 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -646,6 +646,9 @@
 # DEBUG-NEXT: G_SPLAT_VECTOR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: G_VECTOR_COMPRESS (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_CTTZ (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected

diff  --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll
new file mode 100644
index 0000000000000..fcf5c546f2610
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-compress.ll
@@ -0,0 +1,474 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s
+
+define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compress_v4i32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ushll.4s v1, v1, #0
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str s0, [sp]
+; CHECK-NEXT:    shl.4s v1, v1, #31
+; CHECK-NEXT:    cmlt.4s v1, v1, #0
+; CHECK-NEXT:    mov.s w9, v1[1]
+; CHECK-NEXT:    mov.s w10, v1[2]
+; CHECK-NEXT:    fmov w11, s1
+; CHECK-NEXT:    bfi x8, x11, #2, #1
+; CHECK-NEXT:    and x11, x11, #0x1
+; CHECK-NEXT:    and x9, x9, #0x1
+; CHECK-NEXT:    and w10, w10, #0x1
+; CHECK-NEXT:    add x9, x11, x9
+; CHECK-NEXT:    mov x11, sp
+; CHECK-NEXT:    st1.s { v0 }[1], [x8]
+; CHECK-NEXT:    add w10, w9, w10
+; CHECK-NEXT:    orr x9, x11, x9, lsl #2
+; CHECK-NEXT:    bfi x11, x10, #2, #2
+; CHECK-NEXT:    st1.s { v0 }[2], [x9]
+; CHECK-NEXT:    st1.s { v0 }[3], [x11]
+; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    ret
+    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> undef)
+    ret <4 x i32> %out
+}
+
+
+define <4 x i32> @test_compress_v4i32_with_passthru(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru) {
+; CHECK-LABEL: test_compress_v4i32_with_passthru:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ushll.4s v1, v1, #0
+; CHECK-NEXT:    movi.4s v3, #1
+; CHECK-NEXT:    shl.4s v1, v1, #31
+; CHECK-NEXT:    cmlt.4s v1, v1, #0
+; CHECK-NEXT:    and.16b v3, v1, v3
+; CHECK-NEXT:    str q2, [sp, #-16]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov.s w8, v1[1]
+; CHECK-NEXT:    fmov w16, s1
+; CHECK-NEXT:    mov x12, sp
+; CHECK-NEXT:    mov.s w11, v1[2]
+; CHECK-NEXT:    addv.4s s2, v3
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    mov.s w13, v1[3]
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov x14, sp
+; CHECK-NEXT:    bfi x12, x16, #2, #1
+; CHECK-NEXT:    and x16, x16, #0x1
+; CHECK-NEXT:    mov w15, #3 ; =0x3
+; CHECK-NEXT:    and x8, x8, #0x1
+; CHECK-NEXT:    add x8, x16, x8
+; CHECK-NEXT:    fmov w16, s2
+; CHECK-NEXT:    and x11, x11, #0x1
+; CHECK-NEXT:    and x13, x13, #0x1
+; CHECK-NEXT:    add x11, x8, x11
+; CHECK-NEXT:    orr x8, x9, x8, lsl #2
+; CHECK-NEXT:    add x13, x11, x13
+; CHECK-NEXT:    bfi x14, x11, #2, #2
+; CHECK-NEXT:    bfi x10, x16, #2, #2
+; CHECK-NEXT:    mov.s w16, v0[3]
+; CHECK-NEXT:    cmp x13, #3
+; CHECK-NEXT:    csel x11, x13, x15, lo
+; CHECK-NEXT:    ldr w10, [x10]
+; CHECK-NEXT:    str s0, [sp]
+; CHECK-NEXT:    st1.s { v0 }[1], [x12]
+; CHECK-NEXT:    st1.s { v0 }[2], [x8]
+; CHECK-NEXT:    orr x8, x9, x11, lsl #2
+; CHECK-NEXT:    csel w9, w16, w10, hi
+; CHECK-NEXT:    st1.s { v0 }[3], [x14]
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    ret
+    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru)
+    ret <4 x i32> %out
+}
+
+define <2 x double> @test_compress_v2f64(<2 x double> %vec, <2 x i1> %mask) {
+; CHECK-LABEL: test_compress_v2f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ushll.2d v1, v1, #0
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str d0, [sp]
+; CHECK-NEXT:    shl.2d v1, v1, #63
+; CHECK-NEXT:    cmlt.2d v1, v1, #0
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    bfi x8, x9, #3, #1
+; CHECK-NEXT:    st1.d { v0 }[1], [x8]
+; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    ret
+    %out = call <2 x double> @llvm.experimental.vector.compress.v2f64(<2 x double> %vec, <2 x i1> %mask, <2 x double> undef)
+    ret <2 x double> %out
+}
+
+define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) {
+; CHECK-LABEL: test_compress_v16i8:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    shl.16b v1, v1, #7
+; CHECK-NEXT:    mov x12, sp
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    st1.b { v0 }[0], [x8]
+; CHECK-NEXT:    mov x13, sp
+; CHECK-NEXT:    cmlt.16b v1, v1, #0
+; CHECK-NEXT:    umov.b w9, v1[0]
+; CHECK-NEXT:    umov.b w10, v1[1]
+; CHECK-NEXT:    umov.b w11, v1[2]
+; CHECK-NEXT:    umov.b w14, v1[3]
+; CHECK-NEXT:    bfxil x12, x9, #0, #1
+; CHECK-NEXT:    and x10, x10, #0x1
+; CHECK-NEXT:    and x9, x9, #0x1
+; CHECK-NEXT:    add x9, x9, x10
+; CHECK-NEXT:    umov.b w10, v1[4]
+; CHECK-NEXT:    and x11, x11, #0x1
+; CHECK-NEXT:    st1.b { v0 }[1], [x12]
+; CHECK-NEXT:    orr x12, x8, x9
+; CHECK-NEXT:    add x9, x9, x11
+; CHECK-NEXT:    umov.b w11, v1[5]
+; CHECK-NEXT:    and x14, x14, #0x1
+; CHECK-NEXT:    st1.b { v0 }[2], [x12]
+; CHECK-NEXT:    add x14, x9, x14
+; CHECK-NEXT:    umov.b w12, v1[6]
+; CHECK-NEXT:    orr x9, x8, x9
+; CHECK-NEXT:    and x10, x10, #0x1
+; CHECK-NEXT:    st1.b { v0 }[3], [x9]
+; CHECK-NEXT:    orr x9, x8, x14
+; CHECK-NEXT:    add x10, x14, x10
+; CHECK-NEXT:    umov.b w14, v1[7]
+; CHECK-NEXT:    st1.b { v0 }[4], [x9]
+; CHECK-NEXT:    and x11, x11, #0x1
+; CHECK-NEXT:    bfxil x13, x10, #0, #4
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    add x10, x10, x11
+; CHECK-NEXT:    umov.b w11, v1[8]
+; CHECK-NEXT:    and x12, x12, #0x1
+; CHECK-NEXT:    bfxil x9, x10, #0, #4
+; CHECK-NEXT:    st1.b { v0 }[5], [x13]
+; CHECK-NEXT:    umov.b w13, v1[9]
+; CHECK-NEXT:    add x10, x10, x12
+; CHECK-NEXT:    mov x12, sp
+; CHECK-NEXT:    and x14, x14, #0x1
+; CHECK-NEXT:    st1.b { v0 }[6], [x9]
+; CHECK-NEXT:    umov.b w9, v1[10]
+; CHECK-NEXT:    bfxil x12, x10, #0, #4
+; CHECK-NEXT:    add x10, x10, x14
+; CHECK-NEXT:    mov x14, sp
+; CHECK-NEXT:    and x11, x11, #0x1
+; CHECK-NEXT:    bfxil x14, x10, #0, #4
+; CHECK-NEXT:    add x10, x10, x11
+; CHECK-NEXT:    mov x11, sp
+; CHECK-NEXT:    and x13, x13, #0x1
+; CHECK-NEXT:    st1.b { v0 }[7], [x12]
+; CHECK-NEXT:    mov x12, sp
+; CHECK-NEXT:    bfxil x11, x10, #0, #4
+; CHECK-NEXT:    add x10, x10, x13
+; CHECK-NEXT:    umov.b w13, v1[11]
+; CHECK-NEXT:    st1.b { v0 }[8], [x14]
+; CHECK-NEXT:    umov.b w14, v1[12]
+; CHECK-NEXT:    and x9, x9, #0x1
+; CHECK-NEXT:    bfxil x12, x10, #0, #4
+; CHECK-NEXT:    add x9, x10, x9
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    st1.b { v0 }[9], [x11]
+; CHECK-NEXT:    umov.b w11, v1[13]
+; CHECK-NEXT:    bfxil x10, x9, #0, #4
+; CHECK-NEXT:    st1.b { v0 }[10], [x12]
+; CHECK-NEXT:    umov.b w12, v1[14]
+; CHECK-NEXT:    and x13, x13, #0x1
+; CHECK-NEXT:    and x14, x14, #0x1
+; CHECK-NEXT:    add x9, x9, x13
+; CHECK-NEXT:    st1.b { v0 }[11], [x10]
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    add x13, x9, x14
+; CHECK-NEXT:    mov x14, sp
+; CHECK-NEXT:    bfxil x10, x9, #0, #4
+; CHECK-NEXT:    and x9, x11, #0x1
+; CHECK-NEXT:    mov x11, sp
+; CHECK-NEXT:    add x9, x13, x9
+; CHECK-NEXT:    and w12, w12, #0x1
+; CHECK-NEXT:    bfxil x14, x13, #0, #4
+; CHECK-NEXT:    bfxil x11, x9, #0, #4
+; CHECK-NEXT:    add w9, w9, w12
+; CHECK-NEXT:    st1.b { v0 }[12], [x10]
+; CHECK-NEXT:    bfxil x8, x9, #0, #4
+; CHECK-NEXT:    st1.b { v0 }[13], [x14]
+; CHECK-NEXT:    st1.b { v0 }[14], [x11]
+; CHECK-NEXT:    st1.b { v0 }[15], [x8]
+; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    ret
+    %out = call <16 x i8> @llvm.experimental.vector.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> undef)
+    ret <16 x i8> %out
+}
+
+define <8 x i32> @test_compress_large(<8 x i32> %vec, <8 x i1> %mask) {
+; CHECK-LABEL: test_compress_large:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    ; kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    umov.b w9, v2[0]
+; CHECK-NEXT:    umov.b w10, v2[1]
+; CHECK-NEXT:    mov x12, sp
+; CHECK-NEXT:    umov.b w11, v2[2]
+; CHECK-NEXT:    umov.b w13, v2[3]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    umov.b w14, v2[4]
+; CHECK-NEXT:    str s0, [sp]
+; CHECK-NEXT:    and x10, x10, #0x1
+; CHECK-NEXT:    and x15, x9, #0x1
+; CHECK-NEXT:    bfi x12, x9, #2, #1
+; CHECK-NEXT:    and x9, x11, #0x1
+; CHECK-NEXT:    add x10, x15, x10
+; CHECK-NEXT:    umov.b w11, v2[5]
+; CHECK-NEXT:    add x9, x10, x9
+; CHECK-NEXT:    orr x15, x8, x10, lsl #2
+; CHECK-NEXT:    umov.b w10, v2[6]
+; CHECK-NEXT:    st1.s { v0 }[1], [x12]
+; CHECK-NEXT:    add x12, x8, x9, lsl #2
+; CHECK-NEXT:    and x13, x13, #0x1
+; CHECK-NEXT:    st1.s { v0 }[2], [x15]
+; CHECK-NEXT:    add x9, x9, x13
+; CHECK-NEXT:    st1.s { v0 }[3], [x12]
+; CHECK-NEXT:    and x12, x14, #0x1
+; CHECK-NEXT:    and x11, x11, #0x1
+; CHECK-NEXT:    add x12, x9, x12
+; CHECK-NEXT:    and w10, w10, #0x1
+; CHECK-NEXT:    and x9, x9, #0x7
+; CHECK-NEXT:    add x11, x12, x11
+; CHECK-NEXT:    and x12, x12, #0x7
+; CHECK-NEXT:    str s1, [x8, x9, lsl #2]
+; CHECK-NEXT:    add w10, w11, w10
+; CHECK-NEXT:    and x11, x11, #0x7
+; CHECK-NEXT:    add x12, x8, x12, lsl #2
+; CHECK-NEXT:    and x10, x10, #0x7
+; CHECK-NEXT:    add x9, x8, x11, lsl #2
+; CHECK-NEXT:    add x8, x8, x10, lsl #2
+; CHECK-NEXT:    st1.s { v1 }[1], [x12]
+; CHECK-NEXT:    st1.s { v1 }[2], [x9]
+; CHECK-NEXT:    st1.s { v1 }[3], [x8]
+; CHECK-NEXT:    ldp q0, q1, [sp], #32
+; CHECK-NEXT:    ret
+    %out = call <8 x i32> @llvm.experimental.vector.compress(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> undef)
+    ret <8 x i32> %out
+}
+
+define <4 x i32> @test_compress_all_const() {
+; CHECK-LABEL: test_compress_all_const:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:  Lloh0:
+; CHECK-NEXT:    adrp x8, lCPI5_0 at PAGE
+; CHECK-NEXT:  Lloh1:
+; CHECK-NEXT:    ldr q0, [x8, lCPI5_0 at PAGEOFF]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh1
+    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> <i32 3, i32 5, i32 7, i32 9>,
+                                                <4 x i1>   <i1 0,  i1 1,  i1 0,  i1 1>,
+                                                <4 x i32> undef)
+    ret <4 x i32> %out
+}
+
+define <4 x i32> @test_compress_const_mask(<4 x i32> %vec) {
+; CHECK-LABEL: test_compress_const_mask:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    mov.s v0[1], v0[3]
+; CHECK-NEXT:    ret
+    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> <i1 1, i1 undef, i1 0, i1 1>, <4 x i32> undef)
+    ret <4 x i32> %out
+}
+
+define <4 x i32> @test_compress_const_mask_passthrough(<4 x i32> %vec, <4 x i32> %passthru) {
+; CHECK-LABEL: test_compress_const_mask_passthrough:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    mov.d v1[0], v0[1]
+; CHECK-NEXT:    mov.s v1[0], v0[0]
+; CHECK-NEXT:    mov.16b v0, v1
+; CHECK-NEXT:    ret
+    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> <i1 1, i1 undef, i1 0, i1 1>, <4 x i32> %passthru)
+    ret <4 x i32> %out
+}
+
+define <4 x i32> @test_compress_const_mask_const_passthrough(<4 x i32> %vec) {
+; CHECK-LABEL: test_compress_const_mask_const_passthrough:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    mov.s v0[1], v0[3]
+; CHECK-NEXT:    mov w8, #7 ; =0x7
+; CHECK-NEXT:    mov.s v0[2], w8
+; CHECK-NEXT:    mov w8, #8 ; =0x8
+; CHECK-NEXT:    mov.s v0[3], w8
+; CHECK-NEXT:    ret
+    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i32> <i32 5, i32 6, i32 7, i32 8>)
+    ret <4 x i32> %out
+}
+
+; We pass a placeholder value for the const_mask* tests to check that they are converted to a no-op by simply copying
+; the second vector input register to the return register or doing nothing.
+define <4 x i32> @test_compress_const_splat1_mask(<4 x i32> %ignore, <4 x i32> %vec) {
+; CHECK-LABEL: test_compress_const_splat1_mask:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    mov.16b v0, v1
+; CHECK-NEXT:    ret
+    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 -1), <4 x i32> undef)
+    ret <4 x i32> %out
+}
+define <4 x i32> @test_compress_const_splat0_mask(<4 x i32> %ignore, <4 x i32> %vec) {
+; CHECK-LABEL: test_compress_const_splat0_mask:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ret
+    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef)
+    ret <4 x i32> %out
+}
+define <4 x i32> @test_compress_undef_mask(<4 x i32> %ignore, <4 x i32> %vec) {
+; CHECK-LABEL: test_compress_undef_mask:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ret
+    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> undef, <4 x i32> undef)
+    ret <4 x i32> %out
+}
+define <4 x i32> @test_compress_const_splat0_mask_with_passthru(<4 x i32> %ignore, <4 x i32> %vec, <4 x i32> %passthru) {
+; CHECK-LABEL: test_compress_const_splat0_mask_with_passthru:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    mov.16b v0, v2
+; CHECK-NEXT:    ret
+    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> %passthru)
+    ret <4 x i32> %out
+}
+define <4 x i32> @test_compress_const_splat0_mask_without_passthru(<4 x i32> %ignore, <4 x i32> %vec) {
+; CHECK-LABEL: test_compress_const_splat0_mask_without_passthru:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ret
+    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef)
+    ret <4 x i32> %out
+}
+
+define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compress_small:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    shl.4h v1, v1, #15
+; CHECK-NEXT:    add x8, sp, #8
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    str h0, [sp, #8]
+; CHECK-NEXT:    cmlt.4h v1, v1, #0
+; CHECK-NEXT:    umov.h w9, v1[0]
+; CHECK-NEXT:    umov.h w10, v1[1]
+; CHECK-NEXT:    umov.h w11, v1[2]
+; CHECK-NEXT:    bfi x8, x9, #1, #1
+; CHECK-NEXT:    and x10, x10, #0x1
+; CHECK-NEXT:    and x9, x9, #0x1
+; CHECK-NEXT:    add x9, x9, x10
+; CHECK-NEXT:    and w11, w11, #0x1
+; CHECK-NEXT:    add x10, sp, #8
+; CHECK-NEXT:    add w11, w9, w11
+; CHECK-NEXT:    orr x9, x10, x9, lsl #1
+; CHECK-NEXT:    st1.h { v0 }[1], [x8]
+; CHECK-NEXT:    bfi x10, x11, #1, #2
+; CHECK-NEXT:    st1.h { v0 }[2], [x9]
+; CHECK-NEXT:    st1.h { v0 }[3], [x10]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+    %out = call <4 x i8> @llvm.experimental.vector.compress(<4 x i8> %vec, <4 x i1> %mask, <4 x i8> undef)
+    ret <4 x i8> %out
+}
+
+define <4 x i4> @test_compress_illegal_element_type(<4 x i4> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compress_illegal_element_type:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    shl.4h v1, v1, #15
+; CHECK-NEXT:    add x8, sp, #8
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    str h0, [sp, #8]
+; CHECK-NEXT:    cmlt.4h v1, v1, #0
+; CHECK-NEXT:    umov.h w9, v1[0]
+; CHECK-NEXT:    umov.h w10, v1[1]
+; CHECK-NEXT:    umov.h w11, v1[2]
+; CHECK-NEXT:    bfi x8, x9, #1, #1
+; CHECK-NEXT:    and x10, x10, #0x1
+; CHECK-NEXT:    and x9, x9, #0x1
+; CHECK-NEXT:    add x9, x9, x10
+; CHECK-NEXT:    and w11, w11, #0x1
+; CHECK-NEXT:    add x10, sp, #8
+; CHECK-NEXT:    add w11, w9, w11
+; CHECK-NEXT:    orr x9, x10, x9, lsl #1
+; CHECK-NEXT:    st1.h { v0 }[1], [x8]
+; CHECK-NEXT:    bfi x10, x11, #1, #2
+; CHECK-NEXT:    st1.h { v0 }[2], [x9]
+; CHECK-NEXT:    st1.h { v0 }[3], [x10]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+    %out = call <4 x i4> @llvm.experimental.vector.compress(<4 x i4> %vec, <4 x i1> %mask, <4 x i4> undef)
+    ret <4 x i4> %out
+}
+
+define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) {
+; CHECK-LABEL: test_compress_narrow:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movi.2d v1, #0000000000000000
+; CHECK-NEXT:    mov x11, sp
+; CHECK-NEXT:    str s0, [sp]
+; CHECK-NEXT:    mov.h v1[0], w0
+; CHECK-NEXT:    mov.h v1[1], w1
+; CHECK-NEXT:    mov.h v1[2], w2
+; CHECK-NEXT:    ushll.4s v1, v1, #0
+; CHECK-NEXT:    shl.4s v1, v1, #31
+; CHECK-NEXT:    cmlt.4s v1, v1, #0
+; CHECK-NEXT:    mov.s w8, v1[1]
+; CHECK-NEXT:    mov.s w9, v1[2]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    bfi x11, x10, #2, #1
+; CHECK-NEXT:    and x10, x10, #0x1
+; CHECK-NEXT:    and x8, x8, #0x1
+; CHECK-NEXT:    and w9, w9, #0x1
+; CHECK-NEXT:    add x8, x10, x8
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    st1.s { v0 }[1], [x11]
+; CHECK-NEXT:    add w9, w8, w9
+; CHECK-NEXT:    orr x8, x10, x8, lsl #2
+; CHECK-NEXT:    bfi x10, x9, #2, #2
+; CHECK-NEXT:    st1.s { v0 }[2], [x8]
+; CHECK-NEXT:    st1.s { v0 }[3], [x10]
+; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    ret
+    %out = call <3 x i32> @llvm.experimental.vector.compress(<3 x i32> %vec, <3 x i1> %mask, <3 x i32> undef)
+    ret <3 x i32> %out
+}
+
+define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i1> %mask) {
+; CHECK-LABEL: test_compress_narrow_illegal_element_type:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movi.2d v0, #0000000000000000
+; CHECK-NEXT:    add x10, sp, #8
+; CHECK-NEXT:    strh w0, [sp, #8]
+; CHECK-NEXT:    mov.h v0[0], w3
+; CHECK-NEXT:    mov.h v0[1], w4
+; CHECK-NEXT:    mov.h v0[2], w5
+; CHECK-NEXT:    shl.4h v0, v0, #15
+; CHECK-NEXT:    cmlt.4h v0, v0, #0
+; CHECK-NEXT:    umov.h w8, v0[0]
+; CHECK-NEXT:    umov.h w9, v0[1]
+; CHECK-NEXT:    and x9, x9, #0x1
+; CHECK-NEXT:    and x11, x8, #0x1
+; CHECK-NEXT:    bfi x10, x8, #1, #1
+; CHECK-NEXT:    add x8, x11, x9
+; CHECK-NEXT:    add x9, sp, #8
+; CHECK-NEXT:    orr x8, x9, x8, lsl #1
+; CHECK-NEXT:    strh w1, [x10]
+; CHECK-NEXT:    strh w2, [x8]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    umov.h w0, v0[0]
+; CHECK-NEXT:    umov.h w1, v0[1]
+; CHECK-NEXT:    umov.h w2, v0[2]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+    %out = call <3 x i3> @llvm.experimental.vector.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef)
+    ret <3 x i3> %out
+}


        


More information about the llvm-commits mailing list