[llvm] [AArch64] Extend custom lowering for SVE types in `@llvm.experimental.vector.compress` (PR #105515)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 27 09:54:58 PDT 2024
================
@@ -6690,16 +6688,67 @@ SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
EVT ContainerVT = getSVEContainerType(VecVT);
EVT CastVT = VecVT.changeVectorElementTypeToInteger();
- // Convert to i32 or i64 for smaller types, as these are the only supported
- // sizes for compact.
- if (ContainerVT != VecVT) {
- Vec = DAG.getBitcast(CastVT, Vec);
- Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
- }
+ // These vector types aren't supported by the `compact` instruction, so
+ // we split and compact them as <vscale x 4 x i32>, store them on the stack,
+ // and then merge them again. In the other cases, emit compact directly.
+ SDValue Compressed;
+ if (VecVT == MVT::nxv8i16 || VecVT == MVT::nxv8i8 || VecVT == MVT::nxv16i8) {
+ SDValue Chain = DAG.getEntryNode();
+ SDValue StackPtr = DAG.CreateStackTemporary(
+ VecVT.getStoreSize(), DAG.getReducedAlign(VecVT, /*UseABI=*/false));
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ EVT PartialVecVT =
+ EVT::getVectorVT(*DAG.getContext(), ElmtVT, 4, /*isScalable*/ true);
+ EVT OffsetVT = getVectorIdxTy(DAG.getDataLayout());
+ SDValue Offset = DAG.getConstant(0, DL, OffsetVT);
+
+ for (unsigned I = 0; I < MinElmts; I += 4) {
+ SDValue PartialVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, PartialVecVT,
+ Vec, DAG.getVectorIdxConstant(I, DL));
+ PartialVec = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv4i32, PartialVec);
+
+ SDValue PartialMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv4i1,
+ Mask, DAG.getVectorIdxConstant(I, DL));
+
+ SDValue PartialCompressed = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv4i32,
+ DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64),
+ PartialMask, PartialVec);
+ PartialCompressed =
+ DAG.getNode(ISD::TRUNCATE, DL, PartialVecVT, PartialCompressed);
+
+ SDValue OutPtr = DAG.getNode(
+ ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
+ DAG.getNode(
+ ISD::MUL, DL, OffsetVT, Offset,
+ DAG.getConstant(ElmtVT.getScalarSizeInBits() / 8, DL, OffsetVT)));
+ Chain = DAG.getStore(Chain, DL, PartialCompressed, OutPtr,
+ MachinePointerInfo::getUnknownStack(MF));
+
+ SDValue PartialOffset = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, OffsetVT,
+ DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64),
+ PartialMask, PartialMask);
+ Offset = DAG.getNode(ISD::ADD, DL, OffsetVT, Offset, PartialOffset);
+ }
+
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(
+ MF, cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex());
+ Compressed = DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo);
+ } else {
+ // Convert to i32 or i64 for smaller types, as these are the only supported
+ // sizes for compact.
+ if (ContainerVT != VecVT) {
+ Vec = DAG.getBitcast(CastVT, Vec);
+ Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
+ }
- SDValue Compressed = DAG.getNode(
- ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(),
- DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec);
+ Compressed = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(),
+ DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask,
+ Vec);
+ }
// compact fills with 0s, so if our passthru is all 0s, do nothing here.
if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
----------------
davemgreen wrote:
What happens with pass-throughs, especially if they are 0s? Could they read past the end of the last store (i.e. pick up uninitialized stack memory)? It might be worth adding some tests.
https://github.com/llvm/llvm-project/pull/105515
More information about the llvm-commits
mailing list