[libcxx-commits] [clang-tools-extra] [clang] [flang] [libcxxabi] [openmp] [libcxx] [mlir] [llvm] [libc] [compiler-rt] [AArch64] Combine store (trunc X to <3 x i8>) to sequence of ST1.b. (PR #78637)

Tue Jan 23 14:28:37 PST 2024

================
@@ -21471,6 +21471,57 @@ bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
          (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
 }
 
+// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
+static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
+                                   const AArch64Subtarget *Subtarget) {
+  SDValue Value = ST->getValue();
+  EVT ValueVT = Value.getValueType();
+
+  if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
+      ST->getOriginalAlign() >= 4 || Value.getOpcode() != ISD::TRUNCATE ||
+      ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
+    return SDValue();
+
+  assert(ST->getOffset().isUndef() && "undef offset expected");
+  SDLoc DL(ST);
+  auto WideVT = EVT::getVectorVT(
+      *DAG.getContext(),
+      Value->getOperand(0).getValueType().getVectorElementType(), 4);
+  SDValue UndefVector = DAG.getUNDEF(WideVT);
+  SDValue WideTrunc = DAG.getNode(
+      ISD::INSERT_SUBVECTOR, DL, WideVT,
+      {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
+  SDValue Cast = DAG.getNode(
+      ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
+      WideTrunc);
+
+  unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
----------------
fhahn wrote:

I am not sure, I tried but the code here combines the trunc with the stores; it looks like`scalarizeVectorStore` would generate a 16 bit store and a 8 bit store, and we first need to do the cast separately.

https://github.com/llvm/llvm-project/pull/78637