[llvm] cd89f5c - [SVE][CodeGen] Legalisation of truncate for scalable vectors

Thu Sep 10 03:38:46 PDT 2020

Author: Kerry McLaughlin
Date: 2020-09-10T11:35:33+01:00
New Revision: cd89f5c91b4bad90278a59865fc06a75211589a1

URL: https://github.com/llvm/llvm-project/commit/cd89f5c91b4bad90278a59865fc06a75211589a1
DIFF: https://github.com/llvm/llvm-project/commit/cd89f5c91b4bad90278a59865fc06a75211589a1.diff

LOG: [SVE][CodeGen] Legalisation of truncate for scalable vectors

Truncating from an illegal SVE type to a legal type, e.g.
`trunc <vscale x 4 x i64> %in to <vscale x 4 x i32>`
fails after PromoteIntOp_CONCAT_VECTORS attempts to
create a BUILD_VECTOR.

This patch changes the promote function to create a sequence of
INSERT_SUBVECTORs if the return type is scalable, and replaces
these with UNPK+UZP1 for AArch64.

Reviewed By: paulwalker-arm

Differential Revision: https://reviews.llvm.org/D86548

Added: 
    llvm/test/CodeGen/AArch64/sve-split-trunc.ll

Modified: 
    llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index e1881c20e5b3..bfe1b365efc4 100644

--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4702,8 +4702,23 @@ SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N) {
 
 SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) {
   SDLoc dl(N);
+
+  EVT ResVT = N->getValueType(0);
   unsigned NumElems = N->getNumOperands();
 
+  if (ResVT.isScalableVector()) {
+    SDValue ResVec = DAG.getUNDEF(ResVT);
+
+    for (unsigned OpIdx = 0; OpIdx < NumElems; ++OpIdx) {
+      SDValue Op = N->getOperand(OpIdx);
+      unsigned OpNumElts = Op.getValueType().getVectorMinNumElements();
+      ResVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ResVec, Op,
+                           DAG.getIntPtrConstant(OpIdx * OpNumElts, dl));
+    }
+
+    return ResVec;
+  }
+
   EVT RetSclrTy = N->getValueType(0).getVectorElementType();
 
   SmallVector<SDValue, 8> NewOps;

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 063644716a65..d4f324490430 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -964,8 +964,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       }
     }
 
-    for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32})
+    for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+    }
 
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
@@ -9099,9 +9101,34 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
   EVT InVT = Op.getOperand(1).getValueType();
   unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
 
-  // We don't have any patterns for scalable vector yet.
-  if (InVT.isScalableVector())
+  if (InVT.isScalableVector()) {
+    SDLoc DL(Op);
+    EVT VT = Op.getValueType();
+
+    if (!isTypeLegal(VT) || !VT.isInteger())
+      return SDValue();
+
+    SDValue Vec0 = Op.getOperand(0);
+    SDValue Vec1 = Op.getOperand(1);
+
+    // Ensure the subvector is half the size of the main vector.
+    if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
+      return SDValue();
+
+    // Extend elements of smaller vector...
+    EVT WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext()));
+    SDValue ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
+
+    if (Idx == 0) {
+      SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
+      return DAG.getNode(AArch64ISD::UZP1, DL, VT, ExtVec, HiVec0);
+    } else if (Idx == InVT.getVectorMinNumElements()) {
+      SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
+      return DAG.getNode(AArch64ISD::UZP1, DL, VT, LoVec0, ExtVec);
+    }
+
     return SDValue();
+  }
 
   // This will be matched by custom code during ISelDAGToDAG.
   if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
@@ -13001,6 +13028,31 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                       S->getMemOperand()->getFlags());
 }
 
+static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  EVT ResVT = N->getValueType(0);
+
+  // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
+  if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
+    if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
+      SDValue X = Op0.getOperand(0).getOperand(0);
+      return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
+    }
+  }
+
+  // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
+  if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
+    if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
+      SDValue Z = Op1.getOperand(0).getOperand(1);
+      return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
+    }
+  }
+
+  return SDValue();
+}
+
 /// Target-specific DAG combine function for post-increment LD1 (lane) and
 /// post-increment LD1R.
 static SDValue performPostLD1Combine(SDNode *N,
@@ -14342,6 +14394,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performPostLD1Combine(N, DCI, false);
   case AArch64ISD::NVCAST:
     return performNVCASTCombine(N);
+  case AArch64ISD::UZP1:
+    return performUzpCombine(N, DAG);
   case ISD::INSERT_VECTOR_ELT:
     return performPostLD1Combine(N, DCI, true);
   case ISD::INTRINSIC_VOID:

diff  --git a/llvm/test/CodeGen/AArch64/sve-split-trunc.ll b/llvm/test/CodeGen/AArch64/sve-split-trunc.ll
new file mode 100644
index 000000000000..6c81c49070fb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-split-trunc.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 16 x i8> @trunc_i16toi8(<vscale x 16 x i16> %in) {
+; CHECK-LABEL: trunc_i16toi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    ret
+  %out = trunc <vscale x 16 x i16> %in to <vscale x 16 x i8>
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 16 x i8> @trunc_i32toi8(<vscale x 16 x i32> %in) {
+; CHECK-LABEL: trunc_i32toi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z3.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z2.b
+; CHECK-NEXT:    ret
+  %out = trunc <vscale x 16 x i32> %in to <vscale x 16 x i8>
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @trunc_i32toi16(<vscale x 8 x i32> %in) {
+; CHECK-LABEL: trunc_i32toi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %out = trunc <vscale x 8 x i32> %in to <vscale x 8 x i16>
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @trunc_i64toi32(<vscale x 4 x i64> %in) {
+; CHECK-LABEL: trunc_i64toi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %out = trunc <vscale x 4 x i64> %in to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 8 x i16> @trunc_i64toi16(<vscale x 8 x i64> %in) {
+; CHECK-LABEL: trunc_i64toi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    ret
+  %out = trunc <vscale x 8 x i64> %in to <vscale x 8 x i16>
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 16 x i8> @trunc_i64toi8(<vscale x 16 x i64> %in) {
+; CHECK-LABEL: trunc_i64toi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z6.s, z6.s, z7.s
+; CHECK-NEXT:    uzp1 z4.s, z4.s, z5.s
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z1.h, z4.h, z6.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    ret
+  %out = trunc <vscale x 16 x i64> %in to <vscale x 16 x i8>
+  ret <vscale x 16 x i8> %out
+}