[llvm] 9a3144d - [AArch64] Reuse larger DUP if available

David Green via llvm-commits llvm-commits at lists.llvm.org
Sun May 29 11:42:18 PDT 2022


Author: David Green
Date: 2022-05-29T19:42:13+01:00
New Revision: 9a3144d078389c19b269b8dd94b9f5306754c039

URL: https://github.com/llvm/llvm-project/commit/9a3144d078389c19b269b8dd94b9f5306754c039
DIFF: https://github.com/llvm/llvm-project/commit/9a3144d078389c19b269b8dd94b9f5306754c039.diff

LOG: [AArch64] Reuse larger DUP if available

If both a v2i32 DUP(x) and a v4i32 DUP(x) node exist, we can reuse the
larger node, obtaining the smaller one via a vector extract of its low
half. This comes up in the smull/smlal code, but needs a small fixup so
that the smull2 code in tryExtendDUPToExtractHigh and
performAddSubLongCombine can still match smull2 extracts.
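
As a concrete illustration (reassembled from the test update below, so
the register assignments are only representative), instead of
materializing two splats of the same value:

    dup    v0.4h, w8
    dup    v1.8h, w8
    ...
    smull2 v4.4s, v1.8h, v2.8h
    smull  v2.4s, v0.4h, v2.4h

we now keep a single 128-bit dup and read its low half for the smull:

    dup    v0.8h, w8
    ...
    smull2 v3.4s, v0.8h, v1.8h
    smull  v1.4s, v0.4h, v1.4h

This drops one dup from the vector preheader and frees a register in
the loop body.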

Differential Revision: https://reviews.llvm.org/D126449

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d31008496ea4f..2652e98fb30b2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15147,6 +15147,11 @@ static SDValue tryCombineFixedPointConvert(SDNode *N,
 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
 // similarly here.
 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
+  MVT VT = N.getSimpleValueType();
+  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      N.getConstantOperandVal(1) == 0)
+    N = N.getOperand(0);
+
   switch (N.getOpcode()) {
   case AArch64ISD::DUP:
   case AArch64ISD::DUPLANE8:
@@ -15167,18 +15172,19 @@ static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
     return SDValue();
   }
 
-  MVT NarrowTy = N.getSimpleValueType();
-  if (!NarrowTy.is64BitVector())
+  if (!VT.is64BitVector())
     return SDValue();
 
-  MVT ElementTy = NarrowTy.getVectorElementType();
-  unsigned NumElems = NarrowTy.getVectorNumElements();
-  MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
+  SDLoc DL(N);
+  unsigned NumElems = VT.getVectorNumElements();
+  if (N.getValueType().is64BitVector()) {
+    MVT ElementTy = VT.getVectorElementType();
+    MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
+    N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
+  }
 
-  SDLoc dl(N);
-  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
-                     DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
-                     DAG.getConstant(NumElems, dl, MVT::i64));
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
+                     DAG.getConstant(NumElems, DL, MVT::i64));
 }
 
 static bool isEssentiallyExtractHighSubvector(SDValue N) {
@@ -18225,6 +18231,24 @@ static SDValue performSelectCombine(SDNode *N,
   return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
 }
 
+static SDValue performDUPCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+  // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
+  // 128bit vector version.
+  if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
+    EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
+    if (SDNode *LN = DCI.DAG.getNodeIfExists(
+            N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) {
+      SDLoc DL(N);
+      return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
+                             DCI.DAG.getConstant(0, DL, MVT::i64));
+    }
+  }
+
+  return performPostLD1Combine(N, DCI, false);
+}
+
 /// Get rid of unnecessary NVCASTs (that don't change the type).
 static SDValue performNVCASTCombine(SDNode *N) {
   if (N->getValueType(0) == N->getOperand(0).getValueType())
@@ -18948,7 +18972,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case AArch64ISD::CSEL:
     return performCSELCombine(N, DCI, DAG);
   case AArch64ISD::DUP:
-    return performPostLD1Combine(N, DCI, false);
+    return performDUPCombine(N, DCI);
   case AArch64ISD::NVCAST:
     return performNVCASTCombine(N);
   case AArch64ISD::SPLICE:

diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 12b451f509f73..0ad7b8b15ad33 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -201,22 +201,21 @@ define void @larger_smull(i16* nocapture noundef readonly %x, i16 noundef %y, i3
 ; CHECK-NEXT:    b .LBB3_6
 ; CHECK-NEXT:  .LBB3_3: // %vector.ph
 ; CHECK-NEXT:    and x10, x9, #0xfffffff0
-; CHECK-NEXT:    dup v0.4h, w8
 ; CHECK-NEXT:    add x11, x2, #32
 ; CHECK-NEXT:    add x12, x0, #16
 ; CHECK-NEXT:    mov x13, x10
-; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    dup v0.8h, w8
 ; CHECK-NEXT:  .LBB3_4: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q2, q3, [x12, #-16]
+; CHECK-NEXT:    ldp q1, q2, [x12, #-16]
 ; CHECK-NEXT:    subs x13, x13, #16
 ; CHECK-NEXT:    add x12, x12, #32
-; CHECK-NEXT:    smull2 v4.4s, v1.8h, v2.8h
+; CHECK-NEXT:    smull2 v3.4s, v0.8h, v1.8h
+; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT:    smull2 v4.4s, v0.8h, v2.8h
 ; CHECK-NEXT:    smull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    smull2 v5.4s, v1.8h, v3.8h
-; CHECK-NEXT:    smull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    stp q2, q4, [x11, #-32]
-; CHECK-NEXT:    stp q3, q5, [x11], #64
+; CHECK-NEXT:    stp q1, q3, [x11, #-32]
+; CHECK-NEXT:    stp q2, q4, [x11], #64
 ; CHECK-NEXT:    b.ne .LBB3_4
 ; CHECK-NEXT:  // %bb.5: // %middle.block
 ; CHECK-NEXT:    cmp x10, x9
@@ -314,22 +313,21 @@ define void @larger_umull(i16* nocapture noundef readonly %x, i16 noundef %y, i3
 ; CHECK-NEXT:    b .LBB4_6
 ; CHECK-NEXT:  .LBB4_3: // %vector.ph
 ; CHECK-NEXT:    and x10, x9, #0xfffffff0
-; CHECK-NEXT:    dup v0.4h, w8
 ; CHECK-NEXT:    add x11, x2, #32
 ; CHECK-NEXT:    add x12, x0, #16
 ; CHECK-NEXT:    mov x13, x10
-; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    dup v0.8h, w8
 ; CHECK-NEXT:  .LBB4_4: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q2, q3, [x12, #-16]
+; CHECK-NEXT:    ldp q1, q2, [x12, #-16]
 ; CHECK-NEXT:    subs x13, x13, #16
 ; CHECK-NEXT:    add x12, x12, #32
-; CHECK-NEXT:    umull2 v4.4s, v1.8h, v2.8h
+; CHECK-NEXT:    umull2 v3.4s, v0.8h, v1.8h
+; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT:    umull2 v4.4s, v0.8h, v2.8h
 ; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    umull2 v5.4s, v1.8h, v3.8h
-; CHECK-NEXT:    umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    stp q2, q4, [x11, #-32]
-; CHECK-NEXT:    stp q3, q5, [x11], #64
+; CHECK-NEXT:    stp q1, q3, [x11, #-32]
+; CHECK-NEXT:    stp q2, q4, [x11], #64
 ; CHECK-NEXT:    b.ne .LBB4_4
 ; CHECK-NEXT:  // %bb.5: // %middle.block
 ; CHECK-NEXT:    cmp x10, x9
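
To reproduce the new codegen, the updated test can be run through llc;
a sketch (the test's RUN line is not shown in this excerpt, so the
exact triple and flags here are an assumption):

    llc -mtriple=aarch64 llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll -o -

The output for larger_smull/larger_umull should now contain a single
dup per loop rather than the earlier 64-bit/128-bit pair.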

More information about the llvm-commits mailing list