[llvm] 9a3144d - [AArch64] Reuse larger DUP if available
David Green via llvm-commits
llvm-commits at lists.llvm.org
Sun May 29 11:42:18 PDT 2022
Author: David Green
Date: 2022-05-29T19:42:13+01:00
New Revision: 9a3144d078389c19b269b8dd94b9f5306754c039
URL: https://github.com/llvm/llvm-project/commit/9a3144d078389c19b269b8dd94b9f5306754c039
DIFF: https://github.com/llvm/llvm-project/commit/9a3144d078389c19b269b8dd94b9f5306754c039.diff
LOG: [AArch64] Reuse larger DUP if available
If both a v2i32 DUP(x) node and a v4i32 DUP(x) node exist, we can reuse the
larger node, using a vector extract to obtain the smaller one. This comes up
in the smull/smlal code, but needs a small fixup so that the smull2 code in
tryExtendDUPToExtractHigh/performAddSubLongCombine can still match the
high-half extracts.
Differential Revision: https://reviews.llvm.org/D126449
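At the DAG level, the combine rewrites the 64-bit DUP as an extract of the
low half of the already-existing 128-bit DUP, so both uses share a single
register. A minimal sketch of that rewrite using SelectionDAG APIs (the
standalone helper and its name are hypothetical; the patch performs this
inside performDUPCombine below):

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Hypothetical helper: N is a 64-bit DUP (e.g. v2i32 DUP(x)) and LN is an
// existing 128-bit DUP of the same operand (e.g. v4i32 DUP(x)). Build an
// extract of LN's low half so only the wider DUP is materialized.
static SDValue reuseWideDUP(SelectionDAG &DAG, SDNode *N, SDNode *LN) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0); // the narrow type, e.g. v2i32
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
                     DAG.getConstant(0, DL, MVT::i64));
}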
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d31008496ea4f..2652e98fb30b2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15147,6 +15147,11 @@ static SDValue tryCombineFixedPointConvert(SDNode *N,
 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
 // similarly here.
 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
+  MVT VT = N.getSimpleValueType();
+  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      N.getConstantOperandVal(1) == 0)
+    N = N.getOperand(0);
+
   switch (N.getOpcode()) {
   case AArch64ISD::DUP:
   case AArch64ISD::DUPLANE8:
@@ -15167,18 +15172,19 @@ static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
     return SDValue();
   }
 
-  MVT NarrowTy = N.getSimpleValueType();
-  if (!NarrowTy.is64BitVector())
+  if (!VT.is64BitVector())
     return SDValue();
 
-  MVT ElementTy = NarrowTy.getVectorElementType();
-  unsigned NumElems = NarrowTy.getVectorNumElements();
-  MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
+  SDLoc DL(N);
+  unsigned NumElems = VT.getVectorNumElements();
+  if (N.getValueType().is64BitVector()) {
+    MVT ElementTy = VT.getVectorElementType();
+    MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
+    N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
+  }
 
-  SDLoc dl(N);
-  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
-                     DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
-                     DAG.getConstant(NumElems, dl, MVT::i64));
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
+                     DAG.getConstant(NumElems, DL, MVT::i64));
 }
 
 static bool isEssentiallyExtractHighSubvector(SDValue N) {
@@ -18225,6 +18231,24 @@ static SDValue performSelectCombine(SDNode *N,
   return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
 }
 
+static SDValue performDUPCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+  // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
+  // 128bit vector version.
+  if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
+    EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
+    if (SDNode *LN = DCI.DAG.getNodeIfExists(
+            N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) {
+      SDLoc DL(N);
+      return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
+                             DCI.DAG.getConstant(0, DL, MVT::i64));
+    }
+  }
+
+  return performPostLD1Combine(N, DCI, false);
+}
+
 /// Get rid of unnecessary NVCASTs (that don't change the type).
 static SDValue performNVCASTCombine(SDNode *N) {
   if (N->getValueType(0) == N->getOperand(0).getValueType())
@@ -18948,7 +18972,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case AArch64ISD::CSEL:
     return performCSELCombine(N, DCI, DAG);
   case AArch64ISD::DUP:
-    return performPostLD1Combine(N, DCI, false);
+    return performDUPCombine(N, DCI);
   case AArch64ISD::NVCAST:
     return performNVCASTCombine(N);
   case AArch64ISD::SPLICE:
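
Because the new combine runs after legalization, a DUP feeding a long
multiply can now reach tryExtendDUPToExtractHigh wrapped in an
EXTRACT_SUBVECTOR at index 0; the first hunk above peeks through that
wrapper before matching. The look-through in isolation, as a minimal sketch
(hypothetical standalone framing, same headers as the sketch above):

// If N extracts the low subvector (index 0) of a wider value, look through
// to the wider source so the DUP underneath can still be matched and
// re-extended as a high half for smull2/umull2.
static SDValue peekThroughLowSubvectorExtract(SDValue N) {
  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      N.getConstantOperandVal(1) == 0)
    return N.getOperand(0);
  return N;
}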
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 12b451f509f73..0ad7b8b15ad33 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -201,22 +201,21 @@ define void @larger_smull(i16* nocapture noundef readonly %x, i16 noundef %y, i3
 ; CHECK-NEXT:    b .LBB3_6
 ; CHECK-NEXT:  .LBB3_3: // %vector.ph
 ; CHECK-NEXT:    and x10, x9, #0xfffffff0
-; CHECK-NEXT:    dup v0.4h, w8
 ; CHECK-NEXT:    add x11, x2, #32
 ; CHECK-NEXT:    add x12, x0, #16
 ; CHECK-NEXT:    mov x13, x10
-; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    dup v0.8h, w8
 ; CHECK-NEXT:  .LBB3_4: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q2, q3, [x12, #-16]
+; CHECK-NEXT:    ldp q1, q2, [x12, #-16]
 ; CHECK-NEXT:    subs x13, x13, #16
 ; CHECK-NEXT:    add x12, x12, #32
-; CHECK-NEXT:    smull2 v4.4s, v1.8h, v2.8h
+; CHECK-NEXT:    smull2 v3.4s, v0.8h, v1.8h
+; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT:    smull2 v4.4s, v0.8h, v2.8h
 ; CHECK-NEXT:    smull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    smull2 v5.4s, v1.8h, v3.8h
-; CHECK-NEXT:    smull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    stp q2, q4, [x11, #-32]
-; CHECK-NEXT:    stp q3, q5, [x11], #64
+; CHECK-NEXT:    stp q1, q3, [x11, #-32]
+; CHECK-NEXT:    stp q2, q4, [x11], #64
 ; CHECK-NEXT:    b.ne .LBB3_4
 ; CHECK-NEXT:  // %bb.5: // %middle.block
 ; CHECK-NEXT:    cmp x10, x9
@@ -314,22 +313,21 @@ define void @larger_umull(i16* nocapture noundef readonly %x, i16 noundef %y, i3
 ; CHECK-NEXT:    b .LBB4_6
 ; CHECK-NEXT:  .LBB4_3: // %vector.ph
 ; CHECK-NEXT:    and x10, x9, #0xfffffff0
-; CHECK-NEXT:    dup v0.4h, w8
 ; CHECK-NEXT:    add x11, x2, #32
 ; CHECK-NEXT:    add x12, x0, #16
 ; CHECK-NEXT:    mov x13, x10
-; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    dup v0.8h, w8
 ; CHECK-NEXT:  .LBB4_4: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q2, q3, [x12, #-16]
+; CHECK-NEXT:    ldp q1, q2, [x12, #-16]
 ; CHECK-NEXT:    subs x13, x13, #16
 ; CHECK-NEXT:    add x12, x12, #32
-; CHECK-NEXT:    umull2 v4.4s, v1.8h, v2.8h
+; CHECK-NEXT:    umull2 v3.4s, v0.8h, v1.8h
+; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT:    umull2 v4.4s, v0.8h, v2.8h
 ; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    umull2 v5.4s, v1.8h, v3.8h
-; CHECK-NEXT:    umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    stp q2, q4, [x11, #-32]
-; CHECK-NEXT:    stp q3, q5, [x11], #64
+; CHECK-NEXT:    stp q1, q3, [x11, #-32]
+; CHECK-NEXT:    stp q2, q4, [x11], #64
 ; CHECK-NEXT:    b.ne .LBB4_4
 ; CHECK-NEXT:  // %bb.5: // %middle.block
 ; CHECK-NEXT:    cmp x10, x9