[llvm] b8801ba - [AArch64] Common patterns between UMULL and int_aarch64_neon_umull
David Green via llvm-commits
llvm-commits at lists.llvm.org
Sat Feb 19 06:39:03 PST 2022
Author: David Green
Date: 2022-02-19T14:38:57Z
New Revision: b8801ba0503936bd42e6d16e291bf66209323723
URL: https://github.com/llvm/llvm-project/commit/b8801ba0503936bd42e6d16e291bf66209323723
DIFF: https://github.com/llvm/llvm-project/commit/b8801ba0503936bd42e6d16e291bf66209323723.diff
LOG: [AArch64] Common patterns between UMULL and int_aarch64_neon_umull
We have some duplicate patterns between the AArch64ISD::UMULL (/SMULL)
nodes and the int_aarch64_neon_umull (/smull) intrinsics. The two sets
did not replicate all the patterns though, leaving gaps in codegen for
instructions like umlal2. This commons all the patterns by converting
all int_aarch64_neon_umull intrinsics to UMULL nodes and removing the
duplicate patterns for the umull/smull intrinsics, so that all
instructions go through the same tablegen patterns.
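For illustration, the two forms that now share one lowering path look
like this at the IR level (a minimal sketch: the intrinsic is the real
llvm.aarch64.neon.umull, but the value names are invented for the
example):

  declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)

  ; Intrinsic form, previously matched by its own duplicate patterns;
  ; performIntrinsicCombine now rewrites it to an AArch64ISD::UMULL node.
  %m1 = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)

  ; Widening-mul form, which already became an AArch64ISD::UMULL node,
  ; so both forms select through the same tablegen patterns.
  %ea = zext <8 x i8> %a to <8 x i16>
  %eb = zext <8 x i8> %b to <8 x i16>
  %m2 = mul <8 x i16> %ea, %eb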
This improves some of the longer-than-legal mla patterns, helping them
use umlal2 on the high halves instead of ext followed by umlal.
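The mla_i16 test below shows the shape of such a pattern; this sketch
reconstructs the full IR body from the truncated test (the %eb/%m/%r
names are assumed, following the zext/mul/add structure):

  define <16 x i16> @mla_i16(<16 x i8> %a, <16 x i8> %b, <16 x i16> %c) {
  entry:
    ; <16 x i16> is wider than any legal AArch64 vector type, so the
    ; multiply-accumulate is split into a low half and a high half.
    %ea = zext <16 x i8> %a to <16 x i16>
    %eb = zext <16 x i8> %b to <16 x i16>
    %m = mul <16 x i16> %ea, %eb
    %r = add <16 x i16> %m, %c
    ret <16 x i16> %r
  }

With the commoned patterns the high half selects umlal2 directly, where
it previously needed two ext instructions plus a second umlal; see the
updated CHECK lines in the test diff below.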
Differential Revision: https://reviews.llvm.org/D119887
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 757a5b042a02a..d4f9906e687f9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15447,7 +15447,11 @@ static SDValue performIntrinsicCombine(SDNode *N,
return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_smull:
+ return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_umull:
+ return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_pmull:
case Intrinsic::aarch64_neon_sqdmull:
return tryCombineLongOpWithDup(IID, N, DCI, DAG);
@@ -18131,6 +18135,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
case AArch64ISD::UADDV:
return performUADDVCombine(N, DAG);
+ case AArch64ISD::SMULL:
+ case AArch64ISD::UMULL:
+ return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 664f670d741c0..509fd05806211 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5105,10 +5105,10 @@ defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl",
defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw",
BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
- TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+ TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
- TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
-defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>;
+ TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
+defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", AArch64smull>;
defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
int_aarch64_neon_sqadd>;
defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
@@ -5126,10 +5126,10 @@ defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
BinOpFrag<(add node:$LHS, (zanyext node:$RHS))>>;
defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
- TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+ TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
- TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
-defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>;
+ TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
+defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", AArch64umull>;
defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>;
defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
@@ -5164,74 +5164,15 @@ multiclass Neon_mul_acc_widen_patterns<SDPatternOperator opnode, SDPatternOperat
V64:$Rn, V64:$Rm)), dsub)>;
}
-defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_umull,
+defm : Neon_mul_acc_widen_patterns<add, AArch64umull,
UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
-defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_smull,
+defm : Neon_mul_acc_widen_patterns<add, AArch64smull,
SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
-defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_umull,
+defm : Neon_mul_acc_widen_patterns<sub, AArch64umull,
UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
-defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_smull,
+defm : Neon_mul_acc_widen_patterns<sub, AArch64smull,
SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
-// Additional patterns for SMULL and UMULL
-multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
- Instruction INST8B, Instruction INST4H, Instruction INST2S> {
- def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
- (INST8B V64:$Rn, V64:$Rm)>;
- def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
- (INST4H V64:$Rn, V64:$Rm)>;
- def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
- (INST2S V64:$Rn, V64:$Rm)>;
-}
-
-defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
- SMULLv4i16_v4i32, SMULLv2i32_v2i64>;
-defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
- UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
-
-// Patterns for smull2/umull2.
-multiclass Neon_mul_high_patterns<SDPatternOperator opnode,
- Instruction INST8B, Instruction INST4H, Instruction INST2S> {
- def : Pat<(v8i16 (opnode (extract_high_v16i8 V128:$Rn),
- (extract_high_v16i8 V128:$Rm))),
- (INST8B V128:$Rn, V128:$Rm)>;
- def : Pat<(v4i32 (opnode (extract_high_v8i16 V128:$Rn),
- (extract_high_v8i16 V128:$Rm))),
- (INST4H V128:$Rn, V128:$Rm)>;
- def : Pat<(v2i64 (opnode (extract_high_v4i32 V128:$Rn),
- (extract_high_v4i32 V128:$Rm))),
- (INST2S V128:$Rn, V128:$Rm)>;
-}
-
-defm : Neon_mul_high_patterns<AArch64smull, SMULLv16i8_v8i16,
- SMULLv8i16_v4i32, SMULLv4i32_v2i64>;
-defm : Neon_mul_high_patterns<AArch64umull, UMULLv16i8_v8i16,
- UMULLv8i16_v4i32, UMULLv4i32_v2i64>;
-
-// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
-multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
- Instruction INST8B, Instruction INST4H, Instruction INST2S> {
- def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
- (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>;
- def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
- (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>;
- def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
- (INST2S V128:$Rd, V64:$Rn, V64:$Rm)>;
-}
-
-defm : Neon_mulacc_widen_patterns<
- TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
- SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
-defm : Neon_mulacc_widen_patterns<
- TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
- UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
-defm : Neon_mulacc_widen_patterns<
- TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
- SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
-defm : Neon_mulacc_widen_patterns<
- TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
- UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
-
// Patterns for 64-bit pmull
def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
(PMULLv1i64 V64:$Rn, V64:$Rm)>;
@@ -6404,11 +6345,10 @@ defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>;
defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
- TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+ TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
- TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
-defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull",
- int_aarch64_neon_smull>;
+ TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
+defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", AArch64smull>;
defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
int_aarch64_neon_sqadd>;
defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
@@ -6419,11 +6359,10 @@ defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",
int_aarch64_neon_sqrdmlsh>;
defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>;
defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
- TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+ TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl",
- TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
-defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull",
- int_aarch64_neon_umull>;
+ TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
+defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", AArch64umull>;
// A scalar sqdmull with the second operand being a vector lane can be
// handled directly with the indexed instruction encoding.
diff --git a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
index b591438b7ceef..1c52b359156f6 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
@@ -71,12 +71,10 @@ entry:
define <16 x i16> @mla_i16(<16 x i8> %a, <16 x i8> %b, <16 x i16> %c) {
; CHECK-LABEL: mla_i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: umlal2 v3.8h, v0.16b, v1.16b
; CHECK-NEXT: umlal v2.8h, v0.8b, v1.8b
-; CHECK-NEXT: umlal v3.8h, v4.8b, v5.8b
-; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
entry:
%ea = zext <16 x i8> %a to <16 x i16>
@@ -91,18 +89,14 @@ define <16 x i32> @mla_i32(<16 x i8> %a, <16 x i8> %b, <16 x i32> %c) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v6.8h, v0.8b, #0
; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT: ushll v7.8h, v1.8b, #0
-; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-NEXT: ext v16.16b, v6.16b, v6.16b, #8
-; CHECK-NEXT: ext v17.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: ext v18.16b, v7.16b, v7.16b, #8
-; CHECK-NEXT: ext v19.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: umlal v4.4s, v0.4h, v1.4h
-; CHECK-NEXT: umlal v2.4s, v6.4h, v7.4h
-; CHECK-NEXT: umlal v3.4s, v16.4h, v18.4h
-; CHECK-NEXT: umlal v5.4s, v17.4h, v19.4h
-; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ushll2 v7.8h, v1.16b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: umlal2 v5.4s, v0.8h, v7.8h
+; CHECK-NEXT: umlal2 v3.4s, v6.8h, v1.8h
+; CHECK-NEXT: umlal v2.4s, v6.4h, v1.4h
+; CHECK-NEXT: umlal v4.4s, v0.4h, v7.4h
; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: mov v2.16b, v4.16b
; CHECK-NEXT: mov v3.16b, v5.16b
; CHECK-NEXT: ret
@@ -117,43 +111,35 @@ entry:
define <16 x i64> @mla_i64(<16 x i8> %a, <16 x i8> %b, <16 x i64> %c) {
; CHECK-LABEL: mla_i64:
; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v17.16b, v7.16b
+; CHECK-NEXT: mov v16.16b, v6.16b
+; CHECK-NEXT: ldp q6, q7, [sp]
; CHECK-NEXT: ushll v18.8h, v0.8b, #0
; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT: ushll v25.8h, v1.8b, #0
+; CHECK-NEXT: ushll v21.8h, v1.8b, #0
; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-NEXT: ushll v19.4s, v18.4h, #0
; CHECK-NEXT: ushll v20.4s, v0.4h, #0
; CHECK-NEXT: ushll2 v18.4s, v18.8h, #0
-; CHECK-NEXT: ushll v26.4s, v25.4h, #0
-; CHECK-NEXT: ushll v27.4s, v1.4h, #0
-; CHECK-NEXT: ushll2 v25.4s, v25.8h, #0
-; CHECK-NEXT: mov v16.16b, v7.16b
-; CHECK-NEXT: mov v17.16b, v6.16b
-; CHECK-NEXT: ldp q6, q7, [sp]
+; CHECK-NEXT: ushll v22.4s, v21.4h, #0
+; CHECK-NEXT: ushll v23.4s, v1.4h, #0
+; CHECK-NEXT: ushll2 v21.4s, v21.8h, #0
; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT: ext v21.16b, v19.16b, v19.16b, #8
-; CHECK-NEXT: ext v22.16b, v20.16b, v20.16b, #8
-; CHECK-NEXT: ext v23.16b, v18.16b, v18.16b, #8
-; CHECK-NEXT: ext v28.16b, v26.16b, v26.16b, #8
-; CHECK-NEXT: ext v29.16b, v27.16b, v27.16b, #8
-; CHECK-NEXT: ext v30.16b, v25.16b, v25.16b, #8
-; CHECK-NEXT: ext v24.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: ext v31.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: umlal v4.2d, v18.2s, v25.2s
-; CHECK-NEXT: umlal v17.2d, v20.2s, v27.2s
-; CHECK-NEXT: umlal v2.2d, v19.2s, v26.2s
-; CHECK-NEXT: umlal v3.2d, v21.2s, v28.2s
-; CHECK-NEXT: umlal v5.2d, v23.2s, v30.2s
-; CHECK-NEXT: umlal v16.2d, v22.2s, v29.2s
+; CHECK-NEXT: umlal2 v5.2d, v18.4s, v21.4s
+; CHECK-NEXT: umlal2 v17.2d, v20.4s, v23.4s
+; CHECK-NEXT: umlal2 v3.2d, v19.4s, v22.4s
+; CHECK-NEXT: umlal v2.2d, v19.2s, v22.2s
+; CHECK-NEXT: umlal v4.2d, v18.2s, v21.2s
+; CHECK-NEXT: umlal v16.2d, v20.2s, v23.2s
+; CHECK-NEXT: umlal2 v7.2d, v0.4s, v1.4s
; CHECK-NEXT: umlal v6.2d, v0.2s, v1.2s
-; CHECK-NEXT: umlal v7.2d, v24.2s, v31.2s
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: mov v1.16b, v3.16b
; CHECK-NEXT: mov v2.16b, v4.16b
; CHECK-NEXT: mov v3.16b, v5.16b
-; CHECK-NEXT: mov v4.16b, v17.16b
-; CHECK-NEXT: mov v5.16b, v16.16b
+; CHECK-NEXT: mov v4.16b, v16.16b
+; CHECK-NEXT: mov v5.16b, v17.16b
; CHECK-NEXT: ret
entry:
%ea = zext <16 x i8> %a to <16 x i64>