[llvm] [AArch64] Lower aarch64_neon_saddlv via SADDLV nodes. (PR #103307)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 13 09:36:32 PDT 2024
@davemgreen created https://github.com/llvm/llvm-project/pull/103307
This mirrors what GISel already does, extending the existing lowering of aarch64_neon_uaddlv to UADDLV nodes so that aarch64_neon_saddlv is lowered to SADDLV in the same way. This allows us to remove some tablegen patterns, and gives slightly nicer codegen in places, as the nodes correctly represent the result living in a vector register.
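As a concrete illustration, taken directly from the arm64-neon-across.ll changes in this patch, the SDAG output for the signed reduction now matches GISel, replacing the sign-extending lane move with a plain fmov:

  define i16 @test_vaddlv_s8(<8 x i8> %a) {
  entry:
    %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a)
    %0 = trunc i32 %saddlvv.i to i16
    ret i16 %0
  }
  ; before: saddlv h0, v0.8b
  ;         smov   w0, v0.h[0]
  ; after:  saddlv h0, v0.8b
  ;         fmov   w0, s0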
From 99ee3744d5fd7612ffb5127337a5232695c7f801 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Tue, 13 Aug 2024 17:10:32 +0100
Subject: [PATCH] [AArch64] Lower aarch64_neon_saddlv via SADDLV nodes.
This mirrors what GISel already does, extending the existing lowering of
aarch64_neon_uaddlv to UADDLV nodes so that aarch64_neon_saddlv is lowered to
SADDLV in the same way. This allows us to remove some tablegen patterns, and
gives slightly nicer codegen in places, as the nodes correctly represent the
result living in a vector register.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 26 +++---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 93 ++-----------------
.../aarch64-neon-vector-insert-uaddlv.ll | 26 ++----
.../test/CodeGen/AArch64/arm64-neon-across.ll | 32 ++-----
4 files changed, 41 insertions(+), 136 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 314e7134dcd01a..cd8ab1c135baf2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6089,20 +6089,24 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
DAG.getVectorIdxConstant(0, dl));
}
+ case Intrinsic::aarch64_neon_saddlv:
case Intrinsic::aarch64_neon_uaddlv: {
EVT OpVT = Op.getOperand(1).getValueType();
EVT ResVT = Op.getValueType();
- if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
- OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
- // In order to avoid insert_subvector, used v4i32 than v2i32.
- SDValue UADDLV =
- DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
- SDValue EXTRACT_VEC_ELT =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
- DAG.getConstant(0, dl, MVT::i64));
- return EXTRACT_VEC_ELT;
- }
- return SDValue();
+ assert(
+ ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
+ OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
+ (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
+ "Unexpected aarch64_neon_u/saddlv type");
+ // Use the wider v4i32/v2i64 result types to avoid insert_subvector.
+ SDValue ADDLV = DAG.getNode(
+ IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
+ : AArch64ISD::SADDLV,
+ dl, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
+ SDValue EXTRACT_VEC_ELT = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
+ ADDLV, DAG.getConstant(0, dl, MVT::i64));
+ return EXTRACT_VEC_ELT;
}
case Intrinsic::experimental_cttz_elts: {
SDValue CttzOp = Op.getOperand(1);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index bb05dc85d29a1f..f9041bd3541a8c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -7174,17 +7174,6 @@ multiclass SIMDAcrossLaneLongPairIntrinsicGISel<string Opc, SDPatternOperator ad
defm : SIMDAcrossLaneLongPairIntrinsicGISel<"UADDLV", AArch64uaddlp>;
defm : SIMDAcrossLaneLongPairIntrinsicGISel<"SADDLV", AArch64saddlp>;
-// Patterns for uaddlv(uaddlp(x)) ==> uaddlv
-def : Pat<(i64 (int_aarch64_neon_uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
- (i64 (EXTRACT_SUBREG
- (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub)),
- dsub))>;
-
-def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))),
- (i32 (EXTRACT_SUBREG
- (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
- ssub))>;
-
def : Pat<(v2i64 (AArch64uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
(v2i64 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub))>;
@@ -7405,82 +7394,12 @@ defm : SIMDAcrossLanesVecReductionIntrinsic<"SMAXV", vecreduce_smax>;
def : Pat<(i32 (vecreduce_smax (v2i32 V64:$Rn))),
(i32 (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub))>;
-multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> {
- def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
- (i32 (SMOVvi16to32
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
- (i64 0)))>;
-def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
- (i32 (SMOVvi16to32
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
- (i64 0)))>;
-
-def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
- (i32 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
- ssub))>;
-def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
- (i32 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
- ssub))>;
-
-def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
- (i64 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
- dsub))>;
-}
-
-multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
- Intrinsic intOp> {
- def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
- (i32 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
- ssub))>;
-def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
- (i32 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
- ssub))>;
-
-def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
- (i32 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
- ssub))>;
-def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
- (i32 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
- ssub))>;
-
-def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
- (i64 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
- dsub))>;
-}
-
-defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>;
-defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>;
-
-// The vaddlv_s32 intrinsic gets mapped to SADDLP.
-def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))),
- (i64 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (SADDLPv2i32_v1i64 V64:$Rn), dsub),
- dsub))>;
-// The vaddlv_u32 intrinsic gets mapped to UADDLP.
-def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))),
- (i64 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (UADDLPv2i32_v1i64 V64:$Rn), dsub),
- dsub))>;
+// SADDLV of v2i32 gets mapped to SADDLP.
+def : Pat<(v2i64 (AArch64saddlv (v2i32 V64:$Rn))),
+ (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (SADDLPv2i32_v1i64 V64:$Rn), dsub))>;
+// UADDLV of v2i32 gets mapped to UADDLP.
+def : Pat<(v2i64 (AArch64uaddlv (v2i32 V64:$Rn))),
+ (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (UADDLPv2i32_v1i64 V64:$Rn), dsub))>;
//------------------------------------------------------------------------------
// AdvSIMD modified immediate instructions
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index 75a549e348d472..2e165179381820 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -146,11 +146,11 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
; CHECK-LABEL: insert_vec_v6i64_uaddlv_from_v4i32:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
-; CHECK-NEXT: movi.2d v2, #0000000000000000
; CHECK-NEXT: uaddlv.4s d1, v0
-; CHECK-NEXT: str d2, [x0, #16]
; CHECK-NEXT: mov.d v0[0], v1[0]
+; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: ucvtf.2d v0, v0
+; CHECK-NEXT: str d1, [x0, #16]
; CHECK-NEXT: fcvtn v0.2s, v0.2d
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
@@ -491,9 +491,8 @@ define void @store_saddlv_v8i8(ptr %H, <8 x i8> %sum_h, i32 %idx) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: saddlv.8b h0, v0
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: sbfiz x9, x1, #3, #32
-; CHECK-NEXT: smov.h w8, v0[0]
-; CHECK-NEXT: str w8, [x0, x9]
+; CHECK-NEXT: sbfiz x8, x1, #3, #32
+; CHECK-NEXT: str s0, [x0, x8]
; CHECK-NEXT: ret
entry:
%vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %sum_h)
@@ -508,9 +507,8 @@ define void @store_saddlv_v16i8(ptr %H, <16 x i8> %sum_h, i32 %idx) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: saddlv.16b h0, v0
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: sbfiz x9, x1, #3, #32
-; CHECK-NEXT: smov.h w8, v0[0]
-; CHECK-NEXT: str w8, [x0, x9]
+; CHECK-NEXT: sbfiz x8, x1, #3, #32
+; CHECK-NEXT: str s0, [x0, x8]
; CHECK-NEXT: ret
entry:
%vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %sum_h)
@@ -526,8 +524,7 @@ define void @store_saddlv_v4i16(ptr %H, <4 x i16> %sum_h, i32 %idx) {
; CHECK-NEXT: saddlv.4h s0, v0
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
; CHECK-NEXT: sbfiz x8, x1, #3, #32
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: str w9, [x0, x8]
+; CHECK-NEXT: str s0, [x0, x8]
; CHECK-NEXT: ret
entry:
%vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> %sum_h)
@@ -543,8 +540,7 @@ define void @store_saddlv_v8i16(ptr %H, <8 x i16> %sum_h, i32 %idx) {
; CHECK-NEXT: saddlv.8h s0, v0
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
; CHECK-NEXT: sbfiz x8, x1, #3, #32
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: str w9, [x0, x8]
+; CHECK-NEXT: str s0, [x0, x8]
; CHECK-NEXT: ret
entry:
%vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> %sum_h)
@@ -558,8 +554,7 @@ define void @store_saddlv_v2i32(ptr %H, <2 x i32> %sum_h, i32 %idx) {
; CHECK-LABEL: store_saddlv_v2i32:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: saddlp.1d v0, v0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: str x8, [x0, w1, sxtw #3]
+; CHECK-NEXT: str d0, [x0, w1, sxtw #3]
; CHECK-NEXT: ret
entry:
%vaddlvq_s32.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %sum_h)
@@ -573,8 +568,7 @@ define void @store_saddlv_v4i32(ptr %H, <4 x i32> %sum_h, i32 %idx) {
; CHECK-LABEL: store_saddlv_v4i32:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: saddlv.4s d0, v0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: str x8, [x0, w1, sxtw #3]
+; CHECK-NEXT: str d0, [x0, w1, sxtw #3]
; CHECK-NEXT: ret
entry:
%vaddlvq_s32.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> %sum_h)
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-across.ll b/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
index 2899197abb2f44..84d009565ecb59 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
@@ -43,17 +43,11 @@ declare i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16>)
declare i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8>)
define i16 @test_vaddlv_s8(<8 x i8> %a) {
-; CHECK-SD-LABEL: test_vaddlv_s8:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: saddlv h0, v0.8b
-; CHECK-SD-NEXT: smov w0, v0.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vaddlv_s8:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: saddlv h0, v0.8b
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vaddlv_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: saddlv h0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
entry:
%saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a)
%0 = trunc i32 %saddlvv.i to i16
@@ -95,17 +89,11 @@ entry:
}
define i16 @test_vaddlvq_s8(<16 x i8> %a) {
-; CHECK-SD-LABEL: test_vaddlvq_s8:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: saddlv h0, v0.16b
-; CHECK-SD-NEXT: smov w0, v0.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vaddlvq_s8:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: saddlv h0, v0.16b
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vaddlvq_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: saddlv h0, v0.16b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
entry:
%saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a)
%0 = trunc i32 %saddlvv.i to i16
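One corner case visible in the updated tests: there is no across-lanes SADDLV/UADDLV encoding for a single v2i32 pair, so the new tablegen patterns above select the pairwise SADDLP/UADDLP instead. A minimal sketch of that case (the saddlp is taken from the store_saddlv_v2i32 test; the trailing fmov for an i64 return is my assumption):

  define i64 @saddlv_v2i32(<2 x i32> %v) {
  entry:
    %r = tail call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %v)
    ret i64 %r
  }
  ; expected: saddlp v0.1d, v0.2s
  ;           fmov   x0, d0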