[PATCH] D159267: [AArch64] Remove copy instruction between uaddlv and dup
JinGu Kang via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 5 08:50:32 PDT 2023
jaykang10 added inline comments.
================
Comment at: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp:5322
+ EVT ResVT = Op.getValueType();
+ if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8)) {
+ // In order to avoid insert_subvector, use v4i32 rather than v2i32.
----------------
jaykang10 wrote:
> efriedma wrote:
> > This could be extended to i16 uaddlv as well, but we can leave that for a followup, I guess.
> Yep, it seems there is no pattern for dup(extract_element) --> duplane with v8i16.
> Let's handle that type in a separate patch.
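To make the missing combine concrete: it is a `dup(trunc(extract_element))` of the `uaddlv` result being selected as a single DUP-by-lane instruction. A minimal IR sketch of that shape (my reconstruction, and the function name is mine; it mirrors the `uaddlv_v8i16` test added in the patch below):
```
; A broadcast of the truncated uaddlv result; selecting this well
; needs a dup(trunc(extract_element)) --> DUPv8i16lane pattern.
define <8 x i16> @dup_uaddlv_v8i16(<8 x i16> %a) {
  %s = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a)
  %t = trunc i32 %s to i16
  %v = insertelement <8 x i16> poison, i16 %t, i64 0
  %d = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> zeroinitializer
  ret <8 x i16> %d
}

declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)
```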
I have tried the patch below.
```
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2bb8e4324306..87c836905659 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5327,7 +5327,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::aarch64_neon_uaddlv: {
EVT OpVT = Op.getOperand(1).getValueType();
EVT ResVT = Op.getValueType();
- if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8)) {
+ if (ResVT == MVT::i32 &&
+ (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 || OpVT == MVT::v8i16)) {
// In order to avoid insert_subvector, use v4i32 rather than v2i32.
SDValue UADDLV =
DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 4a1f46f2576a..658b22d312fb 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6067,6 +6067,8 @@ defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
+defm : DUPWithTruncPats<v4i32, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
+
multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
SDNodeXForm IdxXFORM> {
def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
@@ -6462,12 +6464,21 @@ def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op)))
(v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
ssub))>;
+def : Pat<(i32 (vector_extract
+ (v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))), (i64 0))),
+ (i32 (EXTRACT_SUBREG
+ (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
+ ssub))>;
+
def : Pat<(v4i32 (AArch64uaddlv (v8i8 V64:$Rn))),
(v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$Rn), hsub))>;
def : Pat<(v4i32 (AArch64uaddlv (v16i8 V128:$Rn))),
(v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$Rn), hsub))>;
+def : Pat<(v4i32 (AArch64uaddlv (v8i16 V128:$Rn))),
+ (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$Rn), ssub))>;
+
// Patterns for across-vector intrinsics, that have a node equivalent, that
// returns a vector (with only the low lane defined) instead of a scalar.
// In effect, opNode is the same as (scalar_to_vector (IntNode)).
diff --git a/llvm/test/CodeGen/AArch64/neon-addlv.ll b/llvm/test/CodeGen/AArch64/neon-addlv.ll
index 0f5a19c7a0f3..0769adce87d3 100644
--- a/llvm/test/CodeGen/AArch64/neon-addlv.ll
+++ b/llvm/test/CodeGen/AArch64/neon-addlv.ll
@@ -178,8 +178,8 @@ entry:
ret i32 %0
}
-define dso_local <8 x i8> @bar(<8 x i8> noundef %a) local_unnamed_addr #0 {
-; CHECK-LABEL: bar:
+define dso_local <8 x i8> @uaddlv_v8i8(<8 x i8> %a) {
+; CHECK-LABEL: uaddlv_v8i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uaddlv h0, v0.8b
; CHECK-NEXT: dup v0.8h, v0.h[0]
@@ -194,4 +194,22 @@ entry:
ret <8 x i8> %vrshrn_n2
}
+define dso_local <8 x i16> @uaddlv_v8i16(<8 x i16> %a) {
+; CHECK-LABEL: uaddlv_v8i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uaddlv s0, v0.8h
+; CHECK-NEXT: dup v1.8h, v0.h[0]
+; CHECK-NEXT: rshrn v0.4h, v1.4s, #3
+; CHECK-NEXT: rshrn2 v0.8h, v1.4s, #3
+; CHECK-NEXT: ret
+entry:
+ %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a)
+ %vecinit.i = insertelement <8 x i32> undef, i32 %vaddlv.i, i64 0
+ %vecinit7.i = shufflevector <8 x i32> %vecinit.i, <8 x i32> poison, <8 x i32> zeroinitializer
+ %vrshrn_n2 = tail call <8 x i16> @llvm.aarch64.neon.rshrn.v8i16(<8 x i32> %vecinit7.i, i32 3)
+ ret <8 x i16> %vrshrn_n2
+}
+
declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
+declare <8 x i16> @llvm.aarch64.neon.rshrn.v8i16(<8 x i32>, i32)
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll b/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
index 8b48635b6694..e6b253b258f1 100644
--- a/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
+++ b/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
@@ -17,7 +17,8 @@ define i32 @uaddlv_uaddlp_v8i16(<8 x i16> %0) {
define i16 @uaddlv_uaddlp_v16i8(<16 x i8> %0) {
; CHECK-LABEL: uaddlv_uaddlp_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: uaddlv h0, v0.16b
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
+; CHECK-NEXT: uaddlv s0, v0.8h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%2 = tail call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %0)
```
As you can see, there is a regression on `uaddlv_uaddlp_v16i8`. Even though I added the second pattern below to cover it, the regression remains because the first pattern is matched before the second one.
```
// first pattern
defm : DUPWithTruncPats<v4i32, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;

// second pattern
def : Pat<(i32 (vector_extract
  (v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))), (i64 0))),
  (i32 (EXTRACT_SUBREG
  (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
  ssub))>;
```
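For reference, a reduced form of the regressed test (reconstructed from the `uaddlv_uaddlp_v16i8` diff above; the exact trunc/return shape is my assumption):
```
; This used to fold into a single uaddlv h0, v0.16b; with the experimental
; patch it selects uaddlp v0.8h, v0.16b + uaddlv s0, v0.8h instead, because
; the first pattern above matches before the combined second one.
define i16 @uaddlv_uaddlp_v16i8(<16 x i8> %0) {
  %2 = tail call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %0)
  %3 = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %2)
  %4 = trunc i32 %3 to i16
  ret i16 %4
}

declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>)
declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)
```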
I think it could be OK to keep the uaddlv intrinsic for the v8i16 type, rather than lowering it to the uaddlv SDNode...
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D159267/new/
https://reviews.llvm.org/D159267