[PATCH] D159267: [AArch64] Remove copy instruction between uaddlv and dup

JinGu Kang via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 8 02:45:57 PDT 2023


jaykang10 added inline comments.


================
Comment at: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp:5322
+    EVT ResVT = Op.getValueType();
+    if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8)) {
+      // In order to avoid insert_subvector, used v4i32 than v2i32.
----------------
efriedma wrote:
> jaykang10 wrote:
> > jaykang10 wrote:
> > > efriedma wrote:
> > > > This could be extended to i16 uaddlv as well, but we can leave that for a followup, I guess.
> > > Yep, it seems there is no pattern for dup(extract_element) --> duplane with v8i16.
> > > Let's handle that type in a separate patch.
> > I have tried the patch below.
> > ```
> > diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
> > index 2bb8e4324306..87c836905659 100644
> > --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
> > +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
> > @@ -5327,7 +5327,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
> >    case Intrinsic::aarch64_neon_uaddlv: {
> >      EVT OpVT = Op.getOperand(1).getValueType();
> >      EVT ResVT = Op.getValueType();
> > -    if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8)) {
> > +    if (ResVT == MVT::i32 &&
> > +        (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 || OpVT == MVT::v8i16)) {
> >        // In order to avoid insert_subvector, used v4i32 than v2i32.
> >        SDValue UADDLV =
> >            DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
> > diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
> > index 4a1f46f2576a..658b22d312fb 100644
> > --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
> > +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
> > @@ -6067,6 +6067,8 @@ defm : DUPWithTruncPats<v16i8,  v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
> >  defm : DUPWithTruncPats<v16i8,  v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
> >  defm : DUPWithTruncPats<v8i16,  v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
> >  
> > +defm : DUPWithTruncPats<v4i32,  v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
> > +
> >  multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
> >                                 SDNodeXForm IdxXFORM> {
> >    def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
> > @@ -6462,12 +6464,21 @@ def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op)))
> >              (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
> >              ssub))>;
> >  
> > +def : Pat<(i32 (vector_extract
> > +            (v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))), (i64 0))),
> > +          (i32 (EXTRACT_SUBREG
> > +            (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
> > +            ssub))>;
> > +
> >  def : Pat<(v4i32 (AArch64uaddlv (v8i8 V64:$Rn))),
> >            (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$Rn), hsub))>;
> >  
> >  def : Pat<(v4i32 (AArch64uaddlv (v16i8 V128:$Rn))),
> >            (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$Rn), hsub))>;
> >  
> > +def : Pat<(v4i32 (AArch64uaddlv (v8i16 V128:$Rn))),
> > +          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$Rn), ssub))>;
> > +
> >  // Patterns for across-vector intrinsics, that have a node equivalent, that
> >  // returns a vector (with only the low lane defined) instead of a scalar.
> >  // In effect, opNode is the same as (scalar_to_vector (IntNode)).
> > diff --git a/llvm/test/CodeGen/AArch64/neon-addlv.ll b/llvm/test/CodeGen/AArch64/neon-addlv.ll
> > index 0f5a19c7a0f3..0769adce87d3 100644
> > --- a/llvm/test/CodeGen/AArch64/neon-addlv.ll
> > +++ b/llvm/test/CodeGen/AArch64/neon-addlv.ll
> > @@ -178,8 +178,8 @@ entry:
> >    ret i32 %0
> >  }
> > 
> > -define dso_local <8 x i8> @bar(<8 x i8> noundef %a) local_unnamed_addr #0 {
> > -; CHECK-LABEL: bar:
> > +define dso_local <8 x i8> @uaddlv_v8i8(<8 x i8> %a) {
> > +; CHECK-LABEL: uaddlv_v8i8:
> >  ; CHECK:       // %bb.0: // %entry
> >  ; CHECK-NEXT:    uaddlv h0, v0.8b
> >  ; CHECK-NEXT:    dup v0.8h, v0.h[0]
> > @@ -194,4 +194,22 @@ entry:
> >    ret <8 x i8> %vrshrn_n2
> >  }
> >  
> > +define dso_local <8 x i16> @uaddlv_v8i16(<8 x i16> %a) {
> > +; CHECK-LABEL: uaddlv_v8i16:
> > +; CHECK:       // %bb.0: // %entry
> > +; CHECK-NEXT:    uaddlv s0, v0.8h
> > +; CHECK-NEXT:    dup v1.8h, v0.h[0]
> > +; CHECK-NEXT:    rshrn v0.4h, v1.4s, #3
> > +; CHECK-NEXT:    rshrn2 v0.8h, v1.4s, #3
> > +; CHECK-NEXT:    ret
> > +entry:
> > +  %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a)
> > +  %vecinit.i = insertelement <8 x i32> undef, i32 %vaddlv.i, i64 0
> > +  %vecinit7.i = shufflevector <8 x i32> %vecinit.i, <8 x i32> poison, <8 x i32> zeroinitializer
> > +  %vrshrn_n2 = tail call <8 x i16> @llvm.aarch64.neon.rshrn.v8i16(<8 x i32> %vecinit7.i, i32 3)
> > +  ret <8 x i16> %vrshrn_n2
> > +}
> > +
> >  declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
> > +declare <8 x i16> @llvm.aarch64.neon.rshrn.v8i16(<8 x i32>, i32)
> > +declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)
> > diff --git a/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll b/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
> > index 8b48635b6694..e6b253b258f1 100644
> > --- a/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
> > +++ b/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
> > @@ -17,7 +17,8 @@ define i32 @uaddlv_uaddlp_v8i16(<8 x i16> %0) {
> >  define i16 @uaddlv_uaddlp_v16i8(<16 x i8> %0) {
> >  ; CHECK-LABEL: uaddlv_uaddlp_v16i8:
> >  ; CHECK:       // %bb.0:
> > -; CHECK-NEXT:    uaddlv h0, v0.16b
> > +; CHECK-NEXT:    uaddlp v0.8h, v0.16b
> > +; CHECK-NEXT:    uaddlv s0, v0.8h
> >  ; CHECK-NEXT:    fmov w0, s0
> >  ; CHECK-NEXT:    ret
> >    %2 = tail call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %0)
> > ```
> > As you can see, there is a regression on `uaddlv_uaddlp_v16i8`, even though I added a pattern to cover it, because the first pattern below is matched before the second one.
> > ```
> > first pattern
> > +defm : DUPWithTruncPats<v4i32,  v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
> > 
> > second pattern
> > +def : Pat<(i32 (vector_extract
> > +            (v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))), (i64 0))),
> > +          (i32 (EXTRACT_SUBREG
> > +            (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
> > +            ssub))>;
> > +
> > ```
> > I think it could be OK to keep the uaddlv intrinsic rather than the uaddlv sdnode for the v8i16 type...
> Not sure I understand the issue here; more specific patterns (more nodes in the pattern definition) should win, no matter the order in the file, so you should be able to fix this with the right patterns.
I could have made a mistake. Let me check it again.
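
For reference, one way to act on that suggestion is sketched below: the second pattern from the message above, unchanged, but placed under an `AddedComplexity` modifier so that it is tried before any smaller competing pattern. This is only an illustrative sketch and is not part of D159267; the value 10 is arbitrary. SelectionDAG instruction selection scores patterns by their complexity (roughly the number of nodes, plus any `AddedComplexity`) rather than by their order in the .td file, so a sufficiently specific pattern should win even without this.
```
// Sketch only (not from the patch under review): raise the priority of the
// uaddlp+uaddlv pattern over the generic v8i16 AArch64uaddlv pattern, so the
// single UADDLVv16i8v form is intended to keep being selected for
// uaddlv_uaddlp_v16i8. The AddedComplexity value is illustrative.
let AddedComplexity = 10 in
def : Pat<(i32 (vector_extract
            (v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))), (i64 0))),
          (i32 (EXTRACT_SUBREG
            (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
            ssub))>;
```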


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D159267/new/

https://reviews.llvm.org/D159267


