[PATCH] D159267: [AArch64] Remove copy instruction between uaddlv and dup

Eli Friedman via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 7 10:45:22 PDT 2023


efriedma added a comment.

Re: the big-endian issue I mentioned on the other ticket: it looks like it isn't a regression, but my concern is the code generated for ctpop_i32 on a big-endian target. uaddlv of v16i8 produces its result in h0 (element 0 of a v8i16), but we then access it as s0 (element 0 of a v4i32) without a bitcast, so I think the bits end up in the wrong place?
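A quick way to poke at this (the function name, test name, and flags here are just an assumption for illustration, not copied from the tree): compile something like the following with `llc -mtriple=aarch64_be -o -` and compare the generated sequence against the little-endian output.

```
; Hypothetical repro sketch for the big-endian concern above: i32 ctpop
; lowers to cnt v0.8b + uaddlv, so the "h0 result read back as s0" question
; shows up in this function.
define i32 @ctpop_i32(i32 %x) {
  %cnt = call i32 @llvm.ctpop.i32(i32 %x)
  ret i32 %cnt
}

declare i32 @llvm.ctpop.i32(i32)
```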



================
Comment at: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp:5322
+    EVT ResVT = Op.getValueType();
+    if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8)) {
+      // In order to avoid insert_subvector, used v4i32 than v2i32.
----------------
jaykang10 wrote:
> jaykang10 wrote:
> > efriedma wrote:
> > > This could be extended to i16 uaddlv as well, but we can leave that for a followup, I guess.
> > Yep, it seems there is no pattern for dup(extract_element) --> duplane with v8i16.
> > Let's handle that type in another patch.
> I have tried the patch below.
> ```
> diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
> index 2bb8e4324306..87c836905659 100644
> --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
> +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
> @@ -5327,7 +5327,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
>    case Intrinsic::aarch64_neon_uaddlv: {
>      EVT OpVT = Op.getOperand(1).getValueType();
>      EVT ResVT = Op.getValueType();
> -    if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8)) {
> +    if (ResVT == MVT::i32 &&
> +        (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 || OpVT == MVT::v8i16)) {
>        // In order to avoid insert_subvector, used v4i32 than v2i32.
>        SDValue UADDLV =
>            DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
> diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
> index 4a1f46f2576a..658b22d312fb 100644
> --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
> +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
> @@ -6067,6 +6067,8 @@ defm : DUPWithTruncPats<v16i8,  v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
>  defm : DUPWithTruncPats<v16i8,  v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
>  defm : DUPWithTruncPats<v8i16,  v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
>  
> +defm : DUPWithTruncPats<v4i32,  v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
> +
>  multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
>                                 SDNodeXForm IdxXFORM> {
>    def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
> @@ -6462,12 +6464,21 @@ def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op)))
>              (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
>              ssub))>;
>  
> +def : Pat<(i32 (vector_extract
> +            (v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))), (i64 0))),
> +          (i32 (EXTRACT_SUBREG
> +            (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
> +            ssub))>;
> +
>  def : Pat<(v4i32 (AArch64uaddlv (v8i8 V64:$Rn))),
>            (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$Rn), hsub))>;
>  
>  def : Pat<(v4i32 (AArch64uaddlv (v16i8 V128:$Rn))),
>            (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$Rn), hsub))>;
>  
> +def : Pat<(v4i32 (AArch64uaddlv (v8i16 V128:$Rn))),
> +          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$Rn), ssub))>;
> +
>  // Patterns for across-vector intrinsics, that have a node equivalent, that
>  // returns a vector (with only the low lane defined) instead of a scalar.
>  // In effect, opNode is the same as (scalar_to_vector (IntNode)).
> diff --git a/llvm/test/CodeGen/AArch64/neon-addlv.ll b/llvm/test/CodeGen/AArch64/neon-addlv.ll
> index 0f5a19c7a0f3..0769adce87d3 100644
> --- a/llvm/test/CodeGen/AArch64/neon-addlv.ll
> +++ b/llvm/test/CodeGen/AArch64/neon-addlv.ll
> @@ -178,8 +178,8 @@ entry:
>    ret i32 %0
>  }
> 
> -define dso_local <8 x i8> @bar(<8 x i8> noundef %a) local_unnamed_addr #0 {
> -; CHECK-LABEL: bar:
> +define dso_local <8 x i8> @uaddlv_v8i8(<8 x i8> %a) {
> +; CHECK-LABEL: uaddlv_v8i8:
>  ; CHECK:       // %bb.0: // %entry
>  ; CHECK-NEXT:    uaddlv h0, v0.8b
>  ; CHECK-NEXT:    dup v0.8h, v0.h[0]
> @@ -194,4 +194,22 @@ entry:
>    ret <8 x i8> %vrshrn_n2
>  }
>  
> +define dso_local <8 x i16> @uaddlv_v8i16(<8 x i16> %a) {
> +; CHECK-LABEL: uaddlv_v8i16:
> +; CHECK:       // %bb.0: // %entry
> +; CHECK-NEXT:    uaddlv s0, v0.8h
> +; CHECK-NEXT:    dup v1.8h, v0.h[0]
> +; CHECK-NEXT:    rshrn v0.4h, v1.4s, #3
> +; CHECK-NEXT:    rshrn2 v0.8h, v1.4s, #3
> +; CHECK-NEXT:    ret
> +entry:
> +  %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a)
> +  %vecinit.i = insertelement <8 x i32> undef, i32 %vaddlv.i, i64 0
> +  %vecinit7.i = shufflevector <8 x i32> %vecinit.i, <8 x i32> poison, <8 x i32> zeroinitializer
> +  %vrshrn_n2 = tail call <8 x i16> @llvm.aarch64.neon.rshrn.v8i16(<8 x i32> %vecinit7.i, i32 3)
> +  ret <8 x i16> %vrshrn_n2
> +}
> +
>  declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
> +declare <8 x i16> @llvm.aarch64.neon.rshrn.v8i16(<8 x i32>, i32)
> +declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)
> diff --git a/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll b/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
> index 8b48635b6694..e6b253b258f1 100644
> --- a/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
> +++ b/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll
> @@ -17,7 +17,8 @@ define i32 @uaddlv_uaddlp_v8i16(<8 x i16> %0) {
>  define i16 @uaddlv_uaddlp_v16i8(<16 x i8> %0) {
>  ; CHECK-LABEL: uaddlv_uaddlp_v16i8:
>  ; CHECK:       // %bb.0:
> -; CHECK-NEXT:    uaddlv h0, v0.16b
> +; CHECK-NEXT:    uaddlp v0.8h, v0.16b
> +; CHECK-NEXT:    uaddlv s0, v0.8h
>  ; CHECK-NEXT:    fmov w0, s0
>  ; CHECK-NEXT:    ret
>    %2 = tail call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %0)
> ```
> As you can see, there is a regression on `uaddlv_uaddlp_v16i8` even though I added a pattern to cover it, because the first pattern below is matched before the second one.
> ```
> first pattern
> +defm : DUPWithTruncPats<v4i32,  v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
> 
> second pattern
> +def : Pat<(i32 (vector_extract
> +            (v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))), (i64 0))),
> +          (i32 (EXTRACT_SUBREG
> +            (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
> +            ssub))>;
> +
> ```
> I think it could be OK to keep the uaddlv intrinsic rather than the uaddlv SDNode for the v8i16 type...
Not sure I understand the issue here; more specific patterns (more nodes in the pattern definition) should win, no matter the order in the file, so you should be able to fix this with the right patterns.
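If the default size-based complexity ordering still doesn't pick the more specific pattern here, a sketch of the usual TableGen escape hatch is `AddedComplexity` (the value 10 is arbitrary, and the pattern body is just the second pattern from the diff above, unchanged):

```
// Illustrative only: raise the matcher priority of the combined
// uaddlv+uaddlp pattern so it is tried before the generic
// DUP/uaddlv patterns, regardless of pattern size.
let AddedComplexity = 10 in
def : Pat<(i32 (vector_extract
            (v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))), (i64 0))),
          (i32 (EXTRACT_SUBREG
            (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
            ssub))>;
```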


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D159267/new/

https://reviews.llvm.org/D159267


