[PATCH] D150969: [AArch64] Try to convert two XTN and two SMLSL to UZP1, SMLSL and SMLSL2

JinGu Kang via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Fri May 19 07:51:58 PDT 2023


jaykang10 created this revision.
jaykang10 added reviewers: dmgreen, efriedma, t.p.northover.
Herald added subscribers: jeroen.dobbelaere, hiraditya, kristof.beyls.
Herald added a project: All.
jaykang10 requested review of this revision.
Herald added a project: LLVM.
Herald added a subscriber: llvm-commits.

gcc generates fewer instructions than llvm for the intrinsic example below.

  #include <arm_neon.h>
  
  void foo(int16x8_t a, int32x4_t acc, int32x4_t *out, const int32_t *p) {
      int16x8_t b = vcombine_s16(vmovn_s32(vld1q_s32(&p[0])),
                                 vmovn_s32(vld1q_s32(&p[4])));
      acc = vmlsl_s16(acc, vget_low_s16(a), vget_low_s16(b));
      acc = vmlsl_s16(acc, vget_high_s16(a), vget_high_s16(b));
      *out = acc;
  }

GCC output

  foo:
          ldp     q2, q3, [x1]
          uzp1    v2.8h, v2.8h, v3.8h
          smlsl   v1.4s, v0.4h, v2.4h
          smlsl2  v1.4s, v0.8h, v2.8h
          str     q1, [x0]
          ret

LLVM output

  ldp     q2, q3, [x1]
  ext     v4.16b, v0.16b, v0.16b, #8
  xtn     v2.4h, v2.4s
  smlsl   v1.4s, v0.4h, v2.4h
  xtn     v0.4h, v3.4s
  smlsl   v1.4s, v4.4h, v0.4h
  str     q1, [x0]
  ret

It looks like gcc keeps the intrinsic function calls as builtin function calls, so the operations stay recognizable in the backend.
For example, the `vmovn` and `vcombine` intrinsic function calls are matched to the `uzp1` pattern as below.

  _4 = __builtin_aarch64_xtnv4si (_3);    (insn 9 8 10 (set (reg:V4SI 107) ...
  _6 = __builtin_aarch64_xtnv4si (_5);    (insn 12 11 13 (set (reg:V4SI 109) ...
  _7 = {_4, _6};
  ...
  (insn 10 9 11 (set (reg:V8HI 108)
          (vec_concat:V8HI (truncate:V4HI (reg:V4SI 107))
              (const_vector:V4HI [
                      (const_int 0 [0]) repeated x4
                  ])))
       (nil))
  (insn 11 10 0 (set (reg:V4HI 93 [ _5 ])
          (subreg:V4HI (reg:V8HI 108) 0))
       (nil))
  (insn 13 12 14 (set (reg:V8HI 110)
          (vec_concat:V8HI (truncate:V4HI (reg:V4SI 109))
              (const_vector:V4HI [
                      (const_int 0 [0]) repeated x4
                  ])))
       (nil))
  (insn 14 13 0 (set (reg:V4HI 95 [ _7 ])
          (subreg:V4HI (reg:V8HI 110) 0))
       (nil))
  (insn 15 14 16 (set (reg:V8HI 111)
          (vec_concat:V8HI (reg:V4HI 93 [ _5 ])
              (reg:V4HI 95 [ _7 ])))
       (nil))
  ...
  (define_insn "*aarch64_narrow_trunc<mode>"
    [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
          (vec_concat:<VNARROWQ2>
            (truncate:<VNARROWQ>
              (match_operand:VQN 1 "register_operand" "w"))
            (truncate:<VNARROWQ>
              (match_operand:VQN 2 "register_operand" "w"))))]
    "TARGET_SIMD"
  {
    if (!BYTES_BIG_ENDIAN)
      return "uzp1\\t%0.<V2ntype>, %1.<V2ntype>, %2.<V2ntype>";
    else
      return "uzp1\\t%0.<V2ntype>, %2.<V2ntype>, %1.<V2ntype>";
  }
    [(set_attr "type" "neon_permute<q>")]
  )
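
For reference, this is my own illustration (not part of the patch): on little-endian targets, narrowing two 32-bit vectors with `xtn` and concatenating the halves produces exactly the permutation that `uzp1` performs on the `.8h` views of the sources, which is the equivalence the `*aarch64_narrow_trunc<mode>` pattern above exploits (the `BYTES_BIG_ENDIAN` branch only swaps the operands).

  // Illustration only: both functions compute the same result on
  // little-endian AArch64.
  #include <arm_neon.h>

  // vmovn_s32 keeps the low 16 bits of each 32-bit lane, so concatenating
  // the two narrowed halves ...
  int16x8_t narrow_with_xtn(int32x4_t lo, int32x4_t hi) {
    return vcombine_s16(vmovn_s32(lo), vmovn_s32(hi));
  }

  // ... selects exactly the even-indexed halfwords of the two .8h views,
  // which uzp1 does in a single instruction.
  int16x8_t narrow_with_uzp1(int32x4_t lo, int32x4_t hi) {
    return vuzp1q_s16(vreinterpretq_s16_s32(lo), vreinterpretq_s16_s32(hi));
  }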

It looks like clang generates definitions for some of the intrinsic functions. After inlining, some of the intrinsic function calls are optimized away, as below.

  define dso_local void @foo(<8 x i16> noundef %a, <4 x i32> noundef %acc, ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly %p) local_unnamed_addr #0 {
  entry:
    %0 = load <4 x i32>, ptr %p, align 4
    %vmovn.i = trunc <4 x i32> %0 to <4 x i16>
    %arrayidx2 = getelementptr inbounds i32, ptr %p, i64 4
    %1 = load <4 x i32>, ptr %arrayidx2, align 4
    %vmovn.i17 = trunc <4 x i32> %1 to <4 x i16>
    %shuffle.i18 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i18, <4 x i16> %vmovn.i)
    %shuffle.i19 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    %vmull2.i.i20 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i19, <4 x i16> %vmovn.i17)
    %2 = add <4 x i32> %vmull2.i.i, %vmull2.i.i20
    %sub.i21 = sub <4 x i32> %acc, %2
    store <4 x i32> %sub.i21, ptr %out, align 16, !tbaa !6
    ret void
  }

For the `uzp1` instruction, it is hard to match the existing `uzp1` pattern without the `concat_vectors` node that comes from `vcombine_s16`.
If clang did not generate the intrinsic functions' definitions and the backend lowered the intrinsic calls instead, we could get code similar to gcc's output. However, I do not think that is a good approach. It is better to keep generating the intrinsic functions' definitions and optimize the code after inlining.

Alternatively, I have tried checking the MIR code sequence around `smlsl` in the AArch64MIPeepholeOpt pass; a rough sketch of the idea is shown after the assembly. With this patch, the llvm output is as below.

  foo:
          ldp     q2, q3, [x1]
          uzp1    v2.8h, v2.8h, v3.8h
          smlsl   v1.4s, v0.4h, v2.4h
          smlsl2  v1.4s, v0.8h, v2.8h
          str     q1, [x0]
          ret
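
The rough shape of the check is sketched below. This is only an outline of the matching step, and the helper name `isNarrowedByXTN` is made up for illustration; the actual code is in the attached diff.

  // Sketch only -- hypothetical helper, not the code in the attached patch.
  // It shows the kind of MIR check AArch64MIPeepholeOpt can perform.
  #include "AArch64InstrInfo.h"
  #include "llvm/CodeGen/MachineInstr.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"

  using namespace llvm;

  // Return true if Reg is defined by "XTNv4i16 WideSrc", i.e. it is the
  // narrowed form of a 128-bit vector, and report that wide source.
  static bool isNarrowedByXTN(Register Reg, MachineRegisterInfo &MRI,
                              Register &WideSrc) {
    MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
    if (!Def || Def->getOpcode() != AArch64::XTNv4i16)
      return false;
    WideSrc = Def->getOperand(1).getReg();
    return true;
  }

  // If both SMLSLv4i16_v4i32 instructions take a multiplicand that satisfies
  // isNarrowedByXTN, the pass can build "UZP1v8i16 WideSrc0, WideSrc1", feed
  // its low 64 bits (dsub) to the first SMLSL, and turn the second SMLSL into
  // SMLSLv8i16_v4i32 (smlsl2) on the full 128-bit registers, which also makes
  // the EXT that extracted the high half of the other operand dead.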


https://reviews.llvm.org/D150969

Files:
  llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
  llvm/test/CodeGen/AArch64/aarch64-smull.ll


