[clang] [AArch64] Remove copy instruction between uaddlv with v8i16 and dup (PR #66068)

Thu Sep 14 02:46:10 PDT 2023

================
@@ -194,8 +194,23 @@ entry:
   ret <8 x i8> %vrshrn_n2
 }
 
-declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
+define dso_local <8 x i16> @uaddlv_dup_v8i16(<8 x i16> %a) {
+; CHECK-LABEL: uaddlv_dup_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv s0, v0.8h
+; CHECK-NEXT:    dup v1.8h, v0.h[0]
+; CHECK-NEXT:    rshrn v0.4h, v1.4s, #3
+; CHECK-NEXT:    rshrn2 v0.8h, v1.4s, #3
+; CHECK-NEXT:    ret
+entry:
+  %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a)
+  %vecinit.i = insertelement <8 x i32> undef, i32 %vaddlv.i, i64 0
+  %vecinit7.i = shufflevector <8 x i32> %vecinit.i, <8 x i32> poison, <8 x i32> zeroinitializer
+  %vrshrn_n2 = tail call <8 x i16> @llvm.aarch64.neon.rshrn.v8i16(<8 x i32> %vecinit7.i, i32 3)
----------------
davemgreen wrote:

This isn't a valid neon intrinsic - they need to legal vector sizes for the inputs and outputs. I think it works in this case because it gets expanded to shifts and whatnot. Is there another instruction that could be used in it's place for the test? Maybe just a simple shift?

https://github.com/llvm/llvm-project/pull/66068