[llvm] 1da4d5a - [AArch64][SVE] Add hadd and rhadd support

David Green via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 14 01:25:02 PST 2022


Author: David Green
Date: 2022-12-14T09:24:54Z
New Revision: 1da4d5aafad74a3d9a3b893335249c9e5b4a7ab9

URL: https://github.com/llvm/llvm-project/commit/1da4d5aafad74a3d9a3b893335249c9e5b4a7ab9
DIFF: https://github.com/llvm/llvm-project/commit/1da4d5aafad74a3d9a3b893335249c9e5b4a7ab9.diff

LOG: [AArch64][SVE] Add hadd and rhadd support

This adds basic HADD and RHADD support for SVE, by marking the
AVGFLOORS/U and AVGCEILS/U nodes as Custom and converting them to the
HADDS/U_PRED and RHADDS/U_PRED AArch64 nodes. Both the existing
intrinsics and the new _PRED nodes are then lowered to the _ZPmZ
instructions.

Differential Revision: https://reviews.llvm.org/D131875
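
For context, the two operations differ only in rounding: HADD computes the
element-wise floor average floor((a + b) / 2) and RHADD the rounding average
(a + b + 1) >> 1, in both cases without a widened intermediate, which is what
the ISD::AVGFLOOR*/AVGCEIL* nodes model. A minimal scalar sketch of those
semantics (illustration only, not part of this commit; the helper names are
ad hoc):

  #include <cstdint>

  // Assumes arithmetic right shift for negative values.
  static int64_t hadds(int64_t a, int64_t b) {   // shadd:  floor((a + b) / 2)
    return (a >> 1) + (b >> 1) + (a & b & 1);
  }
  static int64_t rhadds(int64_t a, int64_t b) {  // srhadd: (a + b + 1) >> 1
    return (a >> 1) + (b >> 1) + ((a | b) & 1);
  }

The scalable-vector equivalents, written as extend + add + shift in IR, appear
in the test updates below and are now selected to the predicated
shadd/uhadd/srhadd/urhadd instructions under +sve2.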

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
    llvm/test/CodeGen/AArch64/sve2-hadd.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index af4b98e75a801..bad7a1bcfda51 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1249,6 +1249,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::SREM, VT, Expand);
       setOperationAction(ISD::SDIVREM, VT, Expand);
       setOperationAction(ISD::UDIVREM, VT, Expand);
+
+      if (Subtarget->hasSVE2()) {
+        setOperationAction(ISD::AVGFLOORS, VT, Custom);
+        setOperationAction(ISD::AVGFLOORU, VT, Custom);
+        setOperationAction(ISD::AVGCEILS, VT, Custom);
+        setOperationAction(ISD::AVGCEILU, VT, Custom);
+      }
     }
 
     // Illegal unpacked integer vector types.
@@ -2219,9 +2226,13 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
     MAKE_CASE(AArch64ISD::ABDS_PRED)
     MAKE_CASE(AArch64ISD::ABDU_PRED)
+    MAKE_CASE(AArch64ISD::HADDS_PRED)
+    MAKE_CASE(AArch64ISD::HADDU_PRED)
     MAKE_CASE(AArch64ISD::MUL_PRED)
     MAKE_CASE(AArch64ISD::MULHS_PRED)
     MAKE_CASE(AArch64ISD::MULHU_PRED)
+    MAKE_CASE(AArch64ISD::RHADDS_PRED)
+    MAKE_CASE(AArch64ISD::RHADDU_PRED)
     MAKE_CASE(AArch64ISD::SDIV_PRED)
     MAKE_CASE(AArch64ISD::SHL_PRED)
     MAKE_CASE(AArch64ISD::SMAX_PRED)
@@ -5945,6 +5956,14 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
   case ISD::ABDU:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
+  case ISD::AVGFLOORS:
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDS_PRED);
+  case ISD::AVGFLOORU:
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::HADDU_PRED);
+  case ISD::AVGCEILS:
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDS_PRED);
+  case ISD::AVGCEILU:
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::RHADDU_PRED);
   case ISD::BITREVERSE:
     return LowerBitreverse(Op, DAG);
   case ISD::BSWAP:

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 112e88535aae0..49d43d5bce707 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -102,9 +102,13 @@ enum NodeType : unsigned {
   FMINNM_PRED,
   FMUL_PRED,
   FSUB_PRED,
+  HADDS_PRED,
+  HADDU_PRED,
   MUL_PRED,
   MULHS_PRED,
   MULHU_PRED,
+  RHADDS_PRED,
+  RHADDU_PRED,
   SDIV_PRED,
   SHL_PRED,
   SMAX_PRED,

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3a529041d7292..3900abd175cbd 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -189,11 +189,15 @@ def AArch64lsl_p  : SDNode<"AArch64ISD::SHL_PRED",  SDT_AArch64Arith>;
 def AArch64lsr_p  : SDNode<"AArch64ISD::SRL_PRED",  SDT_AArch64Arith>;
 def AArch64mul_p  : SDNode<"AArch64ISD::MUL_PRED",  SDT_AArch64Arith>;
 def AArch64sabd_p : SDNode<"AArch64ISD::ABDS_PRED", SDT_AArch64Arith>;
+def AArch64shadd_p : SDNode<"AArch64ISD::HADDS_PRED", SDT_AArch64Arith>;
+def AArch64srhadd_p : SDNode<"AArch64ISD::RHADDS_PRED", SDT_AArch64Arith>;
 def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>;
 def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>;
 def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>;
 def AArch64smulh_p : SDNode<"AArch64ISD::MULHS_PRED", SDT_AArch64Arith>;
 def AArch64uabd_p : SDNode<"AArch64ISD::ABDU_PRED", SDT_AArch64Arith>;
+def AArch64uhadd_p : SDNode<"AArch64ISD::HADDU_PRED", SDT_AArch64Arith>;
+def AArch64urhadd_p : SDNode<"AArch64ISD::RHADDU_PRED", SDT_AArch64Arith>;
 def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>;
 def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>;
 def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>;
@@ -258,6 +262,19 @@ def AArch64fsub_m1 : PatFrags<(ops node:$pg, node:$op1, node:$op2), [
     (AArch64fsub_p (SVEAllActive), node:$op1, (vselect node:$pg, node:$op2, (SVEDup0)))
 ]>;
 
+def AArch64shadd : PatFrags<(ops node:$pg, node:$op1, node:$op2),
+                            [(int_aarch64_sve_shadd node:$pg, node:$op1, node:$op2),
+                             (AArch64shadd_p node:$pg, node:$op1, node:$op2)]>;
+def AArch64uhadd : PatFrags<(ops node:$pg, node:$op1, node:$op2),
+                            [(int_aarch64_sve_uhadd node:$pg, node:$op1, node:$op2),
+                             (AArch64uhadd_p node:$pg, node:$op1, node:$op2)]>;
+def AArch64srhadd : PatFrags<(ops node:$pg, node:$op1, node:$op2),
+                            [(int_aarch64_sve_srhadd node:$pg, node:$op1, node:$op2),
+                             (AArch64srhadd_p node:$pg, node:$op1, node:$op2)]>;
+def AArch64urhadd : PatFrags<(ops node:$pg, node:$op1, node:$op2),
+                            [(int_aarch64_sve_urhadd node:$pg, node:$op1, node:$op2),
+                             (AArch64urhadd_p node:$pg, node:$op1, node:$op2)]>;
+
 def AArch64saba : PatFrags<(ops node:$op1, node:$op2, node:$op3),
                            [(int_aarch64_sve_saba node:$op1, node:$op2, node:$op3),
                             (add node:$op1, (AArch64sabd_p (SVEAllActive), node:$op2, node:$op3))]>;
@@ -3288,12 +3305,12 @@ let Predicates = [HasSVE2orSME] in {
   defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt", int_aarch64_sve_sqdmlslbt>;
 
   // SVE2 integer halving add/subtract (predicated)
-  defm SHADD_ZPmZ  : sve2_int_arith_pred<0b100000, "shadd",  int_aarch64_sve_shadd>;
-  defm UHADD_ZPmZ  : sve2_int_arith_pred<0b100010, "uhadd",  int_aarch64_sve_uhadd>;
+  defm SHADD_ZPmZ  : sve2_int_arith_pred<0b100000, "shadd",  AArch64shadd>;
+  defm UHADD_ZPmZ  : sve2_int_arith_pred<0b100010, "uhadd",  AArch64uhadd>;
   defm SHSUB_ZPmZ  : sve2_int_arith_pred<0b100100, "shsub",  int_aarch64_sve_shsub>;
   defm UHSUB_ZPmZ  : sve2_int_arith_pred<0b100110, "uhsub",  int_aarch64_sve_uhsub>;
-  defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd", int_aarch64_sve_srhadd>;
-  defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd", int_aarch64_sve_urhadd>;
+  defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd", AArch64srhadd>;
+  defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd", AArch64urhadd>;
   defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr", int_aarch64_sve_shsubr>;
   defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr", int_aarch64_sve_uhsubr>;
 

diff --git a/llvm/test/CodeGen/AArch64/sve2-hadd.ll b/llvm/test/CodeGen/AArch64/sve2-hadd.ll
index 2d494c43d4ce7..6ad2ad0feacf6 100644
--- a/llvm/test/CodeGen/AArch64/sve2-hadd.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-hadd.ll
@@ -1,6 +1,36 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple aarch64-none-eabi -mattr=+sve2 -o - | FileCheck %s
 
+define <vscale x 2 x i64> @hadds_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) {
+; CHECK-LABEL: hadds_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    shadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %s0s = sext <vscale x 2 x i64> %s0 to <vscale x 2 x i128>
+  %s1s = sext <vscale x 2 x i64> %s1 to <vscale x 2 x i128>
+  %m = add <vscale x 2 x i128> %s0s, %s1s
+  %s = lshr <vscale x 2 x i128> %m, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+  %s2 = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %s2
+}
+
+define <vscale x 2 x i64> @haddu_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) {
+; CHECK-LABEL: haddu_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uhadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %s0s = zext <vscale x 2 x i64> %s0 to <vscale x 2 x i128>
+  %s1s = zext <vscale x 2 x i64> %s1 to <vscale x 2 x i128>
+  %m = add <vscale x 2 x i128> %s0s, %s1s
+  %s = lshr <vscale x 2 x i128> %m, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+  %s2 = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %s2
+}
+
 define <vscale x 2 x i32> @hadds_v2i32(<vscale x 2 x i32> %s0, <vscale x 2 x i32> %s1) {
 ; CHECK-LABEL: hadds_v2i32:
 ; CHECK:       // %bb.0: // %entry
@@ -37,15 +67,8 @@ entry:
 define <vscale x 4 x i32> @hadds_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i32> %s1) {
 ; CHECK-LABEL: hadds_v4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sunpkhi z2.d, z0.s
-; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    sunpkhi z3.d, z1.s
-; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    add z0.d, z0.d, z1.d
-; CHECK-NEXT:    add z1.d, z2.d, z3.d
-; CHECK-NEXT:    lsr z1.d, z1.d, #1
-; CHECK-NEXT:    lsr z0.d, z0.d, #1
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    shadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %s0s = sext <vscale x 4 x i32> %s0 to <vscale x 4 x i64>
@@ -59,15 +82,8 @@ entry:
 define <vscale x 4 x i32> @haddu_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i32> %s1) {
 ; CHECK-LABEL: haddu_v4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpkhi z2.d, z0.s
-; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    uunpkhi z3.d, z1.s
-; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    add z0.d, z0.d, z1.d
-; CHECK-NEXT:    add z1.d, z2.d, z3.d
-; CHECK-NEXT:    lsr z1.d, z1.d, #1
-; CHECK-NEXT:    lsr z0.d, z0.d, #1
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uhadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <vscale x 4 x i32> %s0 to <vscale x 4 x i64>
@@ -152,15 +168,8 @@ entry:
 define <vscale x 8 x i16> @hadds_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i16> %s1) {
 ; CHECK-LABEL: hadds_v8i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sunpkhi z2.s, z0.h
-; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpkhi z3.s, z1.h
-; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEXT:    add z1.s, z2.s, z3.s
-; CHECK-NEXT:    lsr z1.s, z1.s, #1
-; CHECK-NEXT:    lsr z0.s, z0.s, #1
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    shadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %s0s = sext <vscale x 8 x i16> %s0 to <vscale x 8 x i32>
@@ -174,15 +183,8 @@ entry:
 define <vscale x 8 x i16> @haddu_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i16> %s1) {
 ; CHECK-LABEL: haddu_v8i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpkhi z2.s, z0.h
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpkhi z3.s, z1.h
-; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEXT:    add z1.s, z2.s, z3.s
-; CHECK-NEXT:    lsr z1.s, z1.s, #1
-; CHECK-NEXT:    lsr z0.s, z0.s, #1
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uhadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <vscale x 8 x i16> %s0 to <vscale x 8 x i32>
@@ -267,15 +269,8 @@ entry:
 define <vscale x 16 x i8> @hadds_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i8> %s1) {
 ; CHECK-LABEL: hadds_v16i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sunpkhi z2.h, z0.b
-; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    sunpkhi z3.h, z1.b
-; CHECK-NEXT:    sunpklo z1.h, z1.b
-; CHECK-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEXT:    add z1.h, z2.h, z3.h
-; CHECK-NEXT:    lsr z1.h, z1.h, #1
-; CHECK-NEXT:    lsr z0.h, z0.h, #1
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    shadd z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    ret
 entry:
   %s0s = sext <vscale x 16 x i8> %s0 to <vscale x 16 x i16>
@@ -289,15 +284,8 @@ entry:
 define <vscale x 16 x i8> @haddu_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i8> %s1) {
 ; CHECK-LABEL: haddu_v16i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpkhi z2.h, z0.b
-; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpkhi z3.h, z1.b
-; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEXT:    add z1.h, z2.h, z3.h
-; CHECK-NEXT:    lsr z1.h, z1.h, #1
-; CHECK-NEXT:    lsr z0.h, z0.h, #1
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uhadd z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <vscale x 16 x i8> %s0 to <vscale x 16 x i16>
@@ -308,6 +296,38 @@ entry:
   ret <vscale x 16 x i8> %s2
 }
 
+define <vscale x 2 x i64> @rhadds_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) {
+; CHECK-LABEL: rhadds_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    srhadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %s0s = sext <vscale x 2 x i64> %s0 to <vscale x 2 x i128>
+  %s1s = sext <vscale x 2 x i64> %s1 to <vscale x 2 x i128>
+  %add = add <vscale x 2 x i128> %s0s, %s1s
+  %add2 = add <vscale x 2 x i128> %add, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+  %s = lshr <vscale x 2 x i128> %add2, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+  %result = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %result
+}
+
+define <vscale x 2 x i64> @rhaddu_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i64> %s1) {
+; CHECK-LABEL: rhaddu_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    urhadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %s0s = zext <vscale x 2 x i64> %s0 to <vscale x 2 x i128>
+  %s1s = zext <vscale x 2 x i64> %s1 to <vscale x 2 x i128>
+  %add = add <vscale x 2 x i128> %s0s, %s1s
+  %add2 = add <vscale x 2 x i128> %add, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+  %s = lshr <vscale x 2 x i128> %add2, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+  %result = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %result
+}
+
 define <vscale x 2 x i32> @rhadds_v2i32(<vscale x 2 x i32> %s0, <vscale x 2 x i32> %s1) {
 ; CHECK-LABEL: rhadds_v2i32:
 ; CHECK:       // %bb.0: // %entry
@@ -352,18 +372,8 @@ entry:
 define <vscale x 4 x i32> @rhadds_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i32> %s1) {
 ; CHECK-LABEL: rhadds_v4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.d, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    sunpkhi z3.d, z0.s
-; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    sunpkhi z4.d, z1.s
-; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    eor z0.d, z0.d, z2.d
-; CHECK-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-NEXT:    sub z0.d, z1.d, z0.d
-; CHECK-NEXT:    sub z1.d, z4.d, z2.d
-; CHECK-NEXT:    lsr z0.d, z0.d, #1
-; CHECK-NEXT:    lsr z1.d, z1.d, #1
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    srhadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %s0s = sext <vscale x 4 x i32> %s0 to <vscale x 4 x i64>
@@ -378,18 +388,8 @@ entry:
 define <vscale x 4 x i32> @rhaddu_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i32> %s1) {
 ; CHECK-LABEL: rhaddu_v4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.d, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    uunpkhi z3.d, z0.s
-; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    uunpkhi z4.d, z1.s
-; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    eor z0.d, z0.d, z2.d
-; CHECK-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-NEXT:    sub z0.d, z1.d, z0.d
-; CHECK-NEXT:    sub z1.d, z4.d, z2.d
-; CHECK-NEXT:    lsr z0.d, z0.d, #1
-; CHECK-NEXT:    lsr z1.d, z1.d, #1
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    urhadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <vscale x 4 x i32> %s0 to <vscale x 4 x i64>
@@ -487,18 +487,8 @@ entry:
 define <vscale x 8 x i16> @rhadds_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i16> %s1) {
 ; CHECK-LABEL: rhadds_v8i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.s, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    sunpkhi z3.s, z0.h
-; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpkhi z4.s, z1.h
-; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    eor z0.d, z0.d, z2.d
-; CHECK-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-NEXT:    sub z0.s, z1.s, z0.s
-; CHECK-NEXT:    sub z1.s, z4.s, z2.s
-; CHECK-NEXT:    lsr z0.s, z0.s, #1
-; CHECK-NEXT:    lsr z1.s, z1.s, #1
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    srhadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %s0s = sext <vscale x 8 x i16> %s0 to <vscale x 8 x i32>
@@ -513,18 +503,8 @@ entry:
 define <vscale x 8 x i16> @rhaddu_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i16> %s1) {
 ; CHECK-LABEL: rhaddu_v8i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.s, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    uunpkhi z3.s, z0.h
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpkhi z4.s, z1.h
-; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    eor z0.d, z0.d, z2.d
-; CHECK-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-NEXT:    sub z0.s, z1.s, z0.s
-; CHECK-NEXT:    sub z1.s, z4.s, z2.s
-; CHECK-NEXT:    lsr z0.s, z0.s, #1
-; CHECK-NEXT:    lsr z1.s, z1.s, #1
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    urhadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <vscale x 8 x i16> %s0 to <vscale x 8 x i32>
@@ -622,18 +602,8 @@ entry:
 define <vscale x 16 x i8> @rhadds_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i8> %s1) {
 ; CHECK-LABEL: rhadds_v16i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.h, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    sunpkhi z3.h, z0.b
-; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    sunpkhi z4.h, z1.b
-; CHECK-NEXT:    sunpklo z1.h, z1.b
-; CHECK-NEXT:    eor z0.d, z0.d, z2.d
-; CHECK-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-NEXT:    sub z0.h, z1.h, z0.h
-; CHECK-NEXT:    sub z1.h, z4.h, z2.h
-; CHECK-NEXT:    lsr z0.h, z0.h, #1
-; CHECK-NEXT:    lsr z1.h, z1.h, #1
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    srhadd z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    ret
 entry:
   %s0s = sext <vscale x 16 x i8> %s0 to <vscale x 16 x i16>
@@ -648,18 +618,8 @@ entry:
 define <vscale x 16 x i8> @rhaddu_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i8> %s1) {
 ; CHECK-LABEL: rhaddu_v16i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.h, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    uunpkhi z3.h, z0.b
-; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpkhi z4.h, z1.b
-; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    eor z0.d, z0.d, z2.d
-; CHECK-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-NEXT:    sub z0.h, z1.h, z0.h
-; CHECK-NEXT:    sub z1.h, z4.h, z2.h
-; CHECK-NEXT:    lsr z0.h, z0.h, #1
-; CHECK-NEXT:    lsr z1.h, z1.h, #1
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    urhadd z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <vscale x 16 x i8> %s0 to <vscale x 16 x i16>


        

