[llvm] Matched some basic ISD::AVGFLOORU patterns (PR #84903)

Shourya Goel via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 12 09:47:13 PDT 2024


https://github.com/Sh0g0-1758 updated https://github.com/llvm/llvm-project/pull/84903

>From 3c7737e1740c5b3d6a5bdb5bb1b1eea1f6195009 Mon Sep 17 00:00:00 2001
From: Sh0g0-1758 <shouryagoel10000 at gmail.com>
Date: Tue, 12 Mar 2024 15:05:36 +0530
Subject: [PATCH 1/2] Matched some basic ISD::AVGFLOORU patterns

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 34 +++++++++++++++++++
 llvm/test/CodeGen/AArch64/dag-fixedwidth.ll   | 11 ++++++
 2 files changed, 45 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/dag-fixedwidth.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5476ef87971436..f6132e9627bf8e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2826,6 +2826,41 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
   return SDValue();
 }
 
+// Attempt to fold add(and(A, B), lshr(xor(A, B), 1)) to avgflooru(A, B).
+static SDValue combineFixedwidthToAVG(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == ISD::ADD && "ADD node is required here");
+  SDValue And = N->getOperand(0);
+  SDValue Lshr = N->getOperand(1);
+  // ADD is commutative: accept the AND/SRL operands in either order.
+  if (And.getOpcode() == ISD::SRL && Lshr.getOpcode() == ISD::AND)
+    std::swap(And, Lshr);
+  if (And.getOpcode() != ISD::AND || Lshr.getOpcode() != ISD::SRL)
+    return SDValue();
+  SDValue Xor = Lshr.getOperand(0);
+  if (Xor.getOpcode() != ISD::XOR)
+    return SDValue();
+  SDValue And1 = And.getOperand(0);
+  SDValue And2 = And.getOperand(1);
+  SDValue Xor1 = Xor.getOperand(0);
+  SDValue Xor2 = Xor.getOperand(1);
+  // The AND and the XOR must operate on the same pair of values. Both
+  // nodes are commutative, so accept the operands in either order.
+  if (!((Xor1 == And1 && Xor2 == And2) || (Xor1 == And2 && Xor2 == And1)))
+    return SDValue();
+  // Is the right shift using an immediate value of 1?
+  ConstantSDNode *N1C = isConstOrConstSplat(Lshr.getOperand(1));
+  if (!N1C || N1C->getAPIntValue() != 1)
+    return SDValue();
+  // Only form AVGFLOORU when the target can lower it for this type.
+  EVT VT = And.getValueType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isOperationLegalOrCustom(ISD::AVGFLOORU, VT))
+    return SDValue();
+  // And1/And2 already have type VT, so no extension or truncation is needed.
+  SDLoc DL(N);
+  return DAG.getNode(ISD::AVGFLOORU, DL, VT, And1, And2);
+}
+
 SDValue DAGCombiner::visitADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -2841,6 +2871,10 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
   if (SDValue V = foldAddSubOfSignBit(N, DAG))
     return V;
 
+  // Try to match AVG fixedwidth pattern
+  if (SDValue V = combineFixedwidthToAVG(N, DAG))
+    return V;
+
   // fold (a+b) -> (a|b) iff a and b share no bits.
   if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) &&
       DAG.haveNoCommonBitsSet(N0, N1))
diff --git a/llvm/test/CodeGen/AArch64/dag-fixedwidth.ll b/llvm/test/CodeGen/AArch64/dag-fixedwidth.ll
new file mode 100644
index 00000000000000..33a6b12f3c86a6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dag-fixedwidth.ll
@@ -0,0 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+define i4 @fixedwidth(i4 %a0, i4 %a1)  {
+; CHECK-LABEL: fixedwidth:
+  %and = and i4 %a0, %a1
+  %xor = xor i4 %a0, %a1
+  %srl = lshr i4 %xor, 1
+  %res = add i4 %and, %srl
+  ret i4 %res
+}

>From 34e82a81904977f14739cdcc5ecdb6dc93478606 Mon Sep 17 00:00:00 2001
From: Sh0g0-1758 <shouryagoel10000 at gmail.com>
Date: Tue, 12 Mar 2024 22:16:56 +0530
Subject: [PATCH 2/2] Refactor tests and ran automation script

---
 llvm/test/CodeGen/AArch64/dag-fixedwidth.ll |  11 --
 llvm/test/CodeGen/AArch64/hadd-combine.ll   | 182 +++++++++++++++-----
 2 files changed, 138 insertions(+), 55 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AArch64/dag-fixedwidth.ll

diff --git a/llvm/test/CodeGen/AArch64/dag-fixedwidth.ll b/llvm/test/CodeGen/AArch64/dag-fixedwidth.ll
deleted file mode 100644
index 33a6b12f3c86a6..00000000000000
--- a/llvm/test/CodeGen/AArch64/dag-fixedwidth.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
-
-define i4 @fixedwidth(i4 %a0, i4 %a1)  {
-; CHECK-LABEL: fixedwidth:
-  %and = and i4 %a0, %a1
-  %xor = xor i4 %a0, %a1
-  %srl = lshr i4 %xor, 1
-  %res = add i4 %and, %srl
-  ret i4 %res
-}
diff --git a/llvm/test/CodeGen/AArch64/hadd-combine.ll b/llvm/test/CodeGen/AArch64/hadd-combine.ll
index 2269d75cdbb9ed..629cd071086aca 100644
--- a/llvm/test/CodeGen/AArch64/hadd-combine.ll
+++ b/llvm/test/CodeGen/AArch64/hadd-combine.ll
@@ -17,8 +17,11 @@ define <8 x i16> @haddu_base(<8 x i16> %src1, <8 x i16> %src2) {
 define <8 x i16> @haddu_const(<8 x i16> %src1) {
 ; CHECK-LABEL: haddu_const:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    movi v1.4s, #1
+; CHECK-NEXT:    uaddw v2.4s, v1.4s, v0.4h
+; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v0.8h
+; CHECK-NEXT:    shrn v0.4h, v2.4s, #1
+; CHECK-NEXT:    shrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
   %add = add <8 x i32> %zextsrc1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -30,8 +33,11 @@ define <8 x i16> @haddu_const(<8 x i16> %src1) {
 define <8 x i16> @haddu_const_lhs(<8 x i16> %src1) {
 ; CHECK-LABEL: haddu_const_lhs:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    movi v1.4s, #1
+; CHECK-NEXT:    uaddw v2.4s, v1.4s, v0.4h
+; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v0.8h
+; CHECK-NEXT:    shrn v0.4h, v2.4s, #1
+; CHECK-NEXT:    shrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
   %add = add <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %zextsrc1
@@ -120,7 +126,7 @@ define <8 x i16> @haddu_i_const_lhs(<8 x i16> %src1) {
 ; CHECK-LABEL: haddu_i_const_lhs:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    uhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -129,7 +135,8 @@ define <8 x i16> @haddu_i_const_lhs(<8 x i16> %src1) {
 define <8 x i16> @haddu_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: haddu_i_const_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushr v0.8h, v0.8h, #1
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    uhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -138,7 +145,9 @@ define <8 x i16> @haddu_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @haddu_i_const_both() {
 ; CHECK-LABEL: haddu_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #2
+; CHECK-NEXT:    movi v0.8h, #1
+; CHECK-NEXT:    movi v1.8h, #3
+; CHECK-NEXT:    uhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -147,7 +156,9 @@ define <8 x i16> @haddu_i_const_both() {
 define <8 x i16> @haddu_i_const_bothhigh() {
 ; CHECK-LABEL: haddu_i_const_bothhigh:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvni v0.8h, #1
+; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-NEXT:    mvni v1.8h, #1
+; CHECK-NEXT:    uhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>, <8 x i16> <i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535>)
   ret <8 x i16> %result
@@ -156,7 +167,7 @@ define <8 x i16> @haddu_i_const_bothhigh() {
 define <8 x i16> @haddu_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: haddu_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -182,8 +193,11 @@ define <8 x i16> @hadds_base(<8 x i16> %src1, <8 x i16> %src2) {
 define <8 x i16> @hadds_const(<8 x i16> %src1) {
 ; CHECK-LABEL: hadds_const:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    movi v1.4s, #1
+; CHECK-NEXT:    saddw v2.4s, v1.4s, v0.4h
+; CHECK-NEXT:    saddw2 v1.4s, v1.4s, v0.8h
+; CHECK-NEXT:    shrn v0.4h, v2.4s, #1
+; CHECK-NEXT:    shrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
   %add = add <8 x i32> %zextsrc1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -195,8 +209,11 @@ define <8 x i16> @hadds_const(<8 x i16> %src1) {
 define <8 x i16> @hadds_const_lhs(<8 x i16> %src1) {
 ; CHECK-LABEL: hadds_const_lhs:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    movi v1.4s, #1
+; CHECK-NEXT:    saddw v2.4s, v1.4s, v0.4h
+; CHECK-NEXT:    saddw2 v1.4s, v1.4s, v0.8h
+; CHECK-NEXT:    shrn v0.4h, v2.4s, #1
+; CHECK-NEXT:    shrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
   %add = add <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %zextsrc1
@@ -234,7 +251,7 @@ define <8 x i16> @hadds_const_both() {
 define <8 x i16> @hadds_const_bothhigh() {
 ; CHECK-LABEL: hadds_const_bothhigh:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32766 // =0x7ffe
+; CHECK-NEXT:    mov w8, #32766
 ; CHECK-NEXT:    dup v0.8h, w8
 ; CHECK-NEXT:    ret
   %ext1 = sext <8 x i16> <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766> to <8 x i32>
@@ -286,7 +303,7 @@ define <8 x i16> @hadds_i_const_lhs(<8 x i16> %src1) {
 ; CHECK-LABEL: hadds_i_const_lhs:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    shadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -295,7 +312,8 @@ define <8 x i16> @hadds_i_const_lhs(<8 x i16> %src1) {
 define <8 x i16> @hadds_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: hadds_i_const_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sshr v0.8h, v0.8h, #1
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    shadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -304,7 +322,9 @@ define <8 x i16> @hadds_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @hadds_i_const_both() {
 ; CHECK-LABEL: hadds_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #2
+; CHECK-NEXT:    movi v0.8h, #1
+; CHECK-NEXT:    movi v1.8h, #3
+; CHECK-NEXT:    shadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -313,8 +333,10 @@ define <8 x i16> @hadds_i_const_both() {
 define <8 x i16> @hadds_i_const_bothhigh() {
 ; CHECK-LABEL: hadds_i_const_bothhigh:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32766 // =0x7ffe
-; CHECK-NEXT:    dup v0.8h, w8
+; CHECK-NEXT:    mov w8, #32766
+; CHECK-NEXT:    mvni v0.8h, #128, lsl #8
+; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    shadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>, <8 x i16> <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>)
   ret <8 x i16> %result
@@ -323,7 +345,7 @@ define <8 x i16> @hadds_i_const_bothhigh() {
 define <8 x i16> @hadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: hadds_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -350,8 +372,11 @@ define <8 x i16> @rhaddu_base(<8 x i16> %src1, <8 x i16> %src2) {
 define <8 x i16> @rhaddu_const(<8 x i16> %src1) {
 ; CHECK-LABEL: rhaddu_const:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    movi v1.4s, #2
+; CHECK-NEXT:    uaddw v2.4s, v1.4s, v0.4h
+; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v0.8h
+; CHECK-NEXT:    shrn v0.4h, v2.4s, #1
+; CHECK-NEXT:    shrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
   %add1 = add <8 x i32> %zextsrc1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -364,8 +389,11 @@ define <8 x i16> @rhaddu_const(<8 x i16> %src1) {
 define <8 x i16> @rhaddu_const_lhs(<8 x i16> %src1) {
 ; CHECK-LABEL: rhaddu_const_lhs:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    movi v1.4s, #2
+; CHECK-NEXT:    uaddw v2.4s, v1.4s, v0.4h
+; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v0.8h
+; CHECK-NEXT:    shrn v0.4h, v2.4s, #1
+; CHECK-NEXT:    shrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
   %add1 = add <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %zextsrc1
@@ -378,8 +406,11 @@ define <8 x i16> @rhaddu_const_lhs(<8 x i16> %src1) {
 define <8 x i16> @rhaddu_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: rhaddu_const_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    movi v1.4s, #1
+; CHECK-NEXT:    uaddw v2.4s, v1.4s, v0.4h
+; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v0.8h
+; CHECK-NEXT:    shrn v0.4h, v2.4s, #1
+; CHECK-NEXT:    shrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
   %add1 = add <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, %zextsrc1
@@ -418,8 +449,11 @@ define <8 x i16> @rhaddu_const_bothhigh() {
 define <8 x i16> @rhaddu_undef(<8 x i16> %src1) {
 ; CHECK-LABEL: rhaddu_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    movi v1.4s, #1
+; CHECK-NEXT:    uaddw v2.4s, v1.4s, v0.4h
+; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v0.8h
+; CHECK-NEXT:    shrn v0.4h, v2.4s, #1
+; CHECK-NEXT:    shrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
   %zextsrc2 = zext <8 x i16> undef to <8 x i32>
@@ -455,7 +489,7 @@ define <8 x i16> @rhaddu_i_const_lhs(<8 x i16> %src1) {
 ; CHECK-LABEL: rhaddu_i_const_lhs:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    urhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -465,7 +499,7 @@ define <8 x i16> @rhaddu_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: rhaddu_i_const_zero:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    urhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -474,7 +508,9 @@ define <8 x i16> @rhaddu_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @rhaddu_i_const_both() {
 ; CHECK-LABEL: rhaddu_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #2
+; CHECK-NEXT:    movi v0.8h, #1
+; CHECK-NEXT:    movi v1.8h, #3
+; CHECK-NEXT:    urhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -484,6 +520,8 @@ define <8 x i16> @rhaddu_i_const_bothhigh() {
 ; CHECK-LABEL: rhaddu_i_const_bothhigh:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-NEXT:    mvni v1.8h, #1
+; CHECK-NEXT:    urhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>, <8 x i16> <i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535>)
   ret <8 x i16> %result
@@ -492,7 +530,7 @@ define <8 x i16> @rhaddu_i_const_bothhigh() {
 define <8 x i16> @rhaddu_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: rhaddu_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -519,8 +557,11 @@ define <8 x i16> @rhadds_base(<8 x i16> %src1, <8 x i16> %src2) {
 define <8 x i16> @rhadds_const(<8 x i16> %src1) {
 ; CHECK-LABEL: rhadds_const:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    movi v1.4s, #2
+; CHECK-NEXT:    saddw v2.4s, v1.4s, v0.4h
+; CHECK-NEXT:    saddw2 v1.4s, v1.4s, v0.8h
+; CHECK-NEXT:    shrn v0.4h, v2.4s, #1
+; CHECK-NEXT:    shrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
   %add1 = add <8 x i32> %zextsrc1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -533,8 +574,11 @@ define <8 x i16> @rhadds_const(<8 x i16> %src1) {
 define <8 x i16> @rhadds_const_lhs(<8 x i16> %src1) {
 ; CHECK-LABEL: rhadds_const_lhs:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    movi v1.4s, #2
+; CHECK-NEXT:    saddw v2.4s, v1.4s, v0.4h
+; CHECK-NEXT:    saddw2 v1.4s, v1.4s, v0.8h
+; CHECK-NEXT:    shrn v0.4h, v2.4s, #1
+; CHECK-NEXT:    shrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
   %add1 = add <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %zextsrc1
@@ -547,8 +591,11 @@ define <8 x i16> @rhadds_const_lhs(<8 x i16> %src1) {
 define <8 x i16> @rhadds_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: rhadds_const_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    movi v1.4s, #1
+; CHECK-NEXT:    saddw v2.4s, v1.4s, v0.4h
+; CHECK-NEXT:    saddw2 v1.4s, v1.4s, v0.8h
+; CHECK-NEXT:    shrn v0.4h, v2.4s, #1
+; CHECK-NEXT:    shrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
   %add1 = add <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, %zextsrc1
@@ -587,8 +634,11 @@ define <8 x i16> @rhadds_const_bothhigh() {
 define <8 x i16> @rhadds_undef(<8 x i16> %src1) {
 ; CHECK-LABEL: rhadds_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    movi v1.4s, #1
+; CHECK-NEXT:    saddw v2.4s, v1.4s, v0.4h
+; CHECK-NEXT:    saddw2 v1.4s, v1.4s, v0.8h
+; CHECK-NEXT:    shrn v0.4h, v2.4s, #1
+; CHECK-NEXT:    shrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
   %zextsrc2 = sext <8 x i16> undef to <8 x i32>
@@ -624,7 +674,7 @@ define <8 x i16> @rhadds_i_const_lhs(<8 x i16> %src1) {
 ; CHECK-LABEL: rhadds_i_const_lhs:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    srhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -634,7 +684,7 @@ define <8 x i16> @rhadds_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: rhadds_i_const_zero:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    srhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -643,7 +693,9 @@ define <8 x i16> @rhadds_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @rhadds_i_const_both() {
 ; CHECK-LABEL: rhadds_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #2
+; CHECK-NEXT:    movi v0.8h, #1
+; CHECK-NEXT:    movi v1.8h, #3
+; CHECK-NEXT:    srhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -652,7 +704,10 @@ define <8 x i16> @rhadds_i_const_both() {
 define <8 x i16> @rhadds_i_const_bothhigh() {
 ; CHECK-LABEL: rhadds_i_const_bothhigh:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #32766
 ; CHECK-NEXT:    mvni v0.8h, #128, lsl #8
+; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    srhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>, <8 x i16> <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>)
   ret <8 x i16> %result
@@ -661,7 +716,7 @@ define <8 x i16> @rhadds_i_const_bothhigh() {
 define <8 x i16> @rhadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: rhadds_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -671,6 +726,7 @@ define <8 x i16> @rhadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 define <8 x i8> @shadd_v8i8(<8 x i8> %x) {
 ; CHECK-LABEL: shadd_v8i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    shadd v0.8b, v0.8b, v0.8b
 ; CHECK-NEXT:    ret
   %r = tail call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %x, <8 x i8> %x)
   ret <8 x i8> %r
@@ -679,6 +735,7 @@ define <8 x i8> @shadd_v8i8(<8 x i8> %x) {
 define <4 x i16> @shadd_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: shadd_v4i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    shadd v0.4h, v0.4h, v0.4h
 ; CHECK-NEXT:    ret
   %r = tail call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %x, <4 x i16> %x)
   ret <4 x i16> %r
@@ -687,6 +744,7 @@ define <4 x i16> @shadd_v4i16(<4 x i16> %x) {
 define <2 x i32> @shadd_v2i32(<2 x i32> %x) {
 ; CHECK-LABEL: shadd_v2i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    shadd v0.2s, v0.2s, v0.2s
 ; CHECK-NEXT:    ret
   %r = tail call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %x, <2 x i32> %x)
   ret <2 x i32> %r
@@ -695,6 +753,7 @@ define <2 x i32> @shadd_v2i32(<2 x i32> %x) {
 define <16 x i8> @shadd_v16i8(<16 x i8> %x) {
 ; CHECK-LABEL: shadd_v16i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    shadd v0.16b, v0.16b, v0.16b
 ; CHECK-NEXT:    ret
   %r = tail call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %x, <16 x i8> %x)
   ret <16 x i8> %r
@@ -703,6 +762,7 @@ define <16 x i8> @shadd_v16i8(<16 x i8> %x) {
 define <8 x i16> @shadd_v8i16(<8 x i16> %x) {
 ; CHECK-LABEL: shadd_v8i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    shadd v0.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    ret
   %r = tail call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %x, <8 x i16> %x)
   ret <8 x i16> %r
@@ -711,6 +771,7 @@ define <8 x i16> @shadd_v8i16(<8 x i16> %x) {
 define <4 x i32> @shadd_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: shadd_v4i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    shadd v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    ret
   %r = tail call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %x, <4 x i32> %x)
   ret <4 x i32> %r
@@ -719,6 +780,7 @@ define <4 x i32> @shadd_v4i32(<4 x i32> %x) {
 define <8 x i8> @uhadd_v8i8(<8 x i8> %x) {
 ; CHECK-LABEL: uhadd_v8i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    uhadd v0.8b, v0.8b, v0.8b
 ; CHECK-NEXT:    ret
   %r = tail call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %x, <8 x i8> %x)
   ret <8 x i8> %r
@@ -727,6 +789,7 @@ define <8 x i8> @uhadd_v8i8(<8 x i8> %x) {
 define <4 x i16> @uhadd_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: uhadd_v4i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    uhadd v0.4h, v0.4h, v0.4h
 ; CHECK-NEXT:    ret
   %r = tail call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %x, <4 x i16> %x)
   ret <4 x i16> %r
@@ -735,6 +798,7 @@ define <4 x i16> @uhadd_v4i16(<4 x i16> %x) {
 define <2 x i32> @uhadd_v2i32(<2 x i32> %x) {
 ; CHECK-LABEL: uhadd_v2i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    uhadd v0.2s, v0.2s, v0.2s
 ; CHECK-NEXT:    ret
   %r = tail call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %x, <2 x i32> %x)
   ret <2 x i32> %r
@@ -743,6 +807,7 @@ define <2 x i32> @uhadd_v2i32(<2 x i32> %x) {
 define <16 x i8> @uhadd_v16i8(<16 x i8> %x) {
 ; CHECK-LABEL: uhadd_v16i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    uhadd v0.16b, v0.16b, v0.16b
 ; CHECK-NEXT:    ret
   %r = tail call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %x, <16 x i8> %x)
   ret <16 x i8> %r
@@ -751,6 +816,7 @@ define <16 x i8> @uhadd_v16i8(<16 x i8> %x) {
 define <8 x i16> @uhadd_v8i16(<8 x i16> %x) {
 ; CHECK-LABEL: uhadd_v8i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    uhadd v0.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    ret
   %r = tail call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %x, <8 x i16> %x)
   ret <8 x i16> %r
@@ -759,6 +825,7 @@ define <8 x i16> @uhadd_v8i16(<8 x i16> %x) {
 define <4 x i32> @uhadd_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: uhadd_v4i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    uhadd v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    ret
   %r = tail call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %x, <4 x i32> %x)
   ret <4 x i32> %r
@@ -766,6 +833,7 @@ define <4 x i32> @uhadd_v4i32(<4 x i32> %x) {
 define <8 x i8> @srhadd_v8i8(<8 x i8> %x) {
 ; CHECK-LABEL: srhadd_v8i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    srhadd v0.8b, v0.8b, v0.8b
 ; CHECK-NEXT:    ret
   %r = tail call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %x, <8 x i8> %x)
   ret <8 x i8> %r
@@ -774,6 +842,7 @@ define <8 x i8> @srhadd_v8i8(<8 x i8> %x) {
 define <4 x i16> @srhadd_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: srhadd_v4i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    srhadd v0.4h, v0.4h, v0.4h
 ; CHECK-NEXT:    ret
   %r = tail call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %x, <4 x i16> %x)
   ret <4 x i16> %r
@@ -782,6 +851,7 @@ define <4 x i16> @srhadd_v4i16(<4 x i16> %x) {
 define <2 x i32> @srhadd_v2i32(<2 x i32> %x) {
 ; CHECK-LABEL: srhadd_v2i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    srhadd v0.2s, v0.2s, v0.2s
 ; CHECK-NEXT:    ret
   %r = tail call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %x, <2 x i32> %x)
   ret <2 x i32> %r
@@ -790,6 +860,7 @@ define <2 x i32> @srhadd_v2i32(<2 x i32> %x) {
 define <16 x i8> @srhadd_v16i8(<16 x i8> %x) {
 ; CHECK-LABEL: srhadd_v16i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    srhadd v0.16b, v0.16b, v0.16b
 ; CHECK-NEXT:    ret
   %r = tail call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %x, <16 x i8> %x)
   ret <16 x i8> %r
@@ -798,6 +869,7 @@ define <16 x i8> @srhadd_v16i8(<16 x i8> %x) {
 define <8 x i16> @srhadd_v8i16(<8 x i16> %x) {
 ; CHECK-LABEL: srhadd_v8i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    srhadd v0.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    ret
   %r = tail call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %x, <8 x i16> %x)
   ret <8 x i16> %r
@@ -806,6 +878,7 @@ define <8 x i16> @srhadd_v8i16(<8 x i16> %x) {
 define <4 x i32> @srhadd_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: srhadd_v4i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    srhadd v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    ret
   %r = tail call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %x, <4 x i32> %x)
   ret <4 x i32> %r
@@ -814,6 +887,7 @@ define <4 x i32> @srhadd_v4i32(<4 x i32> %x) {
 define <8 x i8> @urhadd_v8i8(<8 x i8> %x) {
 ; CHECK-LABEL: urhadd_v8i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    urhadd v0.8b, v0.8b, v0.8b
 ; CHECK-NEXT:    ret
   %r = tail call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %x, <8 x i8> %x)
   ret <8 x i8> %r
@@ -822,6 +896,7 @@ define <8 x i8> @urhadd_v8i8(<8 x i8> %x) {
 define <4 x i16> @urhadd_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: urhadd_v4i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    urhadd v0.4h, v0.4h, v0.4h
 ; CHECK-NEXT:    ret
   %r = tail call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %x, <4 x i16> %x)
   ret <4 x i16> %r
@@ -830,6 +905,7 @@ define <4 x i16> @urhadd_v4i16(<4 x i16> %x) {
 define <2 x i32> @urhadd_v2i32(<2 x i32> %x) {
 ; CHECK-LABEL: urhadd_v2i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    urhadd v0.2s, v0.2s, v0.2s
 ; CHECK-NEXT:    ret
   %r = tail call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %x, <2 x i32> %x)
   ret <2 x i32> %r
@@ -838,6 +914,7 @@ define <2 x i32> @urhadd_v2i32(<2 x i32> %x) {
 define <16 x i8> @urhadd_v16i8(<16 x i8> %x) {
 ; CHECK-LABEL: urhadd_v16i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    urhadd v0.16b, v0.16b, v0.16b
 ; CHECK-NEXT:    ret
   %r = tail call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %x, <16 x i8> %x)
   ret <16 x i8> %r
@@ -846,6 +923,7 @@ define <16 x i8> @urhadd_v16i8(<16 x i8> %x) {
 define <8 x i16> @urhadd_v8i16(<8 x i16> %x) {
 ; CHECK-LABEL: urhadd_v8i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    urhadd v0.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    ret
   %r = tail call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %x, <8 x i16> %x)
   ret <8 x i16> %r
@@ -854,11 +932,27 @@ define <8 x i16> @urhadd_v8i16(<8 x i16> %x) {
 define <4 x i32> @urhadd_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: urhadd_v4i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    urhadd v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    ret
   %r = tail call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %x, <4 x i32> %x)
   ret <4 x i32> %r
 }
 
+define <4 x i32> @fixedwidth(<4 x i32> %a0, <4 x i32> %a1)  {
+; CHECK-LABEL: fixedwidth:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    usra v2.4s, v0.4s, #1
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %and = and <4 x i32> %a0, %a1
+  %xor = xor <4 x i32> %a0, %a1
+  %srl = lshr <4 x i32> %xor, <i32 1, i32 1, i32 1, i32 1>
+  %res = add <4 x i32> %and, %srl
+  ret <4 x i32> %res
+}
+
 declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
 declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>)
 declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>)



More information about the llvm-commits mailing list