[llvm] [AArch64] Prefer `SADDLV` over `ADDV` for vector mask reductions (PR #183522)

Tue Mar 3 06:57:35 PST 2026

https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/183522

>From cf1b1ffbc657b8cef625d2d2e14aff74f72b2cc1 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 26 Feb 2026 13:03:54 +0000
Subject: [PATCH 1/3] Precommit tests

---
 .../CodeGen/AArch64/extend_vecreduce_add.ll   | 213 ++++++++++++++++++
 1 file changed, 213 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/extend_vecreduce_add.ll

diff --git a/llvm/test/CodeGen/AArch64/extend_vecreduce_add.ll b/llvm/test/CodeGen/AArch64/extend_vecreduce_add.ll
new file mode 100644
index 0000000000000..fd3fcd12eae81
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/extend_vecreduce_add.ll
@@ -0,0 +1,213 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -start-before=codegenprepare | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i32 @vmask_reduce_i32_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: vmask_reduce_i32_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    addv b0, v0.8b
+; CHECK-NEXT:    smov w0, v0.b[0]
+; CHECK-NEXT:    ret
+  %mask = icmp slt <8 x i8> %a, %b
+  %t1 = sext <8 x i1> %mask to <8 x i8>
+  %t2 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %t1)
+  %t3 = sext i8 %t2 to i32
+  ret i32 %t3
+}
+
+define i32 @vmask_reduce_i32_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: vmask_reduce_i32_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    addv b0, v0.16b
+; CHECK-NEXT:    smov w0, v0.b[0]
+; CHECK-NEXT:    ret
+  %mask = icmp slt <16 x i8> %a, %b
+  %t1 = sext <16 x i1> %mask to <16 x i8>
+  %t2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %t1)
+  %t3 = sext i8 %t2 to i32
+  ret i32 %t3
+}
+
+define i32 @vmask_reduce_i32_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmask_reduce_i32_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    addv h0, v0.4h
+; CHECK-NEXT:    smov w0, v0.h[0]
+; CHECK-NEXT:    ret
+  %mask = icmp slt <4 x i16> %a, %b
+  %t1 = sext <4 x i1> %mask to <4 x i16>
+  %t2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %t1)
+  %t3 = sext i16 %t2 to i32
+  ret i32 %t3
+}
+
+define i32 @vmask_reduce_i32_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: vmask_reduce_i32_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    addv h0, v0.8h
+; CHECK-NEXT:    smov w0, v0.h[0]
+; CHECK-NEXT:    ret
+  %mask = icmp slt <8 x i16> %a, %b
+  %t1 = sext <8 x i1> %mask to <8 x i16>
+  %t2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %t1)
+  %t3 = sext i16 %t2 to i32
+  ret i32 %t3
+}
+
+define i64 @vmask_reduce_i32_v2i32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: vmask_reduce_i32_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    smov x0, v0.s[0]
+; CHECK-NEXT:    ret
+  %mask = icmp slt <2 x i32> %a, %b
+  %t1 = sext <2 x i1> %mask to <2 x i32>
+  %t2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %t1)
+  %t3 = sext i32 %t2 to i64
+  ret i64 %t3
+}
+
+define i64 @vmask_reduce_i64_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: vmask_reduce_i64_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    addv b0, v0.8b
+; CHECK-NEXT:    smov x0, v0.b[0]
+; CHECK-NEXT:    ret
+  %mask = icmp slt <8 x i8> %a, %b
+  %t1 = sext <8 x i1> %mask to <8 x i8>
+  %t2 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %t1)
+  %t3 = sext i8 %t2 to i64
+  ret i64 %t3
+}
+
+define i64 @vmask_reduce_i64_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: vmask_reduce_i64_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    addv b0, v0.16b
+; CHECK-NEXT:    smov x0, v0.b[0]
+; CHECK-NEXT:    ret
+  %mask = icmp slt <16 x i8> %a, %b
+  %t1 = sext <16 x i1> %mask to <16 x i8>
+  %t2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %t1)
+  %t3 = sext i8 %t2 to i64
+  ret i64 %t3
+}
+
+define i64 @vmask_reduce_i64_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmask_reduce_i64_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    addv h0, v0.4h
+; CHECK-NEXT:    smov x0, v0.h[0]
+; CHECK-NEXT:    ret
+  %mask = icmp slt <4 x i16> %a, %b
+  %t1 = sext <4 x i1> %mask to <4 x i16>
+  %t2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %t1)
+  %t3 = sext i16 %t2 to i64
+  ret i64 %t3
+}
+
+define i64 @vmask_reduce_i64_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: vmask_reduce_i64_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    addv h0, v0.8h
+; CHECK-NEXT:    smov x0, v0.h[0]
+; CHECK-NEXT:    ret
+  %mask = icmp slt <8 x i16> %a, %b
+  %t1 = sext <8 x i1> %mask to <8 x i16>
+  %t2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %t1)
+  %t3 = sext i16 %t2 to i64
+  ret i64 %t3
+}
+
+; TODO: We should use a saddlv here to avoid the smov.
+define i64 @vmask_reduce_i64_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: vmask_reduce_i64_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    smov x0, v0.s[0]
+; CHECK-NEXT:    ret
+  %mask = icmp slt <4 x i32> %a, %b
+  %t1 = sext <4 x i1> %mask to <4 x i32>
+  %t2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %t1)
+  %t3 = sext i32 %t2 to i64
+  ret i64 %t3
+}
+
+define i32 @vmask_popcount_i32_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    addv b0, v0.16b
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    sub w0, w8, w9, sxtb
+; CHECK-NEXT:    ret
+  %mask = icmp slt <16 x i8> %a, %b
+  %t1 = sext <16 x i1> %mask to <16 x i8>
+  %t2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %t1)
+  %t3 = sext i8 %t2 to i32
+  %t4 = sub i32 0, %t3
+  ret i32 %t4
+}
+
+define i32 @vmask_popcount_i32_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    addv h0, v0.4h
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    sub w0, w8, w9, sxth
+; CHECK-NEXT:    ret
+  %mask = icmp slt <4 x i16> %a, %b
+  %t1 = sext <4 x i1> %mask to <4 x i16>
+  %t2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %t1)
+  %t3 = sext i16 %t2 to i32
+  %t4 = sub i32 0, %t3
+  ret i32 %t4
+}
+
+define i64 @vmask_popcount_i64_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    addv b0, v0.16b
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    sub x0, x8, w9, sxtb
+; CHECK-NEXT:    ret
+  %mask = icmp slt <16 x i8> %a, %b
+  %t1 = sext <16 x i1> %mask to <16 x i8>
+  %t2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %t1)
+  %t3 = sext i8 %t2 to i64
+  %t4 = sub i64 0, %t3
+  ret i64 %t4
+}
+
+define i64 @vmask_popcount_i64_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    addv h0, v0.4h
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    sub x0, x8, w9, sxth
+; CHECK-NEXT:    ret
+  %mask = icmp slt <4 x i16> %a, %b
+  %t1 = sext <4 x i1> %mask to <4 x i16>
+  %t2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %t1)
+  %t3 = sext i16 %t2 to i64
+  %t4 = sub i64 0, %t3
+  ret i64 %t4
+}

>From fecd62c74431e8540d0eaca0c2926f13665be84e Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 26 Feb 2026 13:08:12 +0000
Subject: [PATCH 2/3] [AArch64] Prefer `SADDLV` over `ADDV` for vector mask
 reductions

This adds a DAG combine:
 `SEXT_IN_REG(VECREDUCE_ADD(SETcc))` -> `SADDLV(SETcc)`

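Each lane of the SETcc result is either 0 or -1, so for 64- and 128-bit
NEON vectors (at most 16 lanes) the original reduction lies in [-16, 0]
and cannot overflow even an i8 element. Replacing it with the widening
`SADDLV` is therefore equivalent. This can eliminate the sign-extend in
some cases (e.g. for v4i16 the `addv` + `smov` pair becomes `saddlv` +
`fmov`).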
---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 12 ++++
 .../Target/AArch64/AArch64ISelLowering.cpp    | 44 ++++++++++++
 llvm/test/CodeGen/AArch64/combine-sdiv.ll     | 72 +++++++++----------
 .../CodeGen/AArch64/extend_vecreduce_add.ll   | 60 ++++++++--------
 4 files changed, 119 insertions(+), 69 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 67f4e127b0c87..4d2b0e86fdb10 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -989,6 +989,18 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
     if (Ext == AArch64_AM::InvalidShiftExtend)
       return false;
 
+    // Don't match sext of vector extracts. These can use SMOV, but if we match
+    // this an extended register, we'll always fold the extend into an ALU op
+    // user of the extend (which results in a UMOV).
+    if (Ext >= AArch64_AM::SXTB) {
+      SDValue Op = N.getOperand(0);
+      if (Op->getOpcode() == ISD::ANY_EXTEND)
+        Op = Op->getOperand(0);
+      if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          Op.getOperand(0).getValueType().isFixedLengthVector())
+        return false;
+    }
+
     Reg = N.getOperand(0);
 
     // Don't match if free 32-bit -> 64-bit zext can be used instead. Use the
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0e24f022b6fd0..4b89b780cb54c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -28135,9 +28135,53 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
   return DAG.getMergeValues({Load, LoadChain}, DL);
 }
 
+static SDValue performVecReduceAddToSADDLVCombine(
+    SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  // Look through an ANY_EXTEND (seen between the reduction and N for i64).
+  SDValue Op = N->getOperand(0);
+  if (Op->getOpcode() == ISD::ANY_EXTEND)
+    Op = Op->getOperand(0);
+  if (Op->getOpcode() != ISD::VECREDUCE_ADD || !Op.hasOneUse())
+    return SDValue();
+
+  SDValue Vec = Op->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getScalarType();
+  EVT SrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+  if (Vec->getOpcode() != ISD::SETCC || SrcVT != EltVT)
+    return SDValue();
+
+  // Bail out instead of asserting: N may extend any reduction (e.g. a v4i32
+  // sum truncated to i8), and SADDLV only takes 64/128-bit i8/i16 vectors.
+  // TODO: Support i32 -> i64 (that won't use a SIGN_EXTEND_INREG).
+  if ((EltVT != MVT::i8 && EltVT != MVT::i16) ||
+      (!VecVT.is64BitVector() && !VecVT.is128BitVector()))
+    return SDValue();
+
+  SDLoc DL(N);
+  unsigned WideEltBits = EltVT.getScalarSizeInBits() * 2;
+  MVT WideEltVT = MVT::getIntegerVT(WideEltBits);
+  MVT WideVecVT = MVT::getVectorVT(WideEltVT, 128 / WideEltBits);
+  // Replace SEXT_IN_REG(VECREDUCE_ADD(SETcc)) with SADDLV(SETcc).
+  SDValue SADDLV = DAG.getNode(AArch64ISD::SADDLV, DL, MVT::v4i32, Vec);
+  SADDLV = DAG.getNode(AArch64ISD::NVCAST, DL, WideVecVT, SADDLV);
+  SDValue Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SADDLV,
+                               DAG.getConstant(0, DL, MVT::i64));
+  Result = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Result);
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Result,
+                     DAG.getValueType(WideEltVT));
+}
+
 static SDValue
 performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                               SelectionDAG &DAG) {
+  if (SDValue Result = performVecReduceAddToSADDLVCombine(N, DCI, DAG))
+    return Result;
+
   SDLoc DL(N);
   SDValue Src = N->getOperand(0);
   unsigned Opc = Src->getOpcode();
diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
index e804a941716e3..fbb33db3fb7a9 100644
--- a/llvm/test/CodeGen/AArch64/combine-sdiv.ll
+++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
@@ -801,31 +801,30 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
 define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) {
 ; CHECK-SD-LABEL: non_splat_minus_one_divisor_0:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    umov w9, v0.b[0]
-; CHECK-SD-NEXT:    mov w8, wzr
-; CHECK-SD-NEXT:    umov w10, v0.b[1]
-; CHECK-SD-NEXT:    sub w9, w8, w9, sxtb
-; CHECK-SD-NEXT:    sub w10, w8, w10, sxtb
-; CHECK-SD-NEXT:    fmov s1, w9
-; CHECK-SD-NEXT:    smov w9, v0.b[2]
-; CHECK-SD-NEXT:    mov v1.b[1], w10
-; CHECK-SD-NEXT:    umov w10, v0.b[3]
-; CHECK-SD-NEXT:    mov v1.b[2], w9
-; CHECK-SD-NEXT:    sub w9, w8, w10, sxtb
-; CHECK-SD-NEXT:    umov w10, v0.b[4]
-; CHECK-SD-NEXT:    mov v1.b[3], w9
-; CHECK-SD-NEXT:    sub w9, w8, w10, sxtb
-; CHECK-SD-NEXT:    umov w10, v0.b[5]
-; CHECK-SD-NEXT:    mov v1.b[4], w9
-; CHECK-SD-NEXT:    sub w9, w8, w10, sxtb
-; CHECK-SD-NEXT:    umov w10, v0.b[7]
-; CHECK-SD-NEXT:    mov v1.b[5], w9
-; CHECK-SD-NEXT:    smov w9, v0.b[6]
-; CHECK-SD-NEXT:    mov v1.b[6], w9
-; CHECK-SD-NEXT:    sub w9, w8, w10, sxtb
-; CHECK-SD-NEXT:    umov w10, v0.b[8]
-; CHECK-SD-NEXT:    mov v1.b[7], w9
-; CHECK-SD-NEXT:    sub w8, w8, w10, sxtb
+; CHECK-SD-NEXT:    smov w8, v0.b[0]
+; CHECK-SD-NEXT:    smov w9, v0.b[1]
+; CHECK-SD-NEXT:    neg w8, w8
+; CHECK-SD-NEXT:    neg w9, w9
+; CHECK-SD-NEXT:    fmov s1, w8
+; CHECK-SD-NEXT:    smov w8, v0.b[2]
+; CHECK-SD-NEXT:    mov v1.b[1], w9
+; CHECK-SD-NEXT:    smov w9, v0.b[3]
+; CHECK-SD-NEXT:    mov v1.b[2], w8
+; CHECK-SD-NEXT:    neg w8, w9
+; CHECK-SD-NEXT:    smov w9, v0.b[4]
+; CHECK-SD-NEXT:    mov v1.b[3], w8
+; CHECK-SD-NEXT:    neg w8, w9
+; CHECK-SD-NEXT:    smov w9, v0.b[5]
+; CHECK-SD-NEXT:    mov v1.b[4], w8
+; CHECK-SD-NEXT:    neg w8, w9
+; CHECK-SD-NEXT:    smov w9, v0.b[7]
+; CHECK-SD-NEXT:    mov v1.b[5], w8
+; CHECK-SD-NEXT:    smov w8, v0.b[6]
+; CHECK-SD-NEXT:    mov v1.b[6], w8
+; CHECK-SD-NEXT:    neg w8, w9
+; CHECK-SD-NEXT:    smov w9, v0.b[8]
+; CHECK-SD-NEXT:    mov v1.b[7], w8
+; CHECK-SD-NEXT:    neg w8, w9
 ; CHECK-SD-NEXT:    mov v1.b[8], w8
 ; CHECK-SD-NEXT:    smov w8, v0.b[9]
 ; CHECK-SD-NEXT:    mov v1.b[9], w8
@@ -1154,18 +1153,17 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) {
 define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) {
 ; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform7:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    umov w9, v0.h[0]
-; CHECK-SD-NEXT:    mov w8, wzr
-; CHECK-SD-NEXT:    umov w10, v0.h[1]
-; CHECK-SD-NEXT:    umov w11, v0.h[2]
-; CHECK-SD-NEXT:    sub w9, w8, w9, sxth
-; CHECK-SD-NEXT:    sub w10, w8, w10, sxth
-; CHECK-SD-NEXT:    fmov s1, w9
-; CHECK-SD-NEXT:    sub w9, w8, w11, sxth
-; CHECK-SD-NEXT:    mov v1.h[1], w10
-; CHECK-SD-NEXT:    umov w10, v0.h[3]
-; CHECK-SD-NEXT:    mov v1.h[2], w9
-; CHECK-SD-NEXT:    sub w8, w8, w10, sxth
+; CHECK-SD-NEXT:    smov w8, v0.h[0]
+; CHECK-SD-NEXT:    smov w9, v0.h[1]
+; CHECK-SD-NEXT:    neg w8, w8
+; CHECK-SD-NEXT:    neg w9, w9
+; CHECK-SD-NEXT:    fmov s1, w8
+; CHECK-SD-NEXT:    smov w8, v0.h[2]
+; CHECK-SD-NEXT:    mov v1.h[1], w9
+; CHECK-SD-NEXT:    smov w9, v0.h[3]
+; CHECK-SD-NEXT:    neg w8, w8
+; CHECK-SD-NEXT:    mov v1.h[2], w8
+; CHECK-SD-NEXT:    neg w8, w9
 ; CHECK-SD-NEXT:    mov v1.h[3], w8
 ; CHECK-SD-NEXT:    smov w8, v0.h[4]
 ; CHECK-SD-NEXT:    mov v1.h[4], w8
diff --git a/llvm/test/CodeGen/AArch64/extend_vecreduce_add.ll b/llvm/test/CodeGen/AArch64/extend_vecreduce_add.ll
index fd3fcd12eae81..c6fba2cd56e98 100644
--- a/llvm/test/CodeGen/AArch64/extend_vecreduce_add.ll
+++ b/llvm/test/CodeGen/AArch64/extend_vecreduce_add.ll
@@ -7,8 +7,8 @@ define i32 @vmask_reduce_i32_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: vmask_reduce_i32_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    addv b0, v0.8b
-; CHECK-NEXT:    smov w0, v0.b[0]
+; CHECK-NEXT:    saddlv h0, v0.8b
+; CHECK-NEXT:    smov w0, v0.h[0]
 ; CHECK-NEXT:    ret
   %mask = icmp slt <8 x i8> %a, %b
   %t1 = sext <8 x i1> %mask to <8 x i8>
@@ -21,8 +21,8 @@ define i32 @vmask_reduce_i32_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: vmask_reduce_i32_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    smov w0, v0.b[0]
+; CHECK-NEXT:    saddlv h0, v0.16b
+; CHECK-NEXT:    smov w0, v0.h[0]
 ; CHECK-NEXT:    ret
   %mask = icmp slt <16 x i8> %a, %b
   %t1 = sext <16 x i1> %mask to <16 x i8>
@@ -35,8 +35,8 @@ define i32 @vmask_reduce_i32_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: vmask_reduce_i32_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    addv h0, v0.4h
-; CHECK-NEXT:    smov w0, v0.h[0]
+; CHECK-NEXT:    saddlv s0, v0.4h
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %mask = icmp slt <4 x i16> %a, %b
   %t1 = sext <4 x i1> %mask to <4 x i16>
@@ -49,8 +49,8 @@ define i32 @vmask_reduce_i32_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: vmask_reduce_i32_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    smov w0, v0.h[0]
+; CHECK-NEXT:    saddlv s0, v0.8h
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %mask = icmp slt <8 x i16> %a, %b
   %t1 = sext <8 x i1> %mask to <8 x i16>
@@ -77,8 +77,8 @@ define i64 @vmask_reduce_i64_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: vmask_reduce_i64_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    addv b0, v0.8b
-; CHECK-NEXT:    smov x0, v0.b[0]
+; CHECK-NEXT:    saddlv h0, v0.8b
+; CHECK-NEXT:    smov x0, v0.h[0]
 ; CHECK-NEXT:    ret
   %mask = icmp slt <8 x i8> %a, %b
   %t1 = sext <8 x i1> %mask to <8 x i8>
@@ -91,8 +91,8 @@ define i64 @vmask_reduce_i64_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: vmask_reduce_i64_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    smov x0, v0.b[0]
+; CHECK-NEXT:    saddlv h0, v0.16b
+; CHECK-NEXT:    smov x0, v0.h[0]
 ; CHECK-NEXT:    ret
   %mask = icmp slt <16 x i8> %a, %b
   %t1 = sext <16 x i1> %mask to <16 x i8>
@@ -105,8 +105,8 @@ define i64 @vmask_reduce_i64_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: vmask_reduce_i64_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    addv h0, v0.4h
-; CHECK-NEXT:    smov x0, v0.h[0]
+; CHECK-NEXT:    saddlv s0, v0.4h
+; CHECK-NEXT:    smov x0, v0.s[0]
 ; CHECK-NEXT:    ret
   %mask = icmp slt <4 x i16> %a, %b
   %t1 = sext <4 x i1> %mask to <4 x i16>
@@ -119,8 +119,8 @@ define i64 @vmask_reduce_i64_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: vmask_reduce_i64_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    smov x0, v0.h[0]
+; CHECK-NEXT:    saddlv s0, v0.8h
+; CHECK-NEXT:    smov x0, v0.s[0]
 ; CHECK-NEXT:    ret
   %mask = icmp slt <8 x i16> %a, %b
   %t1 = sext <8 x i1> %mask to <8 x i16>
@@ -148,10 +148,9 @@ define i32 @vmask_popcount_i32_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: vmask_popcount_i32_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    sub w0, w8, w9, sxtb
+; CHECK-NEXT:    saddlv h0, v0.16b
+; CHECK-NEXT:    smov w8, v0.h[0]
+; CHECK-NEXT:    neg w0, w8
 ; CHECK-NEXT:    ret
   %mask = icmp slt <16 x i8> %a, %b
   %t1 = sext <16 x i1> %mask to <16 x i8>
@@ -165,10 +164,9 @@ define i32 @vmask_popcount_i32_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: vmask_popcount_i32_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    addv h0, v0.4h
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    sub w0, w8, w9, sxth
+; CHECK-NEXT:    saddlv s0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    neg w0, w8
 ; CHECK-NEXT:    ret
   %mask = icmp slt <4 x i16> %a, %b
   %t1 = sext <4 x i1> %mask to <4 x i16>
@@ -182,10 +180,9 @@ define i64 @vmask_popcount_i64_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: vmask_popcount_i64_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    sub x0, x8, w9, sxtb
+; CHECK-NEXT:    saddlv h0, v0.16b
+; CHECK-NEXT:    smov x8, v0.h[0]
+; CHECK-NEXT:    neg x0, x8
 ; CHECK-NEXT:    ret
   %mask = icmp slt <16 x i8> %a, %b
   %t1 = sext <16 x i1> %mask to <16 x i8>
@@ -199,10 +196,9 @@ define i64 @vmask_popcount_i64_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: vmask_popcount_i64_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    addv h0, v0.4h
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    sub x0, x8, w9, sxth
+; CHECK-NEXT:    saddlv s0, v0.4h
+; CHECK-NEXT:    smov x8, v0.s[0]
+; CHECK-NEXT:    neg x0, x8
 ; CHECK-NEXT:    ret
   %mask = icmp slt <4 x i16> %a, %b
   %t1 = sext <4 x i1> %mask to <4 x i16>

>From 252cced86a2128f8c6994aebf6606ded3c8dfecf Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 3 Mar 2026 14:56:08 +0000
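Subject: [PATCH 3/3] Fixups

- Fix a typo in the SelectArithExtendedRegister comment.
- Move isSignExtendShiftType into AArch64AddressingModes.h (it now also
  covers SXTX) and use it from both SelectionDAG and GlobalISel.
- Add a doc comment to performVecReduceAddToSADDLVCombine.
- Use getVectorIdxConstant for the EXTRACT_VECTOR_ELT index.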

---
 llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp     |  4 ++--
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp     |  5 ++++-
 .../AArch64/GISel/AArch64InstructionSelector.cpp    | 13 +------------
 .../AArch64/MCTargetDesc/AArch64AddressingModes.h   | 13 +++++++++++++
 4 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 4d2b0e86fdb10..2a7e53216c62b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -990,9 +990,9 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
       return false;
 
     // Don't match sext of vector extracts. These can use SMOV, but if we match
-    // this an extended register, we'll always fold the extend into an ALU op
+    // this as an extended register, we'll always fold the extend into an ALU op
     // user of the extend (which results in a UMOV).
-    if (Ext >= AArch64_AM::SXTB) {
+    if (AArch64_AM::isSignExtendShiftType(Ext)) {
       SDValue Op = N.getOperand(0);
       if (Op->getOpcode() == ISD::ANY_EXTEND)
         Op = Op->getOperand(0);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4b89b780cb54c..b96b575f4c8c3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -28135,6 +28135,9 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
   return DAG.getMergeValues({Load, LoadChain}, DL);
 }
 
+/// Attempts to fold SEXT_IN_REG(VECREDUCE_ADD(SETcc)) to SADDLV(SETcc).
+/// Note: With legal types a VECREDUCE_ADD of SETcc won't overflow so is
+/// equivalent to SADDLV(SETcc).
 static SDValue performVecReduceAddToSADDLVCombine(
     SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
   if (DCI.isBeforeLegalize())
@@ -28170,7 +28173,7 @@ static SDValue performVecReduceAddToSADDLVCombine(
   SDValue SADDLV = DAG.getNode(AArch64ISD::SADDLV, DL, MVT::v4i32, Vec);
   SADDLV = DAG.getNode(AArch64ISD::NVCAST, DL, WideVecVT, SADDLV);
   SDValue Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SADDLV,
-                               DAG.getConstant(0, DL, MVT::i64));
+                               DAG.getVectorIdxConstant(0, DL));
   Result = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Result);
   return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Result,
                      DAG.getValueType(WideEltVT));
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 32c91831d9fb7..6cc715c54024c 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -7188,17 +7188,6 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
                 [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
 }
 
-static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
-  switch (Type) {
-  case AArch64_AM::SXTB:
-  case AArch64_AM::SXTH:
-  case AArch64_AM::SXTW:
-    return true;
-  default:
-    return false;
-  }
-}
-
 InstructionSelector::ComplexRendererFns
 AArch64InstructionSelector::selectExtendedSHL(
     MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
@@ -7281,7 +7270,7 @@ AArch64InstructionSelector::selectExtendedSHL(
       if (Ext == AArch64_AM::InvalidShiftExtend)
         return std::nullopt;
 
-      SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
+      SignExtend = AArch64_AM::isSignExtendShiftType(Ext) ? 1 : 0;
       // We only support SXTW for signed extension here.
       if (SignExtend && Ext != AArch64_AM::SXTW)
         return std::nullopt;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 1d3f089e2130b..0492fdf3442c1 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -48,6 +48,19 @@ enum ShiftExtendType {
   SXTX,
 };
 
+/// isSignExtendShiftType - Returns true for SXTB, SXTH, SXTW and SXTX.
+static inline bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
+  switch (Type) {
+  case AArch64_AM::SXTB:
+  case AArch64_AM::SXTH:
+  case AArch64_AM::SXTW:
+  case AArch64_AM::SXTX:
+    return true;
+  default:
+    return false;
+  }
+}
+
 /// getShiftName - Get the string encoding for the shift type.
 static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) {
   switch (ST) {