[llvm] [AArch64] Use `AArch64ISD::UADDLP` for manually widened adjacent arithmetic (zext/shuffle combinations) (PR #189255)
Rajveer Singh Bharadwaj via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 1 04:48:47 PDT 2026
https://github.com/Rajveer100 updated https://github.com/llvm/llvm-project/pull/189255
From f9738af40ba3375d8c9dd595cb62fd836e12c0fe Mon Sep 17 00:00:00 2001
From: Rajveer <rajveer.developer at icloud.com>
Date: Sun, 29 Mar 2026 21:20:24 +0530
Subject: [PATCH] [AArch64] Use `AArch64ISD::UADDLP` for manually widened
 adjacent arithmetic (zext/shuffle combinations)
Resolves #181490
This optimises the following patterns
into a single `uaddlp` (see the IR example below):
- add(shuffle_vec(zext(v0, ...), ...), shuffle_vec(zext(v0, ...), ...))
- add(zext(shuffle_vec(v0, ...), ...), zext(shuffle_vec(v0, ...), ...))
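For example, the shuffle-then-extend form (mirroring the new uaddlp.ll
test; the value names here are illustrative) now lowers to a single
`uaddlp v0.8h, v0.16b`:

  %even = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %odd = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %ze = zext <8 x i8> %even to <8 x i16>
  %zo = zext <8 x i8> %odd to <8 x i16>
  %sum = add nuw nsw <8 x i16> %ze, %zo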
---
.../Target/AArch64/AArch64ISelLowering.cpp | 128 ++++++++++++++++++
llvm/test/CodeGen/AArch64/addp-shuffle.ll | 8 +-
llvm/test/CodeGen/AArch64/uaddlp.ll | 80 +++++++++++
3 files changed, 210 insertions(+), 6 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/uaddlp.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 38db1ac4a2fb9..06ceb33e7f260 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22310,6 +22310,132 @@ static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
return DAG.getNode(AArch64ISD::CSEL, DL, VT, RHS, LHS, CCVal, Cmp);
}
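+// Returns true if Op0 and Op1 are shuffles of the same source vector that,
+// between them, select its even and odd lanes (in either order).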
+static bool isEvenOddShufflePair(SDValue Op0, SDValue Op1) {
+ auto *S0 = dyn_cast<ShuffleVectorSDNode>(Op0);
+ auto *S1 = dyn_cast<ShuffleVectorSDNode>(Op1);
+
+ if (!S0 || !S1)
+ return false;
+
+ if (S0->getOperand(0) != S1->getOperand(0))
+ return false;
+
+ ArrayRef<int> Mask0 = S0->getMask();
+ ArrayRef<int> Mask1 = S1->getMask();
+
+ if (Mask0.size() != Mask1.size())
+ return false;
+
+  auto IsValidSequence = [](ArrayRef<int> Mask0, ArrayRef<int> Mask1) {
+    for (unsigned I = 0; I < Mask0.size(); ++I) {
+      if (Mask0[I] != (int)(I * 2) || Mask1[I] != (int)(I * 2 + 1))
+        return false;
+    }
+    return true;
+  };
+
+ return IsValidSequence(Mask0, Mask1) || IsValidSequence(Mask1, Mask0);
+}
+
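+// If Op0 and Op1 are build_vectors whose elements, between them, extract the
+// even and odd lanes of a single source vector (in either order), returns
+// that source vector; otherwise returns an empty SDValue.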
+static SDValue isEvenOddBuildVector(SDValue Op0, SDValue Op1) {
+ using namespace llvm::SDPatternMatch;
+
+ auto *B0 = dyn_cast<BuildVectorSDNode>(Op0);
+ auto *B1 = dyn_cast<BuildVectorSDNode>(Op1);
+
+ if (!B0 || !B1)
+ return SDValue();
+
+ if (B0->getNumOperands() != B1->getNumOperands())
+ return SDValue();
+
+ auto IsValidSequence = [](BuildVectorSDNode *B0, BuildVectorSDNode *B1) {
+ SDValue Src;
+    for (unsigned I = 0; I < B0->getNumOperands(); ++I) {
+ SDValue ExtVecElt0 = B0->getOperand(I);
+ SDValue ExtVecElt1 = B1->getOperand(I);
+
+ if (!ExtVecElt0 || !ExtVecElt1)
+ return SDValue();
+
+ SDValue Vec;
+ APInt Idx0, Idx1;
+ if (!sd_match(ExtVecElt0, m_ExtractElt(m_Value(Vec), m_ConstInt(Idx0))) ||
+ !sd_match(ExtVecElt1,
+ m_ExtractElt(m_Specific(Vec), m_ConstInt(Idx1))))
+ return SDValue();
+
+ if (!Src)
+ Src = Vec;
+
+ if (Src != Vec)
+ return SDValue();
+
+ if (Idx0 != (I * 2) || Idx1 != (I * 2 + 1))
+ return SDValue();
+ }
+ return Src;
+ };
+
+ if (SDValue Src = IsValidSequence(B0, B1))
+ return Src;
+
+ if (SDValue Src = IsValidSequence(B1, B0))
+ return Src;
+
+ return SDValue();
+}
+
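+// Fold add(shuffle(zext(x)), shuffle(zext(x))) and
+// add(zext(shuffle(x)), zext(shuffle(x))), where the shuffle pair selects
+// the even and odd lanes of x, into uaddlp(x).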
+static SDValue performVectorReduceWithExtAndShuffleToUADDLP(SDNode *N,
+ SelectionDAG &DAG) {
+  if (N->getOpcode() != ISD::ADD)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ if (Op0.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ SDValue Z0 = Op0.getOperand(0);
+ SDValue Z1 = Op1.getOperand(0);
+
+ if (Z0 == Z1 && Z0.getOpcode() == ISD::ZERO_EXTEND) {
+ if (isEvenOddShufflePair(Op0, Op1)) {
+ SDValue Src = Z0.getOperand(0);
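+        // Look through an extract_subvector so the uaddlp reads the
+        // original wide source directly.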
+ if (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+ Src = Src.getOperand(0);
+ return DAG.getNode(AArch64ISD::UADDLP, DL, VT, Src);
+ }
+ }
+ }
+
+ if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
+ Op1.getOpcode() == ISD::ZERO_EXTEND) {
+ SDValue S0 = Op0.getOperand(0);
+ SDValue S1 = Op1.getOperand(0);
+
+ if (S0.getOpcode() == ISD::BUILD_VECTOR &&
+ S1.getOpcode() == ISD::BUILD_VECTOR) {
+      if (SDValue Src = isEvenOddBuildVector(S0, S1))
+        return DAG.getNode(AArch64ISD::UADDLP, DL, VT, Src);
+ }
+
+ if (S0.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ S1.getOpcode() == ISD::VECTOR_SHUFFLE) {
+      if (isEvenOddShufflePair(S0, S1))
+        return DAG.getNode(AArch64ISD::UADDLP, DL, VT, S0.getOperand(0));
+ }
+ }
+
+ return SDValue();
+}
+
// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
@@ -23416,6 +23542,8 @@ static SDValue performAddWithSBCCombine(SDNode *N, SelectionDAG &DAG) {
static SDValue performAddSubCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
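+  // Try to fold an even/odd widening pair-add into uaddlp.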
+ if (SDValue Val = performVectorReduceWithExtAndShuffleToUADDLP(N, DCI.DAG))
+ return Val;
// Try to change sum of two reductions.
if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
return Val;
diff --git a/llvm/test/CodeGen/AArch64/addp-shuffle.ll b/llvm/test/CodeGen/AArch64/addp-shuffle.ll
index 7ba01adad011c..0756ebffca5c4 100644
--- a/llvm/test/CodeGen/AArch64/addp-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/addp-shuffle.ll
@@ -160,12 +160,8 @@ define <4 x i32> @udot(<4 x i32> %z, <16 x i8> %a, <16 x i8> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: umull v3.8h, v1.8b, v2.8b
; CHECK-NEXT: umull2 v1.8h, v1.16b, v2.16b
-; CHECK-NEXT: ushll2 v2.4s, v3.8h, #0
-; CHECK-NEXT: ushll v3.4s, v3.4h, #0
-; CHECK-NEXT: ushll2 v4.4s, v1.8h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s
-; CHECK-NEXT: addp v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: uaddlp v1.4s, v1.8h
+; CHECK-NEXT: uaddlp v2.4s, v3.8h
; CHECK-NEXT: addp v1.4s, v2.4s, v1.4s
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/uaddlp.ll b/llvm/test/CodeGen/AArch64/uaddlp.ll
new file mode 100644
index 0000000000000..00c60d78c3fef
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/uaddlp.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple aarch64-none-linux-gnu < %s | FileCheck %s
+
+define <8 x i16> @vpaddlq_u8_v1(<16 x i8> %a) {
+; CHECK-LABEL: vpaddlq_u8_v1:
+; CHECK: // %bb.0: // %start
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
+; CHECK-NEXT: ret
+start:
+ %_0 = tail call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %a)
+ ret <8 x i16> %_0
+}
+
+define <8 x i16> @vpaddlq_u8_v2_widen_shuffle_add(<16 x i8> %a) {
+; CHECK-LABEL: vpaddlq_u8_v2_widen_shuffle_add:
+; CHECK: // %bb.0: // %start
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
+; CHECK-NEXT: ret
+start:
+ %0 = zext <16 x i8> %a to <16 x i16>
+ %1 = shufflevector <16 x i16> %0, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %2 = shufflevector <16 x i16> %0, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %3 = add nuw nsw <8 x i16> %1, %2
+ ret <8 x i16> %3
+}
+
+define range(i16 0, 511) <8 x i16> @vpaddlq_u8_v3_shuffle_widen_add(<16 x i8> %a) {
+; CHECK-LABEL: vpaddlq_u8_v3_shuffle_widen_add:
+; CHECK: // %bb.0: // %start
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
+; CHECK-NEXT: ret
+start:
+ %0 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %1 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %2 = zext <8 x i8> %0 to <8 x i16>
+ %3 = zext <8 x i8> %1 to <8 x i16>
+ %4 = add nuw nsw <8 x i16> %2, %3
+ ret <8 x i16> %4
+}
+
+define <8 x i16> @vpaddlq_u8_v2_widen_shuffle_add_neg(<16 x i8> %a) {
+; CHECK-LABEL: vpaddlq_u8_v2_widen_shuffle_add_neg:
+; CHECK: // %bb.0: // %start
+; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0
+; CHECK-NEXT: adrp x8, .LCPI3_0
+; CHECK-NEXT: adrp x9, .LCPI3_1
+; CHECK-NEXT: ushll v1.8h, v0.8b, #0
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI3_1]
+; CHECK-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
+; CHECK-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b
+; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+start:
+ %0 = zext <16 x i8> %a to <16 x i16>
+ %1 = shufflevector <16 x i16> %0, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 11, i32 10, i32 12, i32 14>
+ %2 = shufflevector <16 x i16> %0, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 8, i32 13, i32 15>
+ %3 = add nuw nsw <8 x i16> %1, %2
+ ret <8 x i16> %3
+}
+
+define range(i16 0, 511) <8 x i16> @vpaddlq_u8_v3_shuffle_widen_add_neg(<16 x i8> %a) {
+; CHECK-LABEL: vpaddlq_u8_v3_shuffle_widen_add_neg:
+; CHECK: // %bb.0: // %start
+; CHECK-NEXT: adrp x8, .LCPI4_0
+; CHECK-NEXT: adrp x9, .LCPI4_1
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT: ldr d2, [x9, :lo12:.LCPI4_1]
+; CHECK-NEXT: tbl v1.8b, { v0.16b }, v1.8b
+; CHECK-NEXT: tbl v0.8b, { v0.16b }, v2.8b
+; CHECK-NEXT: uaddl v0.8h, v1.8b, v0.8b
+; CHECK-NEXT: ret
+start:
+ %0 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 0, i32 3, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %1 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 1, i32 2, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %2 = zext <8 x i8> %0 to <8 x i16>
+ %3 = zext <8 x i8> %1 to <8 x i16>
+ %4 = add nuw nsw <8 x i16> %2, %3
+ ret <8 x i16> %4
+}