[llvm] [AArch64] optimize manual addp (PR #181549)

Sun Feb 15 06:06:55 PST 2026

llvmbot wrote:




@llvm/pr-subscribers-backend-aarch64

Author: Folkert de Vries (folkertdev)

<details>
<summary>Changes</summary>

The manually written `addp` pattern was not recognized before. There is existing logic to recognize `addp`, but it only fires for wider vectors, and only when the shift pattern takes a different form that, for the smaller vector widths, never makes it to isel.

```
vpadd_s16_intrin:
        addp    v0.4h, v0.4h, v1.4h
        ret

vpadd_s16_manual:
        uzp1    v2.4h, v0.4h, v1.4h
        uzp2    v0.4h, v0.4h, v1.4h
        add     v0.4h, v2.4h, v0.4h
        ret
```

So, add some logic to catch an add of the two deinterleaving shuffles and convert it to `addp`.

---
Full diff: https://github.com/llvm/llvm-project/pull/181549.diff


2 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+38) 
- (modified) llvm/test/CodeGen/AArch64/addp-shuffle.ll (+84) 


``````````diff

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2b2e057c80373..d5cf4102e1873 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23022,6 +23022,42 @@ static SDValue performAddTruncShiftCombine(SDNode *N, SelectionDAG &DAG) {
   return DAG.getNode(ISD::ADD, DL, VT, Trunc, Shift);
 }
 
+static SDValue performAddIntoAddP(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() != ISD::ADD)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || VT.isScalableVector() ||
+      !VT.getScalarType().isInteger())
+    return SDValue();
+
+  unsigned VBits = VT.getSizeInBits();
+  if (VBits != 64 && VBits != 128)
+    return SDValue();
+
+  // Check whether the two shuffles deinterleave, so one mask is all even
+  // indices, the other all odd indices of a vector.
+  auto *S0 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
+  auto *S1 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
+  if (!S0 || !S1)
+    return SDValue();
+
+  if (S0->getOperand(0) != S1->getOperand(0) ||
+      S0->getOperand(1) != S1->getOperand(1))
+    return SDValue();
+
+  ArrayRef<int> M0 = S0->getMask(), M1 = S1->getMask();
+  auto Is = [](ArrayRef<int> M, unsigned I) {
+    return ShuffleVectorInst::isDeInterleaveMaskOfFactor(M, 2, I);
+  };
+  if (!((Is(M0, 0) && Is(M1, 1)) || (Is(M0, 1) && Is(M1, 0))))
+    return SDValue();
+
+  SDLoc DL(N);
+  return DAG.getNode(AArch64ISD::ADDP, DL, VT, S0->getOperand(0),
+                     S0->getOperand(1));
+}
+
 static SDValue performAddSubCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
   // Try to change sum of two reductions.
@@ -23047,6 +23083,8 @@ static SDValue performAddSubCombine(SDNode *N,
     return Val;
   if (SDValue Val = performAddTruncShiftCombine(N, DCI.DAG))
     return Val;
+  if (SDValue Val = performAddIntoAddP(N, DCI.DAG))
+    return Val;
 
   if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
     return Val;
diff --git a/llvm/test/CodeGen/AArch64/addp-shuffle.ll b/llvm/test/CodeGen/AArch64/addp-shuffle.ll
index 54c96820285d3..9b62addaef385 100644
--- a/llvm/test/CodeGen/AArch64/addp-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/addp-shuffle.ll
@@ -208,3 +208,87 @@ define <4 x i32> @sdot(<4 x i32> %z, <16 x i8> %a, <16 x i8> %b) {
   %n = add <4 x i32> %z, %o
   ret <4 x i32> %n
 }
+
+define <8 x i8> @manual_addp_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: manual_addp_v8i8:
+; CHECK:       // %bb.0: // %start
+; CHECK-NEXT:    addp v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+start:
+  %0 = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %1 = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %2 = add <8 x i8> %0, %1
+  ret <8 x i8> %2
+}
+
+define <4 x i16> @manual_addp_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: manual_addp_v4i16:
+; CHECK:       // %bb.0: // %start
+; CHECK-NEXT:    addp v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+start:
+  %0 = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %1 = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %2 = add <4 x i16> %0, %1
+  ret <4 x i16> %2
+}
+
+define <2 x i32> @manual_addp_v2i32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: manual_addp_v2i32:
+; CHECK:       // %bb.0: // %start
+; CHECK-NEXT:    addp v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
+start:
+  %0 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+  %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+  %2 = add <2 x i32> %0, %1
+  ret <2 x i32> %2
+}
+
+define <16 x i8> @manual_addp_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: manual_addp_v16i8:
+; CHECK:       // %bb.0: // %start
+; CHECK-NEXT:    addp v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+start:
+  %0 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %2 = add <16 x i8> %0, %1
+  ret <16 x i8> %2
+}
+
+define <8 x i16> @manual_addp_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: manual_addp_v8i16:
+; CHECK:       // %bb.0: // %start
+; CHECK-NEXT:    addp v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+start:
+  %0 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %2 = add <8 x i16> %0, %1
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @manual_addp_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: manual_addp_v4i32:
+; CHECK:       // %bb.0: // %start
+; CHECK-NEXT:    addp v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+start:
+  %0 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %2 = add <4 x i32> %0, %1
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @manual_addp_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: manual_addp_v2i64:
+; CHECK:       // %bb.0: // %start
+; CHECK-NEXT:    addp v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
+start:
+  %0 = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+  %1 = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+  %2 = add <2 x i64> %0, %1
+  ret <2 x i64> %2
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/181549