[llvm] [AArch64] Expand UADDLV patterns to handle two-step i8->i16->i32 extends (PR #146078)

Tue Jul 8 09:03:35 PDT 2025

https://github.com/igogo-x86 updated https://github.com/llvm/llvm-project/pull/146078

>From 13ddd5abb115af2b65bfe1d9311e88a789fae80f Mon Sep 17 00:00:00 2001
From: Igor Kirillov <igor.kirillov at arm.com>
Date: Fri, 27 Jun 2025 13:43:35 +0000
Subject: [PATCH 1/2] [AArch64] Expand UADDLV patterns to handle two-step
 i8->i16->i32 extends

Closes #142961
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 35 +++++++++++++++----
 llvm/test/CodeGen/AArch64/neon-sad.ll         | 22 ++++++++++++
 2 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index cdb68684b3856..3afdfc74d0ec1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18052,6 +18052,15 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
 //  v16i32 abs(
 //    v16i32 sub(
 //     v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
+//
+//  or
+//
+// i32 vecreduce_add(
+//  v16i32 zext(
+//   v16i16 abs(
+//    v16i16 sub(
+//     v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
+//
 // =================>
 // i32 vecreduce_add(
 //   v4i32 UADDLP(
@@ -18067,23 +18076,35 @@ static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
     return SDValue();
 
   SDValue VecReduceOp0 = N->getOperand(0);
+  bool SawTrailingZext = false;
+  // Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
+  if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
+      VecReduceOp0->getValueType(0) == MVT::v16i32 &&
+      VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
+      VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
+    SawTrailingZext = true;
+    VecReduceOp0 = VecReduceOp0.getOperand(0);
+  }
+
+  // Peel off an optional post-ABS extend (v16i16 -> v16i32).
+  MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
+  // Assumed v16i16 or v16i32 abs input
   unsigned Opcode = VecReduceOp0.getOpcode();
-  // Assumed v16i32 abs
-  if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
+  if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
     return SDValue();
 
   SDValue ABS = VecReduceOp0;
-  // Assumed v16i32 sub
+  // Assumed v16i16 or v16i32 sub
   if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
-      ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
+      ABS->getOperand(0)->getValueType(0) != AbsInputVT)
     return SDValue();
 
   SDValue SUB = ABS->getOperand(0);
   unsigned Opcode0 = SUB->getOperand(0).getOpcode();
   unsigned Opcode1 = SUB->getOperand(1).getOpcode();
-  // Assumed v16i32 type
-  if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
-      SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
+  // Assumed v16i16 or v16i32 type
+  if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
+      SUB->getOperand(1)->getValueType(0) != AbsInputVT)
     return SDValue();
 
   // Assumed zext or sext
diff --git a/llvm/test/CodeGen/AArch64/neon-sad.ll b/llvm/test/CodeGen/AArch64/neon-sad.ll
index c0cfe8d8ca3cc..1f0ca9ca7ad36 100644
--- a/llvm/test/CodeGen/AArch64/neon-sad.ll
+++ b/llvm/test/CodeGen/AArch64/neon-sad.ll
@@ -45,3 +45,25 @@ entry:
   %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
   ret i32 %6
 }
+
+define i32 @test_sad_v16i8_two_step_zext(ptr noundef readonly %a, ptr noundef readonly %b) {
+; CHECK-LABEL: test_sad_v16i8_two_step_zext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    uabdl v2.8h, v1.8b, v0.8b
+; CHECK-NEXT:    uabal2 v2.8h, v1.16b, v0.16b
+; CHECK-NEXT:    uaddlv s0, v2.8h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %0 = load <16 x i8>, ptr %a
+  %1 = zext <16 x i8> %0 to <16 x i16>
+  %2 = load <16 x i8>, ptr %b
+  %3 = zext <16 x i8> %2 to <16 x i16>
+  %4 = sub nsw <16 x i16> %3, %1
+  %5 = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %4, i1 false)
+  %6 = zext <16 x i16> %5 to <16 x i32>
+  %7 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
+  ret i32 %7
+}

>From c830ff2706da1723f89bd84249e6ce91d5040556 Mon Sep 17 00:00:00 2001
From: Igor Kirillov <igor.kirillov at arm.com>
Date: Tue, 8 Jul 2025 16:01:48 +0000
Subject: [PATCH 2/2] Add test

---
 llvm/test/CodeGen/AArch64/neon-sad.ll | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/neon-sad.ll b/llvm/test/CodeGen/AArch64/neon-sad.ll
index 1f0ca9ca7ad36..f21fafea50451 100644
--- a/llvm/test/CodeGen/AArch64/neon-sad.ll
+++ b/llvm/test/CodeGen/AArch64/neon-sad.ll
@@ -67,3 +67,25 @@ entry:
   %7 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
   ret i32 %7
 }
+
+define i32 @test_sad_v16i8_two_step_sext(ptr noundef readonly %a, ptr noundef readonly %b) {
+; CHECK-LABEL: test_sad_v16i8_two_step_sext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    sabdl v2.8h, v1.8b, v0.8b
+; CHECK-NEXT:    sabal2 v2.8h, v1.16b, v0.16b
+; CHECK-NEXT:    uaddlv s0, v2.8h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %0 = load <16 x i8>, ptr %a
+  %1 = sext <16 x i8> %0 to <16 x i16>
+  %2 = load <16 x i8>, ptr %b
+  %3 = sext <16 x i8> %2 to <16 x i16>
+  %4 = sub nsw <16 x i16> %3, %1
+  %5 = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %4, i1 false)
+  %6 = zext <16 x i16> %5 to <16 x i32>
+  %7 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
+  ret i32 %7
+}