[llvm] [AArch64] Expand UADDLV patterns to handle two-step i8->i16->i32 extends (PR #146078)
Igor Kirillov via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 27 06:47:05 PDT 2025
https://github.com/igogo-x86 created https://github.com/llvm/llvm-project/pull/146078
Should help #142961
From 37c63b78b87785d2638879defbbdc9587fa4be00 Mon Sep 17 00:00:00 2001
From: Igor Kirillov <igor.kirillov at arm.com>
Date: Fri, 27 Jun 2025 13:43:35 +0000
Subject: [PATCH] [AArch64] Expand UADDLV patterns to handle two-step
i8->i16->i32 extends
Closes #142961
---
.../Target/AArch64/AArch64ISelLowering.cpp | 35 +++++++++++++++----
llvm/test/CodeGen/AArch64/neon-sad.ll | 22 ++++++++++++
2 files changed, 50 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 13835747c91e5..59e761ebbe188 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18024,6 +18024,15 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
// v16i32 abs(
// v16i32 sub(
// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
+//
+// or
+//
+// i32 vecreduce_add(
+// v16i32 zext(
+// v16i16 abs(
+// v16i16 sub(
+// v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
+//
// =================>
// i32 vecreduce_add(
// v4i32 UADDLP(
@@ -18039,23 +18048,35 @@ static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
return SDValue();
SDValue VecReduceOp0 = N->getOperand(0);
+ bool SawTrailingZext = false;
+ // Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
+ if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
+ VecReduceOp0->getValueType(0) == MVT::v16i32 &&
+ VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
+ VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
+ SawTrailingZext = true;
+ VecReduceOp0 = VecReduceOp0.getOperand(0);
+ }
+
+ // Peel off an optional post-ABS extend (v16i16 -> v16i32).
+ MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
+ // Assumed v16i16 or v16i32 abs input
unsigned Opcode = VecReduceOp0.getOpcode();
- // Assumed v16i32 abs
- if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
+ if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
return SDValue();
SDValue ABS = VecReduceOp0;
- // Assumed v16i32 sub
+ // Assumed v16i16 or v16i32 sub
if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
- ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
+ ABS->getOperand(0)->getValueType(0) != AbsInputVT)
return SDValue();
SDValue SUB = ABS->getOperand(0);
unsigned Opcode0 = SUB->getOperand(0).getOpcode();
unsigned Opcode1 = SUB->getOperand(1).getOpcode();
- // Assumed v16i32 type
- if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
- SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
+ // Assumed v16i16 or v16i32 type
+ if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
+ SUB->getOperand(1)->getValueType(0) != AbsInputVT)
return SDValue();
// Assumed zext or sext
diff --git a/llvm/test/CodeGen/AArch64/neon-sad.ll b/llvm/test/CodeGen/AArch64/neon-sad.ll
index c0cfe8d8ca3cc..1f0ca9ca7ad36 100644
--- a/llvm/test/CodeGen/AArch64/neon-sad.ll
+++ b/llvm/test/CodeGen/AArch64/neon-sad.ll
@@ -45,3 +45,25 @@ entry:
%6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
ret i32 %6
}
+
+define i32 @test_sad_v16i8_two_step_zext(ptr noundef readonly %a, ptr noundef readonly %b) {
+; CHECK-LABEL: test_sad_v16i8_two_step_zext:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: uabdl v2.8h, v1.8b, v0.8b
+; CHECK-NEXT: uabal2 v2.8h, v1.16b, v0.16b
+; CHECK-NEXT: uaddlv s0, v2.8h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %0 = load <16 x i8>, ptr %a
+ %1 = zext <16 x i8> %0 to <16 x i16>
+ %2 = load <16 x i8>, ptr %b
+ %3 = zext <16 x i8> %2 to <16 x i16>
+ %4 = sub nsw <16 x i16> %3, %1
+ %5 = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %4, i1 false)
+ %6 = zext <16 x i16> %5 to <16 x i32>
+ %7 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
+ ret i32 %7
+}
More information about the llvm-commits
mailing list