[llvm-branch-commits] [llvm] 0db6ae1 - [AArch64] Fix partial_reduce v16i8 -> v2i32 (#177119)
Cullen Rhodes via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Jan 22 23:46:10 PST 2026
Author: Sander de Smalen
Date: 2026-01-23T07:45:13Z
New Revision: 0db6ae11b8852ec030ad480b4bdae48fd0dfbd79
URL: https://github.com/llvm/llvm-project/commit/0db6ae11b8852ec030ad480b4bdae48fd0dfbd79
DIFF: https://github.com/llvm/llvm-project/commit/0db6ae11b8852ec030ad480b4bdae48fd0dfbd79.diff
LOG: [AArch64] Fix partial_reduce v16i8 -> v2i32 (#177119)
The lowering doesn't need to check for `ConvertToScalable`, because it
lowers to another `PARTIAL_REDUCE_*MLA` node, which is subsequently
lowered using either fixed-length or scalable types.
This fixes https://github.com/llvm/llvm-project/issues/176954
Re-generate check lines
The check lines for SME differ from those on trunk because sub-register
liveness is enabled for streaming functions on trunk, but is not enabled on
the release branch.
(cherry picked from commit de997639876db38d20c7ed9fb0c683a239d56bf5)
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7789f6d48a41e..d99aeafd1869e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -31537,13 +31537,9 @@ AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
EVT OrigResultVT = ResultVT;
EVT OpVT = LHS.getValueType();
- bool ConvertToScalable =
- ResultVT.isFixedLengthVector() &&
- useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true);
-
// We can handle this case natively by accumulating into a wider
// zero-padded vector.
- if (!ConvertToScalable && ResultVT == MVT::v2i32 && OpVT == MVT::v16i8) {
+ if (ResultVT == MVT::v2i32 && OpVT == MVT::v16i8) {
SDValue ZeroVec = DAG.getConstant(0, DL, MVT::v4i32);
SDValue WideAcc = DAG.getInsertSubvector(DL, ZeroVec, Acc, 0);
SDValue Wide =
@@ -31552,6 +31548,10 @@ AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
return DAG.getExtractSubvector(DL, MVT::v2i32, Reduced, 0);
}
+ bool ConvertToScalable =
+ ResultVT.isFixedLengthVector() &&
+ useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true);
+
if (ConvertToScalable) {
ResultVT = getContainerForFixedLengthVector(DAG, ResultVT);
OpVT = getContainerForFixedLengthVector(DAG, LHS.getValueType());
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index da0c01f13b960..d648b82cc1c96 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -1299,3 +1299,49 @@ entry:
%partial.reduce = tail call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
ret <vscale x 4 x i32> %partial.reduce
}
+
+define <2 x i32> @udot_v16i8tov2i32(<2 x i32> %acc, <16 x i8> %input) "target-features"="+dotprod" {
+; CHECK-SVE2-LABEL: udot_v16i8tov2i32:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: movi v2.16b, #1
+; CHECK-SVE2-NEXT: fmov d0, d0
+; CHECK-SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-SVE2-NEXT: udot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT: addp v0.4s, v0.4s, v0.4s
+; CHECK-SVE2-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SVE2-NEXT: ret
+;
+; CHECK-SVE2-I8MM-LABEL: udot_v16i8tov2i32:
+; CHECK-SVE2-I8MM: // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT: movi v2.16b, #1
+; CHECK-SVE2-I8MM-NEXT: fmov d0, d0
+; CHECK-SVE2-I8MM-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-SVE2-I8MM-NEXT: udot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT: addp v0.4s, v0.4s, v0.4s
+; CHECK-SVE2-I8MM-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SVE2-I8MM-NEXT: ret
+;
+; CHECK-SME-LABEL: udot_v16i8tov2i32:
+; CHECK-SME: // %bb.0: // %entry
+; CHECK-SME-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-SME-NEXT: uunpklo z2.h, z1.b
+; CHECK-SME-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-SME-NEXT: ext z1.b, z1.b, z1.b, #8
+; CHECK-SME-NEXT: uaddwb z0.s, z0.s, z2.h
+; CHECK-SME-NEXT: uunpklo z1.h, z1.b
+; CHECK-SME-NEXT: uaddwt z0.s, z0.s, z2.h
+; CHECK-SME-NEXT: ext z2.b, z2.b, z2.b, #8
+; CHECK-SME-NEXT: uaddwb z0.s, z0.s, z2.h
+; CHECK-SME-NEXT: uaddwt z0.s, z0.s, z2.h
+; CHECK-SME-NEXT: uaddwb z0.s, z0.s, z1.h
+; CHECK-SME-NEXT: uaddwt z0.s, z0.s, z1.h
+; CHECK-SME-NEXT: ext z1.b, z1.b, z1.b, #8
+; CHECK-SME-NEXT: uaddwb z0.s, z0.s, z1.h
+; CHECK-SME-NEXT: uaddwt z0.s, z0.s, z1.h
+; CHECK-SME-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-SME-NEXT: ret
+entry:
+ %input.wide = zext <16 x i8> %input to <16 x i32>
+ %partial.reduce = tail call <2 x i32> @llvm.vector.partial.reduce.add(<2 x i32> %acc, <16 x i32> %input.wide)
+ ret <2 x i32> %partial.reduce
+}
More information about the llvm-branch-commits mailing list