[llvm] [SelectionDAG][x86] Ensure vector reduction optimization (PR #144231)
Suhajda Tamás via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 15 11:43:27 PST 2026
https://github.com/sutajo updated https://github.com/llvm/llvm-project/pull/144231
>From 5a6b5173136ddb1d8b53658c7ea1bd1601eea7fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Suhajda=20Tam=C3=A1s?= <sutajo at gmail.com>
Date: Sat, 14 Jun 2025 19:04:49 +0200
Subject: [PATCH 1/3] [x86] Add test for reduction
---
llvm/test/CodeGen/X86/optimize-reduction.ll | 140 ++++++++++++++++++++
1 file changed, 140 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/optimize-reduction.ll
diff --git a/llvm/test/CodeGen/X86/optimize-reduction.ll b/llvm/test/CodeGen/X86/optimize-reduction.ll
new file mode 100644
index 0000000000000..003c41612b8bf
--- /dev/null
+++ b/llvm/test/CodeGen/X86/optimize-reduction.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1,+fast-hops | FileCheck %s --check-prefixes=SSE41
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-hops | FileCheck %s --check-prefixes=AVX2
+
+define { i16, i16 } @test_reduce_v16i16_with_umin(<16 x i16> %x, <16 x i16> %y) {
+; SSE41-LABEL: test_reduce_v16i16_with_umin:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pminuw %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE41-NEXT: pminuw %xmm4, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; SSE41-NEXT: pminuw %xmm5, %xmm6
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: psrld $16, %xmm5
+; SSE41-NEXT: pminuw %xmm6, %xmm5
+; SSE41-NEXT: phminposuw %xmm4, %xmm4
+; SSE41-NEXT: movd %xmm4, %eax
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE41-NEXT: pcmpeqw %xmm4, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE41-NEXT: pxor %xmm5, %xmm1
+; SSE41-NEXT: por %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: pminuw %xmm1, %xmm0
+; SSE41-NEXT: phminposuw %xmm0, %xmm0
+; SSE41-NEXT: movd %xmm0, %edx
+; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT: # kill: def $dx killed $dx killed $edx
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: test_reduce_v16i16_with_umin:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpminuw %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; AVX2-NEXT: vpminuw %xmm3, %xmm2, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
+; AVX2-NEXT: vpminuw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpsrld $16, %xmm3, %xmm4
+; AVX2-NEXT: vphminposuw %xmm2, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vpminuw %xmm4, %xmm3, %xmm2
+; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
+; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %edx
+; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT: # kill: def $dx killed $dx killed $edx
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %min_x = tail call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %x)
+ %min_x_vec = insertelement <1 x i16> poison, i16 %min_x, i64 0
+ %min_x_splat = shufflevector <1 x i16> %min_x_vec, <1 x i16> poison, <16 x i32> zeroinitializer
+ %cmp = icmp eq <16 x i16> %x, %min_x_splat
+ %select = select <16 x i1> %cmp, <16 x i16> %y, <16 x i16> splat (i16 -1)
+ %select_min = tail call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %select)
+ %ret_0 = insertvalue { i16, i16 } poison, i16 %min_x, 0
+ %ret = insertvalue { i16, i16 } %ret_0, i16 %select_min, 1
+ ret { i16, i16 } %ret
+}
+
+define { i16, i16 } @test_reduce_v16i16_with_add(<16 x i16> %x, <16 x i16> %y) {
+; SSE41-LABEL: test_reduce_v16i16_with_add:
+; SSE41: # %bb.0: # %start
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: paddw %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE41-NEXT: paddw %xmm4, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1]
+; SSE41-NEXT: paddw %xmm5, %xmm4
+; SSE41-NEXT: phaddw %xmm4, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: phaddw %xmm0, %xmm5
+; SSE41-NEXT: phaddw %xmm5, %xmm5
+; SSE41-NEXT: phaddw %xmm5, %xmm5
+; SSE41-NEXT: phaddw %xmm5, %xmm5
+; SSE41-NEXT: movd %xmm5, %eax
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE41-NEXT: pcmpeqw %xmm4, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE41-NEXT: pxor %xmm5, %xmm1
+; SSE41-NEXT: por %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: pminuw %xmm1, %xmm0
+; SSE41-NEXT: phminposuw %xmm0, %xmm0
+; SSE41-NEXT: movd %xmm0, %edx
+; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT: # kill: def $dx killed $dx killed $edx
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: test_reduce_v16i16_with_add:
+; AVX2: # %bb.0: # %start
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpaddw %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; AVX2-NEXT: vpaddw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
+; AVX2-NEXT: vpaddw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vphaddw %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vphaddw %xmm0, %xmm2, %xmm2
+; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vpbroadcastw %xmm3, %ymm2
+; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %edx
+; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT: # kill: def $dx killed $dx killed $edx
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+start:
+ %sum_x = tail call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %x)
+ %sum_x_vec = insertelement <1 x i16> poison, i16 %sum_x, i64 0
+ %sum_x_splat = shufflevector <1 x i16> %sum_x_vec, <1 x i16> poison, <16 x i32> zeroinitializer
+ %cmp = icmp eq <16 x i16> %x, %sum_x_splat
+ %select = select <16 x i1> %cmp, <16 x i16> %y, <16 x i16> splat (i16 -1)
+ %select_min = tail call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %select)
+ %ret_0 = insertvalue { i16, i16 } poison, i16 %sum_x, 0
+ %ret = insertvalue { i16, i16 } %ret_0, i16 %select_min, 1
+ ret { i16, i16 } %ret
+}
>From 52128ed87860be76033064585b42c1046519daaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Suhajda=20Tam=C3=A1s?= <sutajo at gmail.com>
Date: Sat, 14 Jun 2025 19:09:44 +0200
Subject: [PATCH 2/3] [x86] Implement optimization and update tests
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 106 ++++++++++++--------
llvm/test/CodeGen/X86/optimize-reduction.ll | 45 ++-------
2 files changed, 73 insertions(+), 78 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 88616e0f2a0c9..a7d347f80b5af 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -46976,27 +46976,32 @@ static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1,
// Attempt to replace an min/max v8i16/v16i8 horizontal reduction with
// PHMINPOSUW.
-static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static std::pair<SDValue, bool>
+combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ const std::pair<SDValue, bool> NoMatch{};
+
// Bail without SSE41.
if (!Subtarget.hasSSE41())
- return SDValue();
+ return NoMatch;
EVT ExtractVT = Extract->getValueType(0);
if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
- return SDValue();
+ return NoMatch;
// Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
ISD::NodeType BinOp;
SDValue Src = DAG.matchBinOpReduction(
Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
if (!Src)
- return SDValue();
+ return NoMatch;
+
+ bool FoundPartialReduction = Src.getOpcode() == ISD::EXTRACT_SUBVECTOR;
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getScalarType();
if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
- return SDValue();
+ return NoMatch;
SDLoc DL(Extract);
SDValue MinPos = Src;
@@ -47045,8 +47050,9 @@ static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
- DAG.getVectorIdxConstant(0, DL));
+ return {DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
+ DAG.getVectorIdxConstant(0, DL)),
+ FoundPartialReduction};
}
// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
@@ -47260,6 +47266,9 @@ static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
Extract->getOperand(1));
}
+// Check whether this extract is the root of a sum of absolute differences
+// pattern. This has to be done here because we really want it to happen
+// pre-legalization.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
using namespace SDPatternMatch;
@@ -47697,19 +47706,24 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
/// Try to convert a vector reduction sequence composed of binops and shuffles
/// into horizontal ops.
-static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static std::pair<SDValue, bool>
+combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
+ const std::pair<SDValue, bool> NoMatch{};
+
// We need at least SSE2 to anything here.
if (!Subtarget.hasSSE2())
- return SDValue();
+ return NoMatch;
ISD::NodeType Opc;
SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
{ISD::ADD, ISD::MUL, ISD::FADD}, true);
if (!Rdx)
- return SDValue();
+ return NoMatch;
+
+ bool FoundPartialReduction = Rdx.getOpcode() == ISD::EXTRACT_SUBVECTOR;
SDValue Index = ExtElt->getOperand(1);
assert(isNullConstant(Index) &&
@@ -47718,7 +47732,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
EVT VT = ExtElt->getValueType(0);
EVT VecVT = Rdx.getValueType();
if (VecVT.getScalarType() != VT)
- return SDValue();
+ return NoMatch;
SDLoc DL(ExtElt);
unsigned NumElts = VecVT.getVectorNumElements();
@@ -47745,7 +47759,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
// vXi8 mul reduction - promote to vXi16 mul reduction.
if (Opc == ISD::MUL) {
if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
- return SDValue();
+ return NoMatch;
if (VecVT.getSizeInBits() >= 128) {
EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
@@ -47773,7 +47787,8 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
{1, -1, -1, -1, -1, -1, -1, -1}));
Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ return {DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index),
+ FoundPartialReduction};
}
// vXi8 add reduction - sub 128-bit vector.
@@ -47782,12 +47797,13 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
DAG.getConstant(0, DL, MVT::v16i8));
Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ return {DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index),
+ FoundPartialReduction};
}
// Must be a >=128-bit vector with pow2 elements.
if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
- return SDValue();
+ return NoMatch;
// vXi8 add reduction - sum lo/hi halves then use PSADBW.
if (VT == MVT::i8) {
@@ -47806,7 +47822,8 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ return {DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index),
+ FoundPartialReduction};
}
// See if we can use vXi8 PSADBW add reduction for larger zext types.
@@ -47853,12 +47870,13 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
Rdx = DAG.getBitcast(VecVT, Rdx);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ return {DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index),
+ FoundPartialReduction};
}
// Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
if (!shouldUseHorizontalOp(true, DAG, Subtarget))
- return SDValue();
+ return NoMatch;
unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
@@ -47876,14 +47894,15 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
}
if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
!((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
- return SDValue();
+ return NoMatch;
// extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
for (unsigned i = 0; i != ReductionSteps; ++i)
Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ return {DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index),
+ FoundPartialReduction};
}
/// Detect vector gather/scatter index generation and convert it from being a
@@ -47977,26 +47996,29 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
InputVector.getOperand(0));
- // Check whether this extract is the root of a sum of absolute differences
- // pattern. This has to be done here because we really want it to happen
- // pre-legalization,
- if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
- return SAD;
-
- if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
- return VPDPBUSD;
-
- // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
- if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
- return Cmp;
-
- // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
- if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
- return MinMax;
-
- // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
- if (SDValue V = combineArithReduction(N, DAG, Subtarget))
- return V;
+ SDValue BinOpReduction;
+ bool IsPartialReduction = false;
+ if (auto Result = std::tie(BinOpReduction, IsPartialReduction);
+ (BinOpReduction = combineBasicSADPattern(N, DAG, Subtarget)) ||
+ (BinOpReduction = combineVPDPBUSDPattern(N, DAG, Subtarget)) ||
+ (BinOpReduction = combinePredicateReduction(N, DAG, Subtarget)) ||
+ ((Result = combineMinMaxReduction(N, DAG, Subtarget)), BinOpReduction) ||
+ ((Result = combineArithReduction(N, DAG, Subtarget)), BinOpReduction)) {
+ SDValue ExtractEltOperand = N->getOperand(0);
+ DCI.CombineTo(N, BinOpReduction);
+
+ if (!IsPartialReduction) {
+ // Also replace ExtractEltOperand.
+ // This is safe to do, because N resulted directly from a full reduction,
+ // which means all the elements are undefined except for the 0th element.
+ SDValue V =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(BinOpReduction),
+ ExtractEltOperand->getValueType(0), BinOpReduction);
+ DCI.CombineTo(ExtractEltOperand.getNode(), V);
+ }
+
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
return V;
diff --git a/llvm/test/CodeGen/X86/optimize-reduction.ll b/llvm/test/CodeGen/X86/optimize-reduction.ll
index 003c41612b8bf..1cedacf6e9ce1 100644
--- a/llvm/test/CodeGen/X86/optimize-reduction.ll
+++ b/llvm/test/CodeGen/X86/optimize-reduction.ll
@@ -7,16 +7,9 @@ define { i16, i16 } @test_reduce_v16i16_with_umin(<16 x i16> %x, <16 x i16> %y)
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: pminuw %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
-; SSE41-NEXT: pminuw %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
-; SSE41-NEXT: pminuw %xmm5, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: psrld $16, %xmm5
-; SSE41-NEXT: pminuw %xmm6, %xmm5
; SSE41-NEXT: phminposuw %xmm4, %xmm4
; SSE41-NEXT: movd %xmm4, %eax
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; SSE41-NEXT: pcmpeqw %xmm4, %xmm1
; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
@@ -36,14 +29,8 @@ define { i16, i16 } @test_reduce_v16i16_with_umin(<16 x i16> %x, <16 x i16> %y)
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpminuw %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX2-NEXT: vpminuw %xmm3, %xmm2, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
-; AVX2-NEXT: vpminuw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpsrld $16, %xmm3, %xmm4
; AVX2-NEXT: vphminposuw %xmm2, %xmm2
; AVX2-NEXT: vmovd %xmm2, %eax
-; AVX2-NEXT: vpminuw %xmm4, %xmm3, %xmm2
; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
@@ -70,20 +57,13 @@ define { i16, i16 } @test_reduce_v16i16_with_umin(<16 x i16> %x, <16 x i16> %y)
define { i16, i16 } @test_reduce_v16i16_with_add(<16 x i16> %x, <16 x i16> %y) {
; SSE41-LABEL: test_reduce_v16i16_with_add:
-; SSE41: # %bb.0: # %start
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: paddw %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
-; SSE41-NEXT: paddw %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1]
-; SSE41-NEXT: paddw %xmm5, %xmm4
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: phaddw %xmm0, %xmm4
+; SSE41-NEXT: phaddw %xmm4, %xmm4
; SSE41-NEXT: phaddw %xmm4, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: phaddw %xmm0, %xmm5
-; SSE41-NEXT: phaddw %xmm5, %xmm5
-; SSE41-NEXT: phaddw %xmm5, %xmm5
-; SSE41-NEXT: phaddw %xmm5, %xmm5
-; SSE41-NEXT: movd %xmm5, %eax
+; SSE41-NEXT: phaddw %xmm4, %xmm4
+; SSE41-NEXT: movd %xmm4, %eax
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; SSE41-NEXT: pcmpeqw %xmm4, %xmm1
@@ -101,20 +81,14 @@ define { i16, i16 } @test_reduce_v16i16_with_add(<16 x i16> %x, <16 x i16> %y) {
; SSE41-NEXT: retq
;
; AVX2-LABEL: test_reduce_v16i16_with_add:
-; AVX2: # %bb.0: # %start
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpaddw %xmm2, %xmm0, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; AVX2-NEXT: vpaddw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
-; AVX2-NEXT: vpaddw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vphaddw %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vphaddw %xmm0, %xmm2, %xmm2
; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vmovd %xmm2, %eax
-; AVX2-NEXT: vpbroadcastw %xmm3, %ymm2
+; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -127,7 +101,6 @@ define { i16, i16 } @test_reduce_v16i16_with_add(<16 x i16> %x, <16 x i16> %y) {
; AVX2-NEXT: # kill: def $dx killed $dx killed $edx
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
-start:
%sum_x = tail call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %x)
%sum_x_vec = insertelement <1 x i16> poison, i16 %sum_x, i64 0
%sum_x_splat = shufflevector <1 x i16> %sum_x_vec, <1 x i16> poison, <16 x i32> zeroinitializer
>From a9b6f45006b81b3b2a008e4a8615a2ad925cfc67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tam=C3=A1s=20Suhajda?= <sutajo at gmail.com>
Date: Sun, 15 Feb 2026 20:43:12 +0100
Subject: [PATCH 3/3] Update tests
---
llvm/test/CodeGen/X86/optimize-reduction.ll | 123 +++++++++++++++++---
1 file changed, 110 insertions(+), 13 deletions(-)
diff --git a/llvm/test/CodeGen/X86/optimize-reduction.ll b/llvm/test/CodeGen/X86/optimize-reduction.ll
index 1cedacf6e9ce1..30e5ae442e00c 100644
--- a/llvm/test/CodeGen/X86/optimize-reduction.ll
+++ b/llvm/test/CodeGen/X86/optimize-reduction.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1,+fast-hops | FileCheck %s --check-prefixes=SSE41
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-hops | FileCheck %s --check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1,+fast-hops | FileCheck %s --check-prefixes=SSE41-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-hops | FileCheck %s --check-prefixes=AVX2-FAST
define { i16, i16 } @test_reduce_v16i16_with_umin(<16 x i16> %x, <16 x i16> %y) {
; SSE41-LABEL: test_reduce_v16i16_with_umin:
@@ -44,6 +46,48 @@ define { i16, i16 } @test_reduce_v16i16_with_umin(<16 x i16> %x, <16 x i16> %y)
; AVX2-NEXT: # kill: def $dx killed $dx killed $edx
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; SSE41-FAST-LABEL: test_reduce_v16i16_with_umin:
+; SSE41-FAST: # %bb.0:
+; SSE41-FAST-NEXT: movdqa %xmm0, %xmm4
+; SSE41-FAST-NEXT: pminuw %xmm1, %xmm4
+; SSE41-FAST-NEXT: phminposuw %xmm4, %xmm4
+; SSE41-FAST-NEXT: movd %xmm4, %eax
+; SSE41-FAST-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
+; SSE41-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE41-FAST-NEXT: pcmpeqw %xmm4, %xmm1
+; SSE41-FAST-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE41-FAST-NEXT: pxor %xmm5, %xmm1
+; SSE41-FAST-NEXT: por %xmm3, %xmm1
+; SSE41-FAST-NEXT: pcmpeqw %xmm4, %xmm0
+; SSE41-FAST-NEXT: pxor %xmm5, %xmm0
+; SSE41-FAST-NEXT: por %xmm2, %xmm0
+; SSE41-FAST-NEXT: pminuw %xmm1, %xmm0
+; SSE41-FAST-NEXT: phminposuw %xmm0, %xmm0
+; SSE41-FAST-NEXT: movd %xmm0, %edx
+; SSE41-FAST-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE41-FAST-NEXT: # kill: def $dx killed $dx killed $edx
+; SSE41-FAST-NEXT: retq
+;
+; AVX2-FAST-LABEL: test_reduce_v16i16_with_umin:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-FAST-NEXT: vpminuw %xmm2, %xmm0, %xmm2
+; AVX2-FAST-NEXT: vphminposuw %xmm2, %xmm2
+; AVX2-FAST-NEXT: vmovd %xmm2, %eax
+; AVX2-FAST-NEXT: vpbroadcastw %xmm2, %ymm2
+; AVX2-FAST-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vphminposuw %xmm0, %xmm0
+; AVX2-FAST-NEXT: vmovd %xmm0, %edx
+; AVX2-FAST-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX2-FAST-NEXT: # kill: def $dx killed $dx killed $edx
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
%min_x = tail call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %x)
%min_x_vec = insertelement <1 x i16> poison, i16 %min_x, i64 0
%min_x_splat = shufflevector <1 x i16> %min_x_vec, <1 x i16> poison, <16 x i32> zeroinitializer
@@ -58,13 +102,17 @@ define { i16, i16 } @test_reduce_v16i16_with_umin(<16 x i16> %x, <16 x i16> %y)
define { i16, i16 } @test_reduce_v16i16_with_add(<16 x i16> %x, <16 x i16> %y) {
; SSE41-LABEL: test_reduce_v16i16_with_add:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: phaddw %xmm0, %xmm4
-; SSE41-NEXT: phaddw %xmm4, %xmm4
-; SSE41-NEXT: phaddw %xmm4, %xmm4
-; SSE41-NEXT: phaddw %xmm4, %xmm4
-; SSE41-NEXT: movd %xmm4, %eax
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: paddw %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE41-NEXT: paddw %xmm4, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1]
+; SSE41-NEXT: paddw %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: psrld $16, %xmm5
+; SSE41-NEXT: paddw %xmm4, %xmm5
+; SSE41-NEXT: movd %xmm5, %eax
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; SSE41-NEXT: pcmpeqw %xmm4, %xmm1
; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
@@ -83,10 +131,13 @@ define { i16, i16 } @test_reduce_v16i16_with_add(<16 x i16> %x, <16 x i16> %y) {
; AVX2-LABEL: test_reduce_v16i16_with_add:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vphaddw %xmm0, %xmm2, %xmm2
-; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpaddw %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; AVX2-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
+; AVX2-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsrld $16, %xmm2, %xmm3
+; AVX2-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vmovd %xmm2, %eax
; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
@@ -101,6 +152,52 @@ define { i16, i16 } @test_reduce_v16i16_with_add(<16 x i16> %x, <16 x i16> %y) {
; AVX2-NEXT: # kill: def $dx killed $dx killed $edx
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; SSE41-FAST-LABEL: test_reduce_v16i16_with_add:
+; SSE41-FAST: # %bb.0:
+; SSE41-FAST-NEXT: movdqa %xmm1, %xmm4
+; SSE41-FAST-NEXT: phaddw %xmm0, %xmm4
+; SSE41-FAST-NEXT: phaddw %xmm4, %xmm4
+; SSE41-FAST-NEXT: phaddw %xmm4, %xmm4
+; SSE41-FAST-NEXT: phaddw %xmm4, %xmm4
+; SSE41-FAST-NEXT: movd %xmm4, %eax
+; SSE41-FAST-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
+; SSE41-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE41-FAST-NEXT: pcmpeqw %xmm4, %xmm1
+; SSE41-FAST-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE41-FAST-NEXT: pxor %xmm5, %xmm1
+; SSE41-FAST-NEXT: por %xmm3, %xmm1
+; SSE41-FAST-NEXT: pcmpeqw %xmm4, %xmm0
+; SSE41-FAST-NEXT: pxor %xmm5, %xmm0
+; SSE41-FAST-NEXT: por %xmm2, %xmm0
+; SSE41-FAST-NEXT: pminuw %xmm1, %xmm0
+; SSE41-FAST-NEXT: phminposuw %xmm0, %xmm0
+; SSE41-FAST-NEXT: movd %xmm0, %edx
+; SSE41-FAST-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE41-FAST-NEXT: # kill: def $dx killed $dx killed $edx
+; SSE41-FAST-NEXT: retq
+;
+; AVX2-FAST-LABEL: test_reduce_v16i16_with_add:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-FAST-NEXT: vphaddw %xmm0, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vphaddw %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vphaddw %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vphaddw %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vmovd %xmm2, %eax
+; AVX2-FAST-NEXT: vpbroadcastw %xmm2, %ymm2
+; AVX2-FAST-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vphminposuw %xmm0, %xmm0
+; AVX2-FAST-NEXT: vmovd %xmm0, %edx
+; AVX2-FAST-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX2-FAST-NEXT: # kill: def $dx killed $dx killed $edx
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
%sum_x = tail call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %x)
%sum_x_vec = insertelement <1 x i16> poison, i16 %sum_x, i64 0
%sum_x_splat = shufflevector <1 x i16> %sum_x_vec, <1 x i16> poison, <16 x i32> zeroinitializer
More information about the llvm-commits
mailing list