[llvm] [VectorCombine] New folding pattern for extract/binop/shuffle chains (PR #145232)
Rajveer Singh Bharadwaj via llvm-commits
llvm-commits at lists.llvm.org
Sat Jul 5 03:55:08 PDT 2025
https://github.com/Rajveer100 updated https://github.com/llvm/llvm-project/pull/145232
From 44a3268b9cd043ac96dc50f1f3b339c2307f20d1 Mon Sep 17 00:00:00 2001
From: Rajveer <rajveer.developer at icloud.com>
Date: Sun, 22 Jun 2025 17:39:34 +0530
Subject: [PATCH 1/2] [VectorCombine] New folding pattern for
extract/binop/shuffle chains
Resolves #144654
Part of #143088
This adds a new fold, `foldShuffleChainsToReduce`, for horizontal reduction of
patterns like:
```llvm
define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 {
%1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
%2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
%3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
%5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
%7 = extractelement <8 x i16> %6, i64 0
ret i16 %7
}
```
...which can be reduced to an `llvm.vector.reduce.umin.v8i16(%a0)` intrinsic call.
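After the fold, the whole chain collapses to a single reduction intrinsic; the expected output below matches the autogenerated CHECK lines in the added tests:
```llvm
define i16 @test_reduce_v8i16(<8 x i16> %a0) {
  %1 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a0)
  ret i16 %1
}
```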
The same transformation is applied for other supported ops when the cost model permits.
---
.../Transforms/Vectorize/VectorCombine.cpp | 177 ++++++++++++++++
.../X86/shuffle-chain-reduction-umin.ll | 200 ++++++++++++++++++
.../fold-shuffle-chains-to-reduce.ll | 127 +++++++++++
3 files changed, 504 insertions(+)
create mode 100644 llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll
create mode 100644 llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 19e82099e87f0..c7cc8290e88e5 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -130,6 +130,7 @@ class VectorCombine {
bool foldShuffleOfIntrinsics(Instruction &I);
bool foldShuffleToIdentity(Instruction &I);
bool foldShuffleFromReductions(Instruction &I);
+ bool foldShuffleChainsToReduce(Instruction &I);
bool foldCastFromReductions(Instruction &I);
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
@@ -2988,6 +2989,179 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
return foldSelectShuffle(*Shuffle, true);
}
+bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
+ auto *EEI = dyn_cast<ExtractElementInst>(&I);
+ if (!EEI)
+ return false;
+
+ std::queue<Value *> InstWorklist;
+ Value *InitEEV = nullptr;
+ Intrinsic::ID CommonOp = 0;
+
+ bool IsFirstCallInst = true;
+ bool ShouldBeCallInst = true;
+
+ SmallVector<Value *, 3> PrevVecV(3, nullptr);
+ int64_t ShuffleMaskHalf = -1, ExpectedShuffleMaskHalf = 1;
+ int64_t VecSize = -1;
+
+ Value *VecOp;
+ if (!match(&I, m_ExtractElt(m_Value(VecOp), m_Zero())))
+ return false;
+
+ auto *FVT = dyn_cast<FixedVectorType>(VecOp->getType());
+ if (!FVT)
+ return false;
+
+ VecSize = FVT->getNumElements();
+ if (VecSize < 2 || (VecSize % 2) != 0)
+ return false;
+
+ ShuffleMaskHalf = 1;
+ PrevVecV[2] = VecOp;
+ InitEEV = EEI;
+
+ InstWorklist.push(PrevVecV[2]);
+
+ while (!InstWorklist.empty()) {
+ Value *V = InstWorklist.front();
+ InstWorklist.pop();
+
+ auto *CI = dyn_cast<Instruction>(V);
+ if (!CI)
+ return false;
+
+ if (auto *CallI = dyn_cast<CallInst>(CI)) {
+ if (!ShouldBeCallInst || !PrevVecV[2])
+ return false;
+
+ if (!IsFirstCallInst &&
+ any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+ return false;
+
+ if (CallI != (IsFirstCallInst ? PrevVecV[2] : PrevVecV[0]))
+ return false;
+ IsFirstCallInst = false;
+
+ auto *II = dyn_cast<IntrinsicInst>(CallI);
+ if (!II)
+ return false;
+
+ if (!CommonOp)
+ CommonOp = II->getIntrinsicID();
+ if (II->getIntrinsicID() != CommonOp)
+ return false;
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::umin:
+ case Intrinsic::umax:
+ case Intrinsic::smin:
+ case Intrinsic::smax: {
+ auto *Op0 = CallI->getOperand(0);
+ auto *Op1 = CallI->getOperand(1);
+ PrevVecV[0] = Op0;
+ PrevVecV[1] = Op1;
+ break;
+ }
+ default:
+ return false;
+ }
+ ShouldBeCallInst ^= 1;
+
+ if (!isa<ShuffleVectorInst>(PrevVecV[1]))
+ std::swap(PrevVecV[0], PrevVecV[1]);
+ InstWorklist.push(PrevVecV[1]);
+ InstWorklist.push(PrevVecV[0]);
+ } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
+ if (ShouldBeCallInst ||
+ any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+ return false;
+
+ if (SVInst != PrevVecV[1])
+ return false;
+
+ auto *ShuffleVec = SVInst->getOperand(0);
+ if (!ShuffleVec || ShuffleVec != PrevVecV[0])
+ return false;
+
+ SmallVector<int> CurMask;
+ SVInst->getShuffleMask(CurMask);
+
+ if (ShuffleMaskHalf != ExpectedShuffleMaskHalf)
+ return false;
+ ExpectedShuffleMaskHalf *= 2;
+
+ for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
+ if (Mask < ShuffleMaskHalf && CurMask[Mask] != ShuffleMaskHalf + Mask)
+ return false;
+ if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
+ return false;
+ }
+ ShuffleMaskHalf *= 2;
+ if (ExpectedShuffleMaskHalf == VecSize)
+ break;
+ ShouldBeCallInst ^= 1;
+ } else {
+ return false;
+ }
+ }
+
+ if (ShouldBeCallInst)
+ return false;
+
+ assert(VecSize != -1 && ExpectedShuffleMaskHalf == VecSize &&
+ "Expected Match for Vector Size and Mask Half");
+
+ Value *FinalVecV = PrevVecV[0];
+ auto *FinalVecVTy = dyn_cast<FixedVectorType>(FinalVecV->getType());
+
+ if (!InitEEV || !FinalVecV)
+ return false;
+
+ assert(FinalVecVTy && "Expected non-null value for Vector Type");
+
+ Intrinsic::ID ReducedOp = 0;
+ switch (CommonOp) {
+ case Intrinsic::umin:
+ ReducedOp = Intrinsic::vector_reduce_umin;
+ break;
+ case Intrinsic::umax:
+ ReducedOp = Intrinsic::vector_reduce_umax;
+ break;
+ case Intrinsic::smin:
+ ReducedOp = Intrinsic::vector_reduce_smin;
+ break;
+ case Intrinsic::smax:
+ ReducedOp = Intrinsic::vector_reduce_smax;
+ break;
+ default:
+ return false;
+ }
+
+ InstructionCost OrigCost = 0;
+ unsigned int NumLevels = Log2_64(VecSize);
+
+ for (unsigned int Level = 0; Level < NumLevels; ++Level) {
+ OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+ FinalVecVTy, FinalVecVTy);
+ OrigCost += TTI.getArithmeticInstrCost(Instruction::ICmp, FinalVecVTy);
+ }
+ OrigCost += TTI.getVectorInstrCost(Instruction::ExtractElement, FinalVecVTy,
+ CostKind, 0);
+
+ IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
+ InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
+
+ if (NewCost >= OrigCost)
+ return false;
+
+ auto *ReducedResult =
+ Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
+ replaceValue(*InitEEV, *ReducedResult);
+
+ return true;
+}
+
/// Determine if its more efficient to fold:
/// reduce(trunc(x)) -> trunc(reduce(x)).
/// reduce(sext(x)) -> sext(reduce(x)).
@@ -3705,6 +3879,9 @@ bool VectorCombine::run() {
MadeChange |= foldShuffleFromReductions(I);
MadeChange |= foldCastFromReductions(I);
break;
+ case Instruction::ExtractElement:
+ MadeChange |= foldShuffleChainsToReduce(I);
+ break;
case Instruction::ICmp:
case Instruction::FCmp:
MadeChange |= foldExtractExtract(I);
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll
new file mode 100644
index 0000000000000..82b20ccc5b8f5
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll
@@ -0,0 +1,200 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes=vector-combine -S %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes=vector-combine -S %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes=vector-combine -S %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v4 -passes=vector-combine -S %s | FileCheck %s
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]])
+; CHECK-NEXT: ret i16 [[TMP1]]
+;
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+ %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+ %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
+ %7 = extractelement <8 x i16> %6, i64 0
+ ret i16 %7
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+;
+; CHECK-LABEL: define i8 @test_reduce_v16i8(
+; CHECK-SAME: <16 x i8> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP8:%.*]] = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> [[A0]])
+; CHECK-NEXT: ret i8 [[TMP8]]
+;
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %a0, <16 x i8> %1)
+ %3 = shufflevector <16 x i8> %2, <16 x i8> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %2, <16 x i8> %3)
+ %5 = shufflevector <16 x i8> %4, <16 x i8> poison, <16 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %6 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %4, <16 x i8> %5)
+ %7 = shufflevector <16 x i8> %6, <16 x i8> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %8 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %6, <16 x i8> %7)
+ %9 = extractelement <16 x i8> %8, i64 0
+ ret i8 %9
+}
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; CHECK-LABEL: define i8 @test_reduce_v32i8(
+; CHECK-SAME: <32 x i8> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> [[A0]])
+; CHECK-NEXT: ret i8 [[TMP1]]
+;
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> poison, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
+ i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %a0, <32 x i8> %1)
+ %3 = shufflevector <32 x i8> %2, <32 x i8> poison, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %2, <32 x i8> %3)
+ %5 = shufflevector <32 x i8> %4, <32 x i8> poison, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %6 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %4, <32 x i8> %5)
+ %7 = shufflevector <32 x i8> %6, <32 x i8> poison, <32 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %8 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %6, <32 x i8> %7)
+ %9 = shufflevector <32 x i8> %8, <32 x i8> poison, <32 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %10 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %8, <32 x i8> %9)
+ %11 = extractelement <32 x i8> %10, i64 0
+ ret i8 %11
+}
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v16i16(
+; CHECK-SAME: <16 x i16> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> [[A0]])
+; CHECK-NEXT: ret i16 [[TMP1]]
+;
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %a0, <16 x i16> %1)
+ %3 = shufflevector <16 x i16> %2, <16 x i16> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %2, <16 x i16> %3)
+ %5 = shufflevector <16 x i16> %4, <16 x i16> poison, <16 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %6 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %4, <16 x i16> %5)
+ %7 = shufflevector <16 x i16> %6, <16 x i16> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %8 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %6, <16 x i16> %7)
+ %9 = extractelement <16 x i16> %8, i64 0
+ ret i16 %9
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; CHECK-LABEL: define i8 @test_reduce_v64i8(
+; CHECK-SAME: <64 x i8> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> [[A0]])
+; CHECK-NEXT: ret i8 [[TMP1]]
+;
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> poison, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39,
+ i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47,
+ i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55,
+ i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %a0, <64 x i8> %1)
+ %3 = shufflevector <64 x i8> %2, <64 x i8> poison, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
+ i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %2, <64 x i8> %3)
+ %5 = shufflevector <64 x i8> %4, <64 x i8> poison, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %6 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %4, <64 x i8> %5)
+ %7 = shufflevector <64 x i8> %6, <64 x i8> poison, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %8 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %6, <64 x i8> %7)
+ %9 = shufflevector <64 x i8> %8, <64 x i8> poison, <64 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %10 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %8, <64 x i8> %9)
+ %11 = shufflevector <64 x i8> %10, <64 x i8> poison, <64 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %12 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %10, <64 x i8> %11)
+ %13 = extractelement <64 x i8> %12, i64 0
+ ret i8 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v32i16(
+; CHECK-SAME: <32 x i16> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> [[A0]])
+; CHECK-NEXT: ret i16 [[TMP1]]
+;
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> poison, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
+ i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %a0, <32 x i16> %1)
+ %3 = shufflevector <32 x i16> %2, <32 x i16> poison, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %2, <32 x i16> %3)
+ %5 = shufflevector <32 x i16> %4, <32 x i16> poison, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %6 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %4, <32 x i16> %5)
+ %7 = shufflevector <32 x i16> %6, <32 x i16> poison, <32 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %8 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %6, <32 x i16> %7)
+ %9 = shufflevector <32 x i16> %8, <32 x i16> poison, <32 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %10 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %8, <32 x i16> %9)
+ %11 = extractelement <32 x i16> %10, i64 0
+ ret i16 %11
+}
diff --git a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
new file mode 100644
index 0000000000000..3cb25ba4ecce6
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]])
+; CHECK-NEXT: ret i16 [[TMP1]]
+;
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+ %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+ %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
+ %7 = extractelement <8 x i16> %6, i64 0
+ ret i16 %7
+}
+
+define i16 @test_reduce_v8i16_2(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16_2(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
+; CHECK-NEXT: [[TMP13:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]])
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP9]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]])
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP16:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]])
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i16> [[TMP16]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = tail call i16 @llvm.umin.i16(i16 [[TMP13]], i16 [[TMP14]])
+; CHECK-NEXT: ret i16 [[TMP15]]
+;
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+ %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+ %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
+ %7 = extractelement <8 x i16> %6, i64 0
+
+ %8 = shufflevector <8 x i16> %6, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+ %9 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %8)
+ %10 = shufflevector <8 x i16> %9, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %11 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %9, <8 x i16> %10)
+ %12 = shufflevector <8 x i16> %11, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %13 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %11, <8 x i16> %12)
+ %14 = extractelement <8 x i16> %13, i64 0
+
+ %15 = tail call i16 @llvm.umin.i16(i16 %7, i16 %14)
+
+ ret i16 %15
+}
+
+define i16 @test_reduce_v8i16_neg1(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16_neg1(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP6]], i64 0
+; CHECK-NEXT: ret i16 [[TMP7]]
+;
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+ %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+ %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
+ %7 = extractelement <8 x i16> %6, i64 0
+ ret i16 %7
+}
+
+define i16 @test_reduce_v8i16_neg2(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16_neg2(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP6]], i64 0
+; CHECK-NEXT: ret i16 [[TMP7]]
+;
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+ %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+ %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %6 = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> %4, <8 x i16> %5)
+ %7 = extractelement <8 x i16> %6, i64 0
+ ret i16 %7
+}
+
+define i16 @test_reduce_v8i16_neg3(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16_neg3(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]])
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP7]], i64 0
+; CHECK-NEXT: ret i16 [[TMP8]]
+;
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+ %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+ %5 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+ %6 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %7 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %5, <8 x i16> %6)
+ %8 = extractelement <8 x i16> %7, i64 0
+ ret i16 %8
+}
From eb9570df3557679ee41e3e098c0d202a2ff95408 Mon Sep 17 00:00:00 2001
From: Rajveer <rajveer.developer at icloud.com>
Date: Sat, 28 Jun 2025 16:31:51 +0530
Subject: [PATCH 2/2] Include support for Add/Mul/Or/And/Xor Binary Operations
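For non-power-of-two vector widths, the parity mask keeps the shrinking partition halves aligned, so odd-width chains fold as well. For example, the following `or` chain on `<7 x i16>` (from the updated test file) now collapses to a single `call i16 @llvm.vector.reduce.or.v7i16(<7 x i16> %a0)`:
```llvm
define i16 @test_reduce_v7i16_or(<7 x i16> %a0) {
  %1 = shufflevector <7 x i16> %a0, <7 x i16> poison, <7 x i32> <i32 3, i32 4, i32 5, i32 6, i32 poison, i32 poison, i32 poison>
  %2 = or <7 x i16> %a0, %1
  %3 = shufflevector <7 x i16> %2, <7 x i16> poison, <7 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %4 = or <7 x i16> %2, %3
  %5 = shufflevector <7 x i16> %4, <7 x i16> poison, <7 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %6 = or <7 x i16> %4, %5
  %7 = extractelement <7 x i16> %6, i64 0
  ret i16 %7
}
```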
---
.../Transforms/Vectorize/VectorCombine.cpp | 252 +++++++++++++-----
.../fold-shuffle-chains-to-reduce.ll | 68 +++++
2 files changed, 257 insertions(+), 63 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index c7cc8290e88e5..f8fb74de49bd2 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2989,21 +2989,72 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
return foldSelectShuffle(*Shuffle, true);
}
+/// For a given chain of patterns of the following form:
+///
+/// ```
+/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
+///
+/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
+/// ty1> %1)
+/// OR
+/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
+///
+/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
+/// ...
+/// ...
+/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
+/// 3), <n x ty1> %(i - 2))
+/// OR
+/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
+///
+/// %(i) = extractelement <n x ty1> %(i - 1), 0
+/// ```
+///
+/// Where:
+/// `mask` follows a partition pattern:
+///
+/// Ex:
+/// [n = 8, p = poison]
+///
+/// 4 5 6 7 | p p p p
+/// 2 3 | p p p p p p
+/// 1 | p p p p p p p
+///
+/// For powers of 2 the pattern is uniform; for other sizes, the parity of
+/// the current half value at each step decides the size of the next
+/// partition half (see `ExpectedParityMask` below for how this is
+/// generalised).
+///
+/// Ex:
+/// [n = 6]
+///
+/// 3 4 5 | p p p
+/// 1 2 | p p p p
+/// 1 | p p p p p
bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
+ // Match the pattern bottom-up, starting from the final extractelement.
auto *EEI = dyn_cast<ExtractElementInst>(&I);
if (!EEI)
return false;
std::queue<Value *> InstWorklist;
+ InstructionCost OrigCost = 0;
+
Value *InitEEV = nullptr;
- Intrinsic::ID CommonOp = 0;
- bool IsFirstCallInst = true;
- bool ShouldBeCallInst = true;
+ // Common operation applied after each shuffle op (intrinsic call or binop).
+ unsigned int CommonCallOp = 0;
+ Instruction::BinaryOps CommonBinOp = Instruction::BinaryOpsEnd;
+ bool IsFirstCallOrBinInst = true;
+ bool ShouldBeCallOrBinInst = true;
+
+ // This stores the instructions last seen in the shuffle/common-op chain.
+ //
+ // PrevVecV[2] holds the source vector of the extractelement instruction,
+ // while PrevVecV[0] / PrevVecV[1] hold the two operands of the most
+ // recently matched shuffle/common op.
SmallVector<Value *, 3> PrevVecV(3, nullptr);
- int64_t ShuffleMaskHalf = -1, ExpectedShuffleMaskHalf = 1;
- int64_t VecSize = -1;
Value *VecOp;
if (!match(&I, m_ExtractElt(m_Value(VecOp), m_Zero())))
@@ -3013,11 +3064,29 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
if (!FVT)
return false;
- VecSize = FVT->getNumElements();
- if (VecSize < 2 || (VecSize % 2) != 0)
+ int64_t VecSize = FVT->getNumElements();
+ if (VecSize < 2)
return false;
- ShuffleMaskHalf = 1;
+ // The number of levels is ~log2(n), since each step of this fold
+ // pattern partitions the vector in half.
+ unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
+ int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
+
+ // This is how we generalise to all element counts.
+ // At each step, if the vector size is odd, the non-poison values must
+ // cover the dominant half so no element is missed.
+ //
+ // This mask lets us recover that choice as we walk from bottom to top:
+ //
+ // Mask Set -> N = N * 2 - 1
+ // Mask Unset -> N = N * 2
+ for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
+ Cur = (Cur + 1) / 2, --Mask) {
+ if (Cur & 1)
+ ExpectedParityMask |= (1ll << Mask);
+ }
+
PrevVecV[2] = VecOp;
InitEEV = EEI;
@@ -3031,25 +3100,23 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
if (!CI)
return false;
- if (auto *CallI = dyn_cast<CallInst>(CI)) {
- if (!ShouldBeCallInst || !PrevVecV[2])
+ if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
+ if (!ShouldBeCallOrBinInst || !PrevVecV[2])
return false;
- if (!IsFirstCallInst &&
+ if (!IsFirstCallOrBinInst &&
any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
return false;
- if (CallI != (IsFirstCallInst ? PrevVecV[2] : PrevVecV[0]))
- return false;
- IsFirstCallInst = false;
-
- auto *II = dyn_cast<IntrinsicInst>(CallI);
- if (!II)
+ // For the first call/bin op found, the vector has to come from the
+ // extractelement op.
+ if (II != (IsFirstCallOrBinInst ? PrevVecV[2] : PrevVecV[0]))
return false;
+ IsFirstCallOrBinInst = false;
- if (!CommonOp)
- CommonOp = II->getIntrinsicID();
- if (II->getIntrinsicID() != CommonOp)
+ if (!CommonCallOp)
+ CommonCallOp = II->getIntrinsicID();
+ if (II->getIntrinsicID() != CommonCallOp)
return false;
switch (II->getIntrinsicID()) {
@@ -3057,8 +3124,56 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
case Intrinsic::umax:
case Intrinsic::smin:
case Intrinsic::smax: {
- auto *Op0 = CallI->getOperand(0);
- auto *Op1 = CallI->getOperand(1);
+ auto *Op0 = II->getOperand(0);
+ auto *Op1 = II->getOperand(1);
+ PrevVecV[0] = Op0;
+ PrevVecV[1] = Op1;
+ break;
+ }
+ default:
+ return false;
+ }
+ ShouldBeCallOrBinInst ^= 1;
+
+ IntrinsicCostAttributes ICA(
+ CommonCallOp, II->getType(),
+ {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
+ OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
+
+ // The operands may appear as (a, b) or (b, a), and can flip as we move
+ // up the chain, so we may need a swap here.
+ if (!isa<ShuffleVectorInst>(PrevVecV[1]))
+ std::swap(PrevVecV[0], PrevVecV[1]);
+ InstWorklist.push(PrevVecV[1]);
+ InstWorklist.push(PrevVecV[0]);
+ } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
+ // Similar logic for bin ops.
+
+ if (!ShouldBeCallOrBinInst || !PrevVecV[2])
+ return false;
+
+ if (!IsFirstCallOrBinInst &&
+ any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+ return false;
+
+ if (BinOp != (IsFirstCallOrBinInst ? PrevVecV[2] : PrevVecV[0]))
+ return false;
+ IsFirstCallOrBinInst = false;
+
+ if (CommonBinOp == Instruction::BinaryOpsEnd)
+ CommonBinOp = BinOp->getOpcode();
+
+ if (BinOp->getOpcode() != CommonBinOp)
+ return false;
+
+ switch (CommonBinOp) {
+ case BinaryOperator::Add:
+ case BinaryOperator::Mul:
+ case BinaryOperator::Or:
+ case BinaryOperator::And:
+ case BinaryOperator::Xor: {
+ auto *Op0 = BinOp->getOperand(0);
+ auto *Op1 = BinOp->getOperand(1);
PrevVecV[0] = Op0;
PrevVecV[1] = Op1;
break;
@@ -3066,14 +3181,19 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
default:
return false;
}
- ShouldBeCallInst ^= 1;
+ ShouldBeCallOrBinInst ^= 1;
+
+ OrigCost +=
+ TTI.getArithmeticInstrCost(CommonBinOp, BinOp->getType(), CostKind);
if (!isa<ShuffleVectorInst>(PrevVecV[1]))
std::swap(PrevVecV[0], PrevVecV[1]);
InstWorklist.push(PrevVecV[1]);
InstWorklist.push(PrevVecV[0]);
} else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
- if (ShouldBeCallInst ||
+ // We shouldn't have any null values in the previous vectors;
+ // if so, there was a mismatch in the pattern.
+ if (ShouldBeCallOrBinInst ||
any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
return false;
@@ -3084,70 +3204,76 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
if (!ShuffleVec || ShuffleVec != PrevVecV[0])
return false;
- SmallVector<int> CurMask;
- SVInst->getShuffleMask(CurMask);
-
- if (ShuffleMaskHalf != ExpectedShuffleMaskHalf)
+ if (!isa<PoisonValue>(SVInst->getOperand(1)))
return false;
- ExpectedShuffleMaskHalf *= 2;
+ ArrayRef<int> CurMask = SVInst->getShuffleMask();
+
+ // Subtract the low parity-mask bit when checking the expected indices.
for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
- if (Mask < ShuffleMaskHalf && CurMask[Mask] != ShuffleMaskHalf + Mask)
+ if (Mask < ShuffleMaskHalf &&
+ CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
return false;
if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
return false;
}
+
+ // Update mask values.
ShuffleMaskHalf *= 2;
- if (ExpectedShuffleMaskHalf == VecSize)
+ ShuffleMaskHalf -= (ExpectedParityMask & 1);
+ ExpectedParityMask >>= 1;
+
+ OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+ SVInst->getType(), SVInst->getType(),
+ CurMask, CostKind);
+
+ VisitedCnt += 1;
+ if (!ExpectedParityMask && VisitedCnt == NumLevels)
break;
- ShouldBeCallInst ^= 1;
+
+ ShouldBeCallOrBinInst ^= 1;
} else {
return false;
}
}
- if (ShouldBeCallInst)
+ // Pattern should end with a shuffle op.
+ if (ShouldBeCallOrBinInst)
return false;
- assert(VecSize != -1 && ExpectedShuffleMaskHalf == VecSize &&
- "Expected Match for Vector Size and Mask Half");
+ assert(VecSize != -1 && "Expected Match for Vector Size");
Value *FinalVecV = PrevVecV[0];
- auto *FinalVecVTy = dyn_cast<FixedVectorType>(FinalVecV->getType());
-
if (!InitEEV || !FinalVecV)
return false;
+ auto *FinalVecVTy = dyn_cast<FixedVectorType>(FinalVecV->getType());
+
assert(FinalVecVTy && "Expected non-null value for Vector Type");
Intrinsic::ID ReducedOp = 0;
- switch (CommonOp) {
- case Intrinsic::umin:
- ReducedOp = Intrinsic::vector_reduce_umin;
- break;
- case Intrinsic::umax:
- ReducedOp = Intrinsic::vector_reduce_umax;
- break;
- case Intrinsic::smin:
- ReducedOp = Intrinsic::vector_reduce_smin;
- break;
- case Intrinsic::smax:
- ReducedOp = Intrinsic::vector_reduce_smax;
- break;
- default:
- return false;
- }
-
- InstructionCost OrigCost = 0;
- unsigned int NumLevels = Log2_64(VecSize);
-
- for (unsigned int Level = 0; Level < NumLevels; ++Level) {
- OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
- FinalVecVTy, FinalVecVTy);
- OrigCost += TTI.getArithmeticInstrCost(Instruction::ICmp, FinalVecVTy);
+ if (CommonCallOp) {
+ switch (CommonCallOp) {
+ case Intrinsic::umin:
+ ReducedOp = Intrinsic::vector_reduce_umin;
+ break;
+ case Intrinsic::umax:
+ ReducedOp = Intrinsic::vector_reduce_umax;
+ break;
+ case Intrinsic::smin:
+ ReducedOp = Intrinsic::vector_reduce_smin;
+ break;
+ case Intrinsic::smax:
+ ReducedOp = Intrinsic::vector_reduce_smax;
+ break;
+ default:
+ return false;
+ }
+ } else if (CommonBinOp != Instruction::BinaryOpsEnd) {
+ ReducedOp = getReductionForBinop(CommonBinOp);
+ if (!ReducedOp)
+ return false;
}
- OrigCost += TTI.getVectorInstrCost(Instruction::ExtractElement, FinalVecVTy,
- CostKind, 0);
IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
diff --git a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
index 3cb25ba4ecce6..403ce33b5344e 100644
--- a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
+++ b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
@@ -17,6 +17,52 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
ret i16 %7
}
+define i16 @test_reduce_v7i16_or(<7 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v7i16_or(
+; CHECK-SAME: <7 x i16> [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v7i16(<7 x i16> [[A0]])
+; CHECK-NEXT: ret i16 [[TMP1]]
+;
+ %1 = shufflevector <7 x i16> %a0, <7 x i16> poison, <7 x i32> <i32 3, i32 4, i32 5, i32 6, i32 poison, i32 poison, i32 poison>
+ %2 = or <7 x i16> %a0, %1
+ %3 = shufflevector <7 x i16> %2, <7 x i16> poison, <7 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = or <7 x i16> %2, %3
+ %5 = shufflevector <7 x i16> %4, <7 x i16> poison, <7 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %6 = or <7 x i16> %4, %5
+ %7 = extractelement <7 x i16> %6, i64 0
+ ret i16 %7
+}
+
+define i16 @test_reduce_v3i16_and(<3 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v3i16_and(
+; CHECK-SAME: <3 x i16> [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.and.v3i16(<3 x i16> [[A0]])
+; CHECK-NEXT: ret i16 [[TMP1]]
+;
+ %1 = shufflevector <3 x i16> %a0, <3 x i16> poison, <3 x i32> <i32 1, i32 2, i32 poison>
+ %2 = and <3 x i16> %a0, %1
+ %3 = shufflevector <3 x i16> %2, <3 x i16> poison, <3 x i32> <i32 1, i32 poison, i32 poison>
+ %4 = and <3 x i16> %2, %3
+ %5 = extractelement <3 x i16> %4, i64 0
+ ret i16 %5
+}
+
+define i16 @test_reduce_v6i16_xor(<6 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v6i16_xor(
+; CHECK-SAME: <6 x i16> [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.xor.v6i16(<6 x i16> [[A0]])
+; CHECK-NEXT: ret i16 [[TMP1]]
+;
+ %1 = shufflevector <6 x i16> %a0, <6 x i16> poison, <6 x i32> <i32 3, i32 4, i32 5, i32 poison, i32 poison, i32 poison>
+ %2 = xor <6 x i16> %a0, %1
+ %3 = shufflevector <6 x i16> %2, <6 x i16> poison, <6 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = xor <6 x i16> %2, %3
+ %5 = shufflevector <6 x i16> %4, <6 x i16> poison, <6 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %6 = xor <6 x i16> %4, %5
+ %7 = extractelement <6 x i16> %6, i64 0
+ ret i16 %7
+}
+
define i16 @test_reduce_v8i16_2(<8 x i16> %a0) {
; CHECK-LABEL: define i16 @test_reduce_v8i16_2(
; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
@@ -125,3 +171,25 @@ define i16 @test_reduce_v8i16_neg3(<8 x i16> %a0) {
%8 = extractelement <8 x i16> %7, i64 0
ret i16 %8
}
+
+define i16 @test_reduce_v6i16_xor_neg(<6 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v6i16_xor_neg(
+; CHECK-SAME: <6 x i16> [[A0:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x i16> [[A0]], <6 x i16> poison, <6 x i32> <i32 3, i32 4, i32 5, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = xor <6 x i16> [[A0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <6 x i16> [[TMP2]], <6 x i16> poison, <6 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = xor <6 x i16> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <6 x i16> [[TMP4]], <6 x i16> poison, <6 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <6 x i16> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <6 x i16> [[TMP6]], i64 0
+; CHECK-NEXT: ret i16 [[TMP7]]
+;
+ %1 = shufflevector <6 x i16> %a0, <6 x i16> poison, <6 x i32> <i32 3, i32 4, i32 5, i32 poison, i32 poison, i32 poison>
+ %2 = xor <6 x i16> %a0, %1
+ %3 = shufflevector <6 x i16> %2, <6 x i16> poison, <6 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = xor <6 x i16> %2, %3
+ %5 = shufflevector <6 x i16> %4, <6 x i16> poison, <6 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %6 = xor <6 x i16> %4, %5
+ %7 = extractelement <6 x i16> %6, i64 0
+ ret i16 %7
+}