[llvm] [VectorCombine] New folding pattern for extract/binop/shuffle chains (PR #145232)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 22 05:15:18 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-vectorizers
Author: Rajveer Singh Bharadwaj (Rajveer100)
Changes:
Resolves #144654
Part of #143088
This adds a new fold, `foldShuffleChainsToReduce`, for horizontal reduction of patterns like:
```llvm
define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 {
  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
  %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
  %7 = extractelement <8 x i16> %6, i64 0
  ret i16 %7
}
```
...which can be reduced to a single `llvm.vector.reduce.umin.v8i16(%a0)` intrinsic call.
The same transformation applies to other ops when the cost model permits it.
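With this patch, the chain above folds to the following (as captured by the autogenerated CHECK lines in the new test):
```llvm
define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr {
  %1 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a0)
  ret i16 %1
}
```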
---
Full diff: https://github.com/llvm/llvm-project/pull/145232.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/VectorCombine.cpp (+126)
- (added) llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll (+18)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 52cb1dbb33b86..aca939c4f534d 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -129,6 +129,7 @@ class VectorCombine {
bool foldShuffleOfIntrinsics(Instruction &I);
bool foldShuffleToIdentity(Instruction &I);
bool foldShuffleFromReductions(Instruction &I);
+ bool foldShuffleChainsToReduce(Instruction &I);
bool foldCastFromReductions(Instruction &I);
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
@@ -2910,6 +2911,130 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
return foldSelectShuffle(*Shuffle, true);
}
+bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
+ auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
+ if (!SVI)
+ return false;
+
+ std::queue<Value *> Worklist;
+ SmallVector<Instruction *> ToEraseFromParent;
+
+ SmallVector<int> ShuffleMask;
+ bool IsShuffleOp = true;
+
+ Worklist.push(SVI);
+ SVI->getShuffleMask(ShuffleMask);
+
+ if (ShuffleMask.size() < 2)
+ return false;
+
+ Instruction *Prev0 = nullptr, *Prev1 = nullptr;
+ Instruction *LastOp = nullptr;
+
+ int MaskHalfPos = ShuffleMask.size() / 2;
+ bool IsFirst = true;
+
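+  // Walk the chain from the first shuffle down to the terminating
+  // extractelement, alternating between shuffle and intrinsic-call steps.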
+ while (!Worklist.empty()) {
+ Value *V = Worklist.front();
+ Worklist.pop();
+
+ auto *CI = dyn_cast<Instruction>(V);
+ if (!CI)
+ return false;
+
+ if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
+ if (!IsShuffleOp || MaskHalfPos < 1 || (!Prev1 && !IsFirst))
+ return false;
+
+ auto *Op0 = SV->getOperand(0);
+ auto *Op1 = SV->getOperand(1);
+ if (!Op0 || !Op1)
+ return false;
+
+ auto *FVT = dyn_cast<FixedVectorType>(Op1->getType());
+ if (!FVT || !isa<PoisonValue>(Op1))
+ return false;
+
+ SmallVector<int> CurrentMask;
+ SV->getShuffleMask(CurrentMask);
+
+ int64_t MaskSize = CurrentMask.size();
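+      // The mask must move the upper half of the live lanes into the lower
+      // half; all remaining lanes must be poison.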
+ for (int MaskPos = 0; MaskPos != MaskSize; ++MaskPos) {
+        if (MaskPos < MaskHalfPos &&
+            CurrentMask[MaskPos] != MaskHalfPos + MaskPos)
+ return false;
+ if (MaskPos >= MaskHalfPos && CurrentMask[MaskPos] != -1)
+ return false;
+ }
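+      // Each shuffle step halves the number of live lanes.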
+ MaskHalfPos /= 2;
+ Prev0 = SV;
+ } else if (auto *Call = dyn_cast<CallInst>(V)) {
+ if (IsShuffleOp || !Prev0)
+ return false;
+
+ auto *II = dyn_cast<IntrinsicInst>(Call);
+ if (!II)
+ return false;
+
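+      // Only umin is matched for now; other reduction ops can be added here
+      // when the cost model permits.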
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::umin: {
+ auto *Op0 = Call->getOperand(0);
+ auto *Op1 = Call->getOperand(1);
+        if (!(Op0 == Prev0 && Op1 == Prev1) &&
+            !(Op0 == Prev1 && Op1 == Prev0) && !IsFirst)
+ return false;
+
+ if (!IsFirst)
+ Prev0 = Prev1;
+ else
+ IsFirst = false;
+ Prev1 = Call;
+ break;
+ }
+ default:
+ return false;
+ }
+ } else if (auto *ExtractElement = dyn_cast<ExtractElementInst>(CI)) {
+ if (!IsShuffleOp || !Prev0 || !Prev1 || MaskHalfPos != 0)
+ return false;
+
+ auto *Op0 = ExtractElement->getOperand(0);
+ auto *Op1 = ExtractElement->getOperand(1);
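+      // The final extract must read lane 0 of the last intrinsic call in the
+      // chain.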
+ if (Op0 != Prev1)
+ return false;
+
+ if (auto *Op1Idx = dyn_cast<ConstantInt>(Op1)) {
+ if (Op1Idx->getValue() != 0)
+ return false;
+ } else {
+ return false;
+ }
+ LastOp = ExtractElement;
+ break;
+ }
+    IsShuffleOp = !IsShuffleOp;
+ ToEraseFromParent.push_back(CI);
+
+ auto *NextI = CI->getNextNode();
+ if (!NextI)
+ return false;
+ Worklist.push(NextI);
+ }
+
+ if (!LastOp)
+ return false;
+
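+  // Replace the whole chain with a single reduction of the original vector.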
+  auto *ReducedResult = Builder.CreateIntrinsic(
+      Intrinsic::vector_reduce_umin, {SVI->getType()}, {SVI->getOperand(0)});
+ replaceValue(*LastOp, *ReducedResult);
+
+ ToEraseFromParent.push_back(LastOp);
+
+ std::reverse(ToEraseFromParent.begin(), ToEraseFromParent.end());
+  for (Instruction *Instr : ToEraseFromParent)
+    eraseInstruction(*Instr);
+
+ return true;
+}
+
/// Determine if its more efficient to fold:
/// reduce(trunc(x)) -> trunc(reduce(x)).
/// reduce(sext(x)) -> sext(reduce(x)).
@@ -3607,6 +3732,7 @@ bool VectorCombine::run() {
MadeChange |= foldShuffleOfIntrinsics(I);
MadeChange |= foldSelectShuffle(I);
MadeChange |= foldShuffleToIdentity(I);
+ MadeChange |= foldShuffleChainsToReduce(I);
break;
case Instruction::BitCast:
MadeChange |= foldBitcastShuffle(I);
diff --git a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
new file mode 100644
index 0000000000000..6f21eb5097fde
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 {
+; CHECK-LABEL: define i16 @test_reduce_v8i16(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]])
+; CHECK-NEXT: ret i16 [[TMP1]]
+;
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+ %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+ %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
+ %7 = extractelement <8 x i16> %6, i64 0
+ ret i16 %7
+}
``````````
https://github.com/llvm/llvm-project/pull/145232