[llvm] [VectorCombine] Expand `vector_insert` into shufflevector for earlier cost optimizations (#145512) (PR #146479)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 1 00:02:27 PDT 2025
https://github.com/laurenmchin created https://github.com/llvm/llvm-project/pull/146479
Move folding logic from `InstCombineCalls` to `VectorCombine` to ensure `vector_insert` intrinsics are expanded into shufflevector instructions before cost-based shuffle optimizations run.
Canonicalizes fixed-width vectors only.
Fixes [#145512](https://github.com/llvm/llvm-project/issues/145512)
>From 27569450c58c82fd717007da9510944993660a04 Mon Sep 17 00:00:00 2001
From: Lauren <lchin at berkeley.edu>
Date: Tue, 1 Jul 2025 02:55:20 -0400
Subject: [PATCH] [VectorCombine] Expand `vector_insert` into shufflevector for
earlier cost optimizations (#145512)
Move folding logic from `InstCombineCalls` to `VectorCombine` to ensure `vector_insert` intrinsics are expanded into shufflevector instructions before cost-based shuffle optimizations run.
Canonicalizes fixed-width vectors only.
---
.../InstCombine/InstCombineCalls.cpp | 46 ------------
.../Transforms/Vectorize/VectorCombine.cpp | 71 +++++++++++++++++++
.../VectorCombine/fold-vector-insert.ll | 71 +++++++++++++++++++
3 files changed, 142 insertions(+), 46 deletions(-)
create mode 100644 llvm/test/Transforms/VectorCombine/fold-vector-insert.ll
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 5b398d3b75f59..df29024a86f67 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3462,52 +3462,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
}
break;
}
- case Intrinsic::vector_insert: {
- Value *Vec = II->getArgOperand(0);
- Value *SubVec = II->getArgOperand(1);
- Value *Idx = II->getArgOperand(2);
- auto *DstTy = dyn_cast<FixedVectorType>(II->getType());
- auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
- auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());
-
- // Only canonicalize if the destination vector, Vec, and SubVec are all
- // fixed vectors.
- if (DstTy && VecTy && SubVecTy) {
- unsigned DstNumElts = DstTy->getNumElements();
- unsigned VecNumElts = VecTy->getNumElements();
- unsigned SubVecNumElts = SubVecTy->getNumElements();
- unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
-
- // An insert that entirely overwrites Vec with SubVec is a nop.
- if (VecNumElts == SubVecNumElts)
- return replaceInstUsesWith(CI, SubVec);
-
- // Widen SubVec into a vector of the same width as Vec, since
- // shufflevector requires the two input vectors to be the same width.
- // Elements beyond the bounds of SubVec within the widened vector are
- // undefined.
- SmallVector<int, 8> WidenMask;
- unsigned i;
- for (i = 0; i != SubVecNumElts; ++i)
- WidenMask.push_back(i);
- for (; i != VecNumElts; ++i)
- WidenMask.push_back(PoisonMaskElem);
-
- Value *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask);
-
- SmallVector<int, 8> Mask;
- for (unsigned i = 0; i != IdxN; ++i)
- Mask.push_back(i);
- for (unsigned i = DstNumElts; i != DstNumElts + SubVecNumElts; ++i)
- Mask.push_back(i);
- for (unsigned i = IdxN + SubVecNumElts; i != DstNumElts; ++i)
- Mask.push_back(i);
-
- Value *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask);
- return replaceInstUsesWith(CI, Shuffle);
- }
- break;
- }
case Intrinsic::vector_extract: {
Value *Vec = II->getArgOperand(0);
Value *Idx = II->getArgOperand(1);
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 19e82099e87f0..dbbc6c5a07ec8 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -112,6 +112,7 @@ class VectorCombine {
bool foldExtractExtract(Instruction &I);
bool foldInsExtFNeg(Instruction &I);
bool foldInsExtBinop(Instruction &I);
+ bool foldVectorInsertToShuffle(Instruction &I);
bool foldInsExtVectorToShuffle(Instruction &I);
bool foldBitOpOfBitcasts(Instruction &I);
bool foldBitcastShuffle(Instruction &I);
@@ -804,6 +805,73 @@ bool VectorCombine::foldInsExtBinop(Instruction &I) {
return true;
}
+/// Try to fold vector_insert intrinsics into shufflevector instructions.
+bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) {
+ auto *II = dyn_cast<IntrinsicInst>(&I);
+ // This optimization only applies to vector_insert intrinsics.
+ if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
+ return false;
+
+ Value *Vec = II->getArgOperand(0);
+ Value *SubVec = II->getArgOperand(1);
+ Value *Idx = II->getArgOperand(2);
+
+ // Caller guarantees DstTy is a fixed vector.
+ auto *DstTy = cast<FixedVectorType>(II->getType());
+ auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+ auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());
+
+ // Only canonicalize if Vec and SubVec are both fixed vectors.
+ if (!VecTy || !SubVecTy)
+ return false;
+
+ unsigned DstNumElts = DstTy->getNumElements();
+ unsigned VecNumElts = VecTy->getNumElements();
+ unsigned SubVecNumElts = SubVecTy->getNumElements();
+ auto *SubVecPtr = dyn_cast<ConstantInt>(Idx);
+ if (!SubVecPtr)
+ return false;
+
+ unsigned SubVecIdx = SubVecPtr->getZExtValue();
+
+ // Ensure insertion of SubVec doesn't exceed Dst bounds.
+ if (SubVecIdx % SubVecNumElts != 0 || SubVecIdx + SubVecNumElts > DstNumElts)
+ return false;
+
+ // An insert that entirely overwrites Vec with SubVec is a nop.
+ if (VecNumElts == SubVecNumElts) {
+ replaceValue(I, *SubVec);
+ return true;
+ }
+
+ // Widen SubVec into a vector of the same width as Vec, since
+ // shufflevector requires the two input vectors to be the same width.
+ // Elements beyond the bounds of SubVec within the widened vector are
+ // undefined.
+ SmallVector<int, 8> WidenMask;
+ unsigned int i = 0;
+ for (i = 0; i != SubVecNumElts; ++i)
+ WidenMask.push_back(i);
+ for (; i != VecNumElts; ++i)
+ WidenMask.push_back(PoisonMaskElem);
+
+ auto *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask);
+ Worklist.pushValue(WidenShuffle);
+
+ SmallVector<int, 8> Mask;
+ unsigned int j;
+ for (i = 0; i != SubVecIdx; ++i)
+ Mask.push_back(i);
+ for (j = 0; j != SubVecNumElts; ++j)
+ Mask.push_back(DstNumElts + j);
+ for (i = SubVecIdx + SubVecNumElts; i != DstNumElts; ++i)
+ Mask.push_back(i);
+
+ auto *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask);
+ replaceValue(I, *Shuffle);
+ return true;
+}
+
bool VectorCombine::foldBitOpOfBitcasts(Instruction &I) {
// Match: bitop(bitcast(x), bitcast(y)) -> bitcast(bitop(x, y))
Value *LHSSrc, *RHSSrc;
@@ -3639,6 +3707,9 @@ bool VectorCombine::run() {
// dispatching to folding functions if there's no chance of matching.
if (IsFixedVectorType) {
switch (Opcode) {
+ case Instruction::Call:
+ MadeChange |= foldVectorInsertToShuffle(I);
+ break;
case Instruction::InsertElement:
MadeChange |= vectorizeLoadInsert(I);
break;
diff --git a/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll b/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll
new file mode 100644
index 0000000000000..976fdb322005b
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
+
+declare <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32>, <4 x i32>, i64)
+declare <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32>, <8 x i32>, i64)
+declare <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32>, <2 x i32>, i64)
+declare <8 x i32> @llvm.vector.insert.v8i32.v1i32(<8 x i32>, <1 x i32>, i64)
+declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v2i32(<vscale x 4 x i32>, <2 x i32>, i64)
+
+define <8 x i32> @vector_insert_begin(<8 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: define <8 x i32> @vector_insert_begin(
+; CHECK-SAME: <8 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret <8 x i32> [[RESULT]]
+;
+ %result = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 0)
+ ret <8 x i32> %result
+}
+
+define <8 x i32> @vector_insert_middle(<8 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: define <8 x i32> @vector_insert_middle(
+; CHECK-SAME: <8 x i32> [[VEC:%.*]], <2 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret <8 x i32> [[RESULT]]
+;
+ %result = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2)
+ ret <8 x i32> %result
+}
+
+define <8 x i32> @vector_insert_end(<8 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: define <8 x i32> @vector_insert_end(
+; CHECK-SAME: <8 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: ret <8 x i32> [[RESULT]]
+;
+ %result = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 4)
+ ret <8 x i32> %result
+}
+
+define <8 x i32> @vector_insert_overwrite(<8 x i32> %vec, <8 x i32> %subvec) {
+; CHECK-LABEL: define <8 x i32> @vector_insert_overwrite(
+; CHECK-SAME: <8 x i32> [[VEC:%.*]], <8 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT: ret <8 x i32> [[SUBVEC]]
+;
+ %result = call <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 0)
+ ret <8 x i32> %result
+}
+
+define <8 x i32> @vector_insert_single_element_at_end(<8 x i32> %vec, <1 x i32> %subvec) {
+; CHECK-LABEL: define <8 x i32> @vector_insert_single_element_at_end(
+; CHECK-SAME: <8 x i32> [[VEC:%.*]], <1 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <1 x i32> [[SUBVEC]], <1 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 8>
+; CHECK-NEXT: ret <8 x i32> [[RESULT]]
+;
+ %result = call <8 x i32> @llvm.vector.insert.v8i32.v1i32(<8 x i32> %vec, <1 x i32> %subvec, i64 7)
+ ret <8 x i32> %result
+}
+
+define <vscale x 4 x i32> @vector_insert_no_fold_scalable(<vscale x 4 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: define <vscale x 4 x i32> @vector_insert_no_fold_scalable(
+; CHECK-SAME: <vscale x 4 x i32> [[VEC:%.*]], <2 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT: [[RESULT:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v2i32(<vscale x 4 x i32> [[VEC]], <2 x i32> [[SUBVEC]], i64 0)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[RESULT]]
+;
+ %result = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v2i32(<vscale x 4 x i32> %vec, <2 x i32> %subvec, i64 0)
+ ret <vscale x 4 x i32> %result
+}
More information about the llvm-commits
mailing list