[llvm] [VectorCombine] Expand `vector_insert` into shufflevector for earlier cost optimizations (#145512) (PR #146479)

Tue Jul 1 10:30:35 PDT 2025

================
@@ -804,6 +805,66 @@ bool VectorCombine::foldInsExtBinop(Instruction &I) {
   return true;
 }
 
+/// Try to fold vector_insert intrinsics into shufflevector instructions.
+bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) {
+  auto *II = dyn_cast<IntrinsicInst>(&I);
+  // This optimization only applies to vector_insert intrinsics.
+  if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
+    return false;
+
+  Value *Vec = II->getArgOperand(0);
+  Value *SubVec = II->getArgOperand(1);
+  Value *Idx = II->getArgOperand(2);
+
+  // Caller guarantees DstTy is a fixed vector.
+  auto *DstTy = cast<FixedVectorType>(II->getType());
+  auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+  auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());
+
+  // Only canonicalize if Vec and SubVec are both fixed vectors.
+  if (!VecTy || !SubVecTy)
+    return false;
+
+  unsigned DstNumElts = DstTy->getNumElements();
+  unsigned VecNumElts = VecTy->getNumElements();
+  unsigned SubVecNumElts = SubVecTy->getNumElements();
+  auto *SubVecPtr = dyn_cast<ConstantInt>(Idx);
+  if (!SubVecPtr)
+    return false;
+
+  unsigned SubVecIdx = SubVecPtr->getZExtValue();
+
+  // Ensure insertion of SubVec doesn't exceed Dst bounds.
+  if (SubVecIdx % SubVecNumElts != 0 || SubVecIdx + SubVecNumElts > DstNumElts)
+    return false;
+
+  // An insert that entirely overwrites Vec with SubVec is a nop.
+  if (VecNumElts == SubVecNumElts) {
+    replaceValue(I, *SubVec);
+    return true;
+  }
+
+  // Widen SubVec into a vector of the same width as Vec, since
+  // shufflevector requires the two input vectors to be the same width.
+  // Elements beyond the bounds of SubVec within the widened vector are
+  // undefined.
+  SmallVector<int, 8> WidenMask(VecNumElts, PoisonMaskElem);
+  std::iota(WidenMask.begin(), WidenMask.begin() + SubVecNumElts, 0);
+  std::fill(WidenMask.begin() + SubVecNumElts, WidenMask.end(), PoisonMaskElem);
+
+  auto *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask);
+  Worklist.pushValue(WidenShuffle);
+
+  SmallVector<int, 8> Mask(DstNumElts);
+  std::iota(Mask.begin(), Mask.begin() + SubVecIdx, 0);
+  std::iota(Mask.begin() + SubVecIdx, Mask.begin() + SubVecIdx + SubVecNumElts, DstNumElts);
+  std::iota(Mask.begin() + SubVecIdx + SubVecNumElts, Mask.end(), SubVecIdx + SubVecNumElts);
----------------
laurenmchin wrote:

yes, you're right. thanks for pointing that out — the second call was redundant and overwriting part of the first. i’ve updated it to just use two iota calls, with the second overwriting the relevant subrange in-place.


https://github.com/llvm/llvm-project/pull/146479