[llvm] [RISCV][TTI] Reduce cost of a build_vector pattern (PR #108419)

Mon Sep 16 10:12:35 PDT 2024

================
@@ -616,6 +616,39 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
   return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
 }
 
+/// Return true if the LMUL that RISCVTargetLowering::getLMUL reports for
+/// \p VT is LMUL_1 or one of the fractional settings (LMUL_F2, LMUL_F4,
+/// LMUL_F8), i.e. the type is "m1 sized or smaller".
+///
+/// This is a pure predicate, so it returns bool rather than an integer.
+static bool isM1OrSmaller(MVT VT) {
+  const RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
+  return LMUL == RISCVII::VLMUL::LMUL_F8 || LMUL == RISCVII::VLMUL::LMUL_F4 ||
+         LMUL == RISCVII::VLMUL::LMUL_F2 || LMUL == RISCVII::VLMUL::LMUL_1;
+}
+
+InstructionCost RISCVTTIImpl::getScalarizationOverhead(
+    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
+    TTI::TargetCostKind CostKind) {
+  if (isa<ScalableVectorType>(Ty))
+    return InstructionCost::getInvalid();
+
+  // A build_vector (which is m1 sized or smaller) can be done in no
+  // worse than one vslide1down.vx per element in the type.  We could
+  // in theory do an explode_vector in the inverse manner, but our
+  // lowering today does not have a first class node for this pattern.
+  InstructionCost Cost = BaseT::getScalarizationOverhead(
+      Ty, DemandedElts, Insert, Extract, CostKind);
+  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
+  if (Insert && !Extract && LT.first.isValid() && LT.second.isVector() &&
+      Ty->getScalarSizeInBits() != 1) {
+    MVT ContainerVT = LT.second;
+    if (ContainerVT.isFixedLengthVector())
+      ContainerVT = TLI->getContainerForFixedLengthVector(ContainerVT);
+    if (isM1OrSmaller(ContainerVT)) {
----------------
lukel97 wrote:

Was there a specific reason for limiting it to M1 and below? I just did a quick check and this:

```llvm
define <8 x i32> @foo(i32 %e1, i32 %e2, i32 %e3, i32 %e4, i32 %e5, i32 %e6, i32 %e7, i32 %e8) {
  %v1 = insertelement <8 x i32> poison, i32 %e1, i32 0
  %v2 = insertelement <8 x i32> %v1, i32 %e2, i32 1
  %v3 = insertelement <8 x i32> %v2, i32 %e3, i32 2
  %v4 = insertelement <8 x i32> %v3, i32 %e4, i32 3
  %v5 = insertelement <8 x i32> %v4, i32 %e5, i32 4
  %v6 = insertelement <8 x i32> %v5, i32 %e6, i32 5
  %v7 = insertelement <8 x i32> %v6, i32 %e7, i32 6
  %v8 = insertelement <8 x i32> %v7, i32 %e8, i32 7
  ret <8 x i32> %v8
}
```

gets lowered to a sequence of M2 `vslide1down.vx` instructions:

```nasm
buildvec_v8i32_pack: 
	vsetivli	zero, 8, e32, m2, ta, ma
	vmv.v.x	v8, a0
	vslide1down.vx	v8, v8, a1
	vslide1down.vx	v8, v8, a2
	vslide1down.vx	v8, v8, a3
	vslide1down.vx	v8, v8, a4
	vslide1down.vx	v8, v8, a5
	vslide1down.vx	v8, v8, a6
	vslide1down.vx	v8, v8, a7
	ret
```

Although it looks like with higher LMULs it ends up going through the stack at some point.

https://github.com/llvm/llvm-project/pull/108419