[llvm] [CostModel/RISCV] Fix costs of vector [l](lrint|lround) (PR #146058)

Tue Jul 29 01:43:19 PDT 2025

================
@@ -1251,11 +1248,41 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   switch (ICA.getID()) {
   case Intrinsic::lrint:
   case Intrinsic::llrint:
-    // We can't currently lower half or bfloat vector lrint/llrint.
-    if (auto *VecTy = dyn_cast<VectorType>(ICA.getArgTypes()[0]);
-        VecTy && VecTy->getElementType()->is16bitFPTy())
-      return InstructionCost::getInvalid();
-    [[fallthrough]];
+  case Intrinsic::lround:
+  case Intrinsic::llround: {
+    auto LT = getTypeLegalizationCost(RetTy);
+    auto *SrcTy = ICA.getArgTypes().front();
+    auto SrcLT = getTypeLegalizationCost(SrcTy);
+    if (ST->hasVInstructions() && LT.second.isVector()) {
+      ArrayRef<unsigned> Ops;
+      unsigned SrcEltSz =
+          DL.getTypeSizeInBits(cast<VectorType>(SrcTy)->getElementType());
+      unsigned DstEltSz =
+          DL.getTypeSizeInBits(cast<VectorType>(RetTy)->getElementType());
+      if (LT.second.getVectorElementType() == MVT::bf16) {
+        if (DstEltSz == 32)
+          Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
+        else
+          Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
+      } else if (LT.second.getVectorElementType() == MVT::f16 &&
+                 !ST->hasVInstructionsF16()) {
+        if (DstEltSz == 32)
+          Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
+        else
+          Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
+
+      } else if (SrcEltSz < DstEltSz) {
+        Ops = {RISCV::VFNCVT_X_F_W, RISCV::VMV_V_V};
----------------
lukel97 wrote:

Where does the vmv.v.v come from? I see in `llround-sdnode.ll` there are some tests that have this:

```llvm
define <vscale x 1 x i64> @llround_nxv1i64_nxv1f32(<vscale x 1 x float> %x) {
; CHECK-LABEL: llround_nxv1i64_nxv1f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    fsrmi a0, 4
; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
; CHECK-NEXT:    vfwcvt.x.f.v v9, v8
; CHECK-NEXT:    fsrm a0
; CHECK-NEXT:    vmv1r.v v8, v9
; CHECK-NEXT:    ret
  %a = call <vscale x 1 x i64> @llvm.llround.nxv1i64.nxv1f32(<vscale x 1 x float> %x)
  ret <vscale x 1 x i64> %a
}
```

But to me that looks like it was inserted because of the vector register group constraints. I don't think we should cost for it; hopefully most of the time the register allocator won't need to insert the move?

https://github.com/llvm/llvm-project/pull/146058