[llvm] [RISCV] Handle fixed length vectors with exact VLEN in lowerINSERT_SUBVECTOR (PR #84107)

Sun Mar 31 23:07:04 PDT 2024

https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/84107

>From f70251204eb8fd3f1222605a4b60753d9df42184 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 26 Feb 2024 14:42:00 +0800
Subject: [PATCH 1/2] [RISCV] Handle fixed length vectors with exact VLEN in
 lowerINSERT_SUBVECTOR

This is the insert_subvector equivalent to #79949, where we can avoid sliding up
by the full LMUL amount if we know the exact subregister the subvector will be
inserted into.

This mirrors the lowerEXTRACT_SUBVECTOR changes in that we handle this in two
parts:

- We handle fixed length subvector types by converting the subvector to a
  scalable vector. But unlike EXTRACT_SUBVECTOR, we may also need to convert the
  vector being inserted into too.

- Whenever we don't need a vslideup because either the subvector aligns to a
  vector register group *or* the vector is undef, we need to emit an
  insert_subreg ourselves because RISCVISelDAGToDAG::Select doesn't correctly
  handle fixed length subvectors yet: see d7a28f7ad

I've left RISCVISelDAGToDAG::Select untouched for now (minus relaxing an
invariant), so that the insert_subvector and extract_subvector code paths are
the same.

We should teach it to properly handle fixed length subvectors in a follow-up
patch, so that the "exact subregsiter" logic is handled in one place instead of
being spread across both RISCVISelDAGToDAG.cpp and RISCVISelLowering.cpp.
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp   |   8 +-
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 130 ++++++++---
 .../rvv/fixed-vectors-insert-subvector.ll     | 163 +++++++++-----
 .../RISCV/rvv/fixed-vectors-shuffle-concat.ll | 205 ++++++++++--------
 4 files changed, 334 insertions(+), 172 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 1b8c1434c9f2d9..3fea03ec892dc2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2063,8 +2063,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     MVT SubVecContainerVT = SubVecVT;
     // Establish the correct scalable-vector types for any fixed-length type.
     if (SubVecVT.isFixedLengthVector()) {
-      assert(Idx == 0 && V.isUndef());
       SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT);
+      bool AlignedToVecReg = false;
+      if (auto VLen = Subtarget->getRealVLen();
+          VLen && SubVecVT.getSizeInBits() ==
+                      SubVecContainerVT.getSizeInBits().getKnownMinValue() *
+                          (*VLen / RISCV::RVVBitsPerBlock))
+        AlignedToVecReg = true;
+      assert(Idx == 0 && (AlignedToVecReg || V.isUndef()));
     }
     MVT ContainerVT = VT;
     if (VT.isFixedLengthVector())
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 750d70c03eabd7..42302b86775ad2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9596,6 +9596,21 @@ SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op,
                            Vec, Mask, VL, DL, DAG, Subtarget);
 }
 
+/// Returns true if \p LHS is known to be equal to \p RHS, taking into account
+/// if VLEN is exactly known by \p Subtarget and thus vscale when handling
+/// scalable quantities.
+static bool isKnownEQ(ElementCount LHS, ElementCount RHS,
+                      const RISCVSubtarget &Subtarget) {
+  if (auto VLen = Subtarget.getRealVLen()) {
+    const unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock;
+    if (LHS.isScalable())
+      LHS = ElementCount::getFixed(LHS.getKnownMinValue() * Vscale);
+    if (RHS.isScalable())
+      RHS = ElementCount::getFixed(RHS.getKnownMinValue() * Vscale);
+  }
+  return LHS == RHS;
+}
+
 SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                    SelectionDAG &DAG) const {
   SDValue Vec = Op.getOperand(0);
@@ -9645,12 +9660,13 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     }
   }
 
-  // If the subvector vector is a fixed-length type, we cannot use subregister
-  // manipulation to simplify the codegen; we don't know which register of a
-  // LMUL group contains the specific subvector as we only know the minimum
-  // register size. Therefore we must slide the vector group up the full
-  // amount.
-  if (SubVecVT.isFixedLengthVector()) {
+  // If the subvector vector is a fixed-length type and we don't know VLEN
+  // exactly, we cannot use subregister manipulation to simplify the codegen; we
+  // don't know which register of a LMUL group contains the specific subvector
+  // as we only know the minimum register size. Therefore we must slide the
+  // vector group up the full amount.
+  const auto VLen = Subtarget.getRealVLen();
+  if (SubVecVT.isFixedLengthVector() && !VLen) {
     if (OrigIdx == 0 && Vec.isUndef() && !VecVT.isFixedLengthVector())
       return Op;
     MVT ContainerVT = VecVT;
@@ -9698,41 +9714,92 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     return DAG.getBitcast(Op.getValueType(), SubVec);
   }
 
-  unsigned SubRegIdx, RemIdx;
-  std::tie(SubRegIdx, RemIdx) =
-      RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
-          VecVT, SubVecVT, OrigIdx, TRI);
+  MVT ContainerVecVT = VecVT;
+  if (VecVT.isFixedLengthVector()) {
+    ContainerVecVT = getContainerForFixedLengthVector(VecVT);
+    Vec = convertToScalableVector(ContainerVecVT, Vec, DAG, Subtarget);
+  }
 
-  RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
+  MVT ContainerSubVecVT = SubVecVT;
+  if (SubVecVT.isFixedLengthVector()) {
+    ContainerSubVecVT = getContainerForFixedLengthVector(SubVecVT);
+    SubVec = convertToScalableVector(ContainerSubVecVT, SubVec, DAG, Subtarget);
+  }
+
+  unsigned SubRegIdx;
+  ElementCount RemIdx;
+  // insert_subvector scales the index by vscale if the subvector is scalable,
+  // and decomposeSubvectorInsertExtractToSubRegs takes this into account. So if
+  // we have a fixed length subvector, we need to adjust the index by 1/vscale.
+  if (SubVecVT.isFixedLengthVector()) {
+    assert(VLen);
+    unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock;
+    auto Decompose =
+        RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+            ContainerVecVT, ContainerSubVecVT, OrigIdx / Vscale, TRI);
+    SubRegIdx = Decompose.first;
+    RemIdx = ElementCount::getFixed((Decompose.second * Vscale) +
+                                    (OrigIdx % Vscale));
+  } else {
+    auto Decompose =
+        RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+            ContainerVecVT, ContainerSubVecVT, OrigIdx, TRI);
+    SubRegIdx = Decompose.first;
+    RemIdx = ElementCount::getScalable(Decompose.second);
+  }
+
+  RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(ContainerSubVecVT);
   bool IsSubVecPartReg = SubVecLMUL == RISCVII::VLMUL::LMUL_F2 ||
                          SubVecLMUL == RISCVII::VLMUL::LMUL_F4 ||
                          SubVecLMUL == RISCVII::VLMUL::LMUL_F8;
+  bool AlignedToVecReg = !IsSubVecPartReg;
+  if (SubVecVT.isFixedLengthVector())
+    AlignedToVecReg &= SubVecVT.getSizeInBits() ==
+                       ContainerSubVecVT.getSizeInBits().getKnownMinValue() *
+                           (*VLen / RISCV::RVVBitsPerBlock);
 
   // 1. If the Idx has been completely eliminated and this subvector's size is
   // a vector register or a multiple thereof, or the surrounding elements are
   // undef, then this is a subvector insert which naturally aligns to a vector
   // register. These can easily be handled using subregister manipulation.
-  // 2. If the subvector is smaller than a vector register, then the insertion
-  // must preserve the undisturbed elements of the register. We do this by
-  // lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type
-  // (which resolves to a subregister copy), performing a VSLIDEUP to place the
-  // subvector within the vector register, and an INSERT_SUBVECTOR of that
+  // 2. If the subvector isn't exactly aligned to a vector register group, then
+  // the insertion must preserve the undisturbed elements of the register. We do
+  // this by lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector
+  // type (which resolves to a subregister copy), performing a VSLIDEUP to place
+  // the subvector within the vector register, and an INSERT_SUBVECTOR of that
   // LMUL=1 type back into the larger vector (resolving to another subregister
   // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type
   // to avoid allocating a large register group to hold our subvector.
-  if (RemIdx == 0 && (!IsSubVecPartReg || Vec.isUndef()))
+  if (RemIdx.isZero() && (AlignedToVecReg || Vec.isUndef())) {
+    if (SubVecVT.isFixedLengthVector()) {
+      // We may get NoSubRegister if inserting at index 0 and the subvec
+      // container is the same as the vector, e.g. vec=v4i32,subvec=v4i32,idx=0
+      if (SubRegIdx == RISCV::NoSubRegister) {
+        assert(OrigIdx == 0);
+        return Op;
+      }
+
+      SDValue Insert =
+          DAG.getTargetInsertSubreg(SubRegIdx, DL, ContainerVecVT, Vec, SubVec);
+      if (VecVT.isFixedLengthVector())
+        Insert = convertFromScalableVector(VecVT, Insert, DAG, Subtarget);
+      return Insert;
+    }
     return Op;
+  }
 
   // VSLIDEUP works by leaving elements 0<i<OFFSET undisturbed, elements
   // OFFSET<=i<VL set to the "subvector" and vl<=i<VLMAX set to the tail policy
   // (in our case undisturbed). This means we can set up a subvector insertion
   // where OFFSET is the insertion offset, and the VL is the OFFSET plus the
   // size of the subvector.
-  MVT InterSubVT = VecVT;
+  MVT InterSubVT = ContainerVecVT;
   SDValue AlignedExtract = Vec;
-  unsigned AlignedIdx = OrigIdx - RemIdx;
-  if (VecVT.bitsGT(getLMUL1VT(VecVT))) {
-    InterSubVT = getLMUL1VT(VecVT);
+  unsigned AlignedIdx = OrigIdx - RemIdx.getKnownMinValue();
+  if (SubVecVT.isFixedLengthVector())
+    AlignedIdx /= *VLen / RISCV::RVVBitsPerBlock;
+  if (ContainerVecVT.bitsGT(getLMUL1VT(ContainerVecVT))) {
+    InterSubVT = getLMUL1VT(ContainerVecVT);
     // Extract a subvector equal to the nearest full vector register type. This
     // should resolve to a EXTRACT_SUBREG instruction.
     AlignedExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InterSubVT, Vec,
@@ -9743,25 +9810,23 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                        DAG.getUNDEF(InterSubVT), SubVec,
                        DAG.getVectorIdxConstant(0, DL));
 
-  auto [Mask, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget);
+  auto [Mask, VL] = getDefaultVLOps(VecVT, ContainerVecVT, DL, DAG, Subtarget);
 
-  ElementCount EndIndex =
-      ElementCount::getScalable(RemIdx) + SubVecVT.getVectorElementCount();
-  VL = computeVLMax(SubVecVT, DL, DAG);
+  ElementCount EndIndex = RemIdx + SubVecVT.getVectorElementCount();
+  VL = DAG.getElementCount(DL, XLenVT, SubVecVT.getVectorElementCount());
 
   // Use tail agnostic policy if we're inserting over InterSubVT's tail.
   unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
-  if (EndIndex == InterSubVT.getVectorElementCount())
+  if (isKnownEQ(EndIndex, InterSubVT.getVectorElementCount(), Subtarget))
     Policy = RISCVII::TAIL_AGNOSTIC;
 
   // If we're inserting into the lowest elements, use a tail undisturbed
   // vmv.v.v.
-  if (RemIdx == 0) {
+  if (RemIdx.isZero()) {
     SubVec = DAG.getNode(RISCVISD::VMV_V_V_VL, DL, InterSubVT, AlignedExtract,
                          SubVec, VL);
   } else {
-    SDValue SlideupAmt =
-        DAG.getVScale(DL, XLenVT, APInt(XLenVT.getSizeInBits(), RemIdx));
+    SDValue SlideupAmt = DAG.getElementCount(DL, XLenVT, RemIdx);
 
     // Construct the vector length corresponding to RemIdx + length(SubVecVT).
     VL = DAG.getNode(ISD::ADD, DL, XLenVT, SlideupAmt, VL);
@@ -9772,10 +9837,13 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
 
   // If required, insert this subvector back into the correct vector register.
   // This should resolve to an INSERT_SUBREG instruction.
-  if (VecVT.bitsGT(InterSubVT))
-    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, Vec, SubVec,
+  if (ContainerVecVT.bitsGT(InterSubVT))
+    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVecVT, Vec, SubVec,
                          DAG.getVectorIdxConstant(AlignedIdx, DL));
 
+  if (VecVT.isFixedLengthVector())
+    SubVec = convertFromScalableVector(VecVT, SubVec, DAG, Subtarget);
+
   // We might have bitcast from a mask type: cast back to the original type if
   // required.
   return DAG.getBitcast(Op.getSimpleValueType(), SubVec);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index 9f0240c53b219a..9dae07e2928706 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -9,39 +9,63 @@
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV64VLS %s
 
 define <vscale x 8 x i32> @insert_nxv8i32_v2i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
-; CHECK-LABEL: insert_nxv8i32_v2i32_0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m4, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_nxv8i32_v2i32_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v12, (a0)
+; VLA-NEXT:    vsetivli zero, 2, e32, m4, tu, ma
+; VLA-NEXT:    vmv.v.v v8, v12
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v2i32_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v12, (a0)
+; VLS-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
+; VLS-NEXT:    vmv.v.v v8, v12
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 0)
   ret <vscale x 8 x i32> %v
 }
 
 define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, ptr %svp) {
-; CHECK-LABEL: insert_nxv8i32_v2i32_2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 2
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_nxv8i32_v2i32_2:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v12, (a0)
+; VLA-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 2
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v2i32_2:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v12, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v8, v12, 2
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 2)
   ret <vscale x 8 x i32> %v
 }
 
 define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %svp) {
-; CHECK-LABEL: insert_nxv8i32_v2i32_6:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 6
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_nxv8i32_v2i32_6:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v12, (a0)
+; VLA-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 6
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v2i32_6:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v12, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v9, v12, 2
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 6)
   ret <vscale x 8 x i32> %v
@@ -58,9 +82,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, ptr %
 ;
 ; VLS-LABEL: insert_nxv8i32_v8i32_0:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vl2re32.v v12, (a0)
-; VLS-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; VLS-NEXT:    vmv.v.v v8, v12
+; VLS-NEXT:    vl2re32.v v8, (a0)
 ; VLS-NEXT:    ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 0)
@@ -78,9 +100,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v8i32_8(<vscale x 8 x i32> %vec, ptr %
 ;
 ; VLS-LABEL: insert_nxv8i32_v8i32_8:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vl2re32.v v12, (a0)
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v12, 8
+; VLS-NEXT:    vl2re32.v v10, (a0)
 ; VLS-NEXT:    ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 8)
@@ -98,6 +118,31 @@ define <vscale x 8 x i32> @insert_nxv8i32_undef_v2i32_0(ptr %svp) {
   ret <vscale x 8 x i32> %v
 }
 
+define <vscale x 2 x i32> @insert_nxv8i32_v4i32_0(<vscale x 2 x i32> %vec, <4 x i32> %subvec) {
+; VLA-LABEL: insert_nxv8i32_v4i32_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, tu, ma
+; VLA-NEXT:    vmv.v.v v8, v9
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v4i32_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv1r.v v8, v9
+; VLS-NEXT:    ret
+  %v = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v4i32(<vscale x 2 x i32> %vec, <4 x i32> %subvec, i64 0)
+  ret <vscale x 2 x i32> %v
+}
+
+
+define <4 x i32> @insert_v4i32_v4i32_0(<4 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: insert_v4i32_v4i32_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %v = call <4 x i32> @llvm.vector.insert.v4i32.v4i32(<4 x i32> %vec, <4 x i32> %subvec, i64 0)
+  ret <4 x i32> %v
+}
+
 define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) {
 ; VLA-LABEL: insert_v4i32_v2i32_0:
 ; VLA:       # %bb.0:
@@ -175,6 +220,31 @@ define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr %svp) {
   ret void
 }
 
+; This tests the code path in RISCVISelDAGToDAG::Select where we select an
+; insert_subvector with a fixed vector and fixed subvector type. The phi here is
+; used to prevent the fixed insert_subvector from being combined away into a
+; scalable insert_subvector.
+define <4 x i32> @insert_v4i32_undef_v2i32_0_phi(<2 x i32> %subvec, i1 %cond) {
+; CHECK-LABEL: insert_v4i32_undef_v2i32_0_phi:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    andi a0, a0, 1
+; CHECK-NEXT:    bnez a0, .LBB11_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:  .LBB11_2: # %bar
+; CHECK-NEXT:    ret
+entry:
+  br i1 %cond, label %foo, label %bar
+foo:
+  %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %subvec, i64 0)
+  br label %bar
+bar:
+  %w = phi <4 x i32> [%v, %foo], [zeroinitializer, %entry]
+  ret <4 x i32> %w
+}
+
+
 define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
 ; VLA-LABEL: insert_v8i32_v2i32_0:
 ; VLA:       # %bb.0:
@@ -193,7 +263,7 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; VLS-NEXT:    vle32.v v8, (a1)
 ; VLS-NEXT:    vl2re32.v v10, (a0)
-; VLS-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; VLS-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
 ; VLS-NEXT:    vmv.v.v v10, v8
 ; VLS-NEXT:    vs2r.v v10, (a0)
 ; VLS-NEXT:    ret
@@ -220,11 +290,11 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
 ; VLS-LABEL: insert_v8i32_v2i32_2:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; VLS-NEXT:    vl2re32.v v8, (a0)
-; VLS-NEXT:    vle32.v v10, (a1)
-; VLS-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v10, 2
-; VLS-NEXT:    vs2r.v v8, (a0)
+; VLS-NEXT:    vle32.v v8, (a1)
+; VLS-NEXT:    vl2re32.v v10, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v10, v8, 2
+; VLS-NEXT:    vs2r.v v10, (a0)
 ; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <8 x i32>, ptr %vp
@@ -247,11 +317,11 @@ define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) {
 ; VLS-LABEL: insert_v8i32_v2i32_6:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; VLS-NEXT:    vl2re32.v v8, (a0)
-; VLS-NEXT:    vle32.v v10, (a1)
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v10, 6
-; VLS-NEXT:    vs2r.v v8, (a0)
+; VLS-NEXT:    vle32.v v8, (a1)
+; VLS-NEXT:    vl2re32.v v10, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v11, v8, 2
+; VLS-NEXT:    vs2r.v v10, (a0)
 ; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <8 x i32>, ptr %vp
@@ -274,9 +344,9 @@ define void @insert_v8i32_undef_v2i32_6(ptr %vp, ptr %svp) {
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; VLS-NEXT:    vle32.v v8, (a1)
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v10, v8, 6
-; VLS-NEXT:    vs2r.v v10, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v9, v8, 2
+; VLS-NEXT:    vs2r.v v8, (a0)
 ; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> undef, <2 x i32> %sv, i64 6)
@@ -542,9 +612,7 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, ptr %out) {
 ; VLS-LABEL: insert_v2i64_nxv16i64:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vl1re64.v v8, (a0)
-; VLS-NEXT:    vl1re64.v v16, (a1)
-; VLS-NEXT:    vsetivli zero, 6, e64, m8, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 4
+; VLS-NEXT:    vl1re64.v v10, (a1)
 ; VLS-NEXT:    vs8r.v v8, (a2)
 ; VLS-NEXT:    ret
   %sv0 = load <2 x i64>, ptr %psv0
@@ -586,10 +654,8 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) {
 ;
 ; VLS-LABEL: insert_v2i64_nxv16i64_lo2:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vl1re64.v v8, (a0)
-; VLS-NEXT:    vsetivli zero, 4, e64, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v16, v8, 2
-; VLS-NEXT:    vs8r.v v16, (a1)
+; VLS-NEXT:    vl1re64.v v9, (a0)
+; VLS-NEXT:    vs8r.v v8, (a1)
 ; VLS-NEXT:    ret
   %sv = load <2 x i64>, ptr %psv
   %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 2)
@@ -632,7 +698,6 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 80
 ; RV32-NEXT:    ret
-;
 ; RV64-LABEL: insert_v2i64_nxv16i64_hi:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -80
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index 8474f95edd813f..41ab706ababd76 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -59,25 +59,39 @@ define <8 x i16> @concat_8xv1i16(<1 x i16> %a, <1 x i16> %b, <1 x i16> %c, <1 x
 }
 
 define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: concat_2xv4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v9
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_2xv4i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vmv1r.v v10, v9
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_2xv4i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i32> %ab
 }
 
 define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
-; CHECK-LABEL: concat_4xv2i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 2
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_4xv2i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v10, v11, 2
+; VLA-NEXT:    vslideup.vi v8, v9, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_4xv2i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv1r.v v13, v10
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vmv1r.v v12, v8
+; VLS-NEXT:    vslideup.vi v13, v11, 2
+; VLS-NEXT:    vslideup.vi v12, v9, 2
+; VLS-NEXT:    vmv2r.v v8, v12
+; VLS-NEXT:    ret
   %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %abcd = shufflevector <4 x i32> %ab, <4 x i32> %cd, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -85,21 +99,38 @@ define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
 }
 
 define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x i32> %d, <1 x i32> %e, <1 x i32> %f, <1 x i32> %g, <1 x i32> %h) {
-; CHECK-LABEL: concat_8xv1i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v14, v15, 1
-; CHECK-NEXT:    vslideup.vi v12, v13, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 1
-; CHECK-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 4
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_8xv1i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vslideup.vi v14, v15, 1
+; VLA-NEXT:    vslideup.vi v12, v13, 1
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v12, v14, 2
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vslideup.vi v10, v11, 1
+; VLA-NEXT:    vslideup.vi v8, v9, 1
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v10, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 4
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_8xv1i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv1r.v v17, v12
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vslideup.vi v14, v15, 1
+; VLS-NEXT:    vmv1r.v v16, v8
+; VLS-NEXT:    vslideup.vi v17, v13, 1
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v17, v14, 2
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vslideup.vi v10, v11, 1
+; VLS-NEXT:    vslideup.vi v16, v9, 1
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v16, v10, 2
+; VLS-NEXT:    vmv2r.v v8, v16
+; VLS-NEXT:    ret
   %ab = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
   %cd = shufflevector <1 x i32> %c, <1 x i32> %d, <2 x i32> <i32 0, i32 1>
   %abcd = shufflevector <2 x i32> %ab, <2 x i32> %cd, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -111,28 +142,36 @@ define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x
 }
 
 define <16 x i32> @concat_2xv8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: concat_2xv8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv2r.v v12, v10
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_2xv8i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vmv2r.v v12, v10
+; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_2xv8i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    ret
   %v = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i32> %v
 }
 
 define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
-; CHECK-LABEL: concat_4xv4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v14, v11
-; CHECK-NEXT:    vmv1r.v v12, v10
-; CHECK-NEXT:    vmv1r.v v10, v9
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 4
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_4xv4i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vmv1r.v v14, v11
+; VLA-NEXT:    vmv1r.v v12, v10
+; VLA-NEXT:    vmv1r.v v10, v9
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v12, v14, 4
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_4xv4i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %abcd = shufflevector <8 x i32> %ab, <8 x i32> %cd, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -140,21 +179,35 @@ define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 }
 
 define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d, <2 x i32> %e, <2 x i32> %f, <2 x i32> %g, <2 x i32> %h) {
-; CHECK-LABEL: concat_8xv2i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v14, v15, 2
-; CHECK-NEXT:    vslideup.vi v12, v13, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 2
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_8xv2i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v14, v15, 2
+; VLA-NEXT:    vslideup.vi v12, v13, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v12, v14, 4
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v10, v11, 2
+; VLA-NEXT:    vslideup.vi v8, v9, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_8xv2i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv1r.v v19, v14
+; VLS-NEXT:    vmv1r.v v18, v12
+; VLS-NEXT:    vmv1r.v v17, v10
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vmv1r.v v16, v8
+; VLS-NEXT:    vslideup.vi v19, v15, 2
+; VLS-NEXT:    vslideup.vi v18, v13, 2
+; VLS-NEXT:    vslideup.vi v17, v11, 2
+; VLS-NEXT:    vslideup.vi v16, v9, 2
+; VLS-NEXT:    vmv4r.v v8, v16
+; VLS-NEXT:    ret
   %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %abcd = shufflevector <4 x i32> %ab, <4 x i32> %cd, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -176,9 +229,6 @@ define <32 x i32> @concat_2xv16i32(<16 x i32> %a, <16 x i32> %b) {
 ;
 ; VLS-LABEL: concat_2xv16i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv4r.v v16, v12
-; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 16
 ; VLS-NEXT:    ret
   %ab = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   ret <32 x i32> %ab
@@ -200,14 +250,6 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x
 ;
 ; VLS-LABEL: concat_4xv8i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv2r.v v20, v14
-; VLS-NEXT:    vmv2r.v v16, v12
-; VLS-NEXT:    vmv2r.v v12, v10
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v16, v20, 8
-; VLS-NEXT:    vslideup.vi v8, v12, 8
-; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 16
 ; VLS-NEXT:    ret
   %ab = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %cd = shufflevector <8 x i32> %c, <8 x i32> %d, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -242,25 +284,6 @@ define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 ;
 ; VLS-LABEL: concat_8xv4i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv1r.v v18, v15
-; VLS-NEXT:    vmv1r.v v20, v14
-; VLS-NEXT:    vmv1r.v v22, v13
-; VLS-NEXT:    vmv1r.v v16, v12
-; VLS-NEXT:    vmv1r.v v14, v11
-; VLS-NEXT:    vmv1r.v v12, v10
-; VLS-NEXT:    vmv1r.v v10, v9
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v20, v18, 4
-; VLS-NEXT:    vslideup.vi v16, v22, 4
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v16, v20, 8
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v12, v14, 4
-; VLS-NEXT:    vslideup.vi v8, v10, 4
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v12, 8
-; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 16
 ; VLS-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

>From 5a86047d5a9873305951a0f7b4a576286b4c1637 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 1 Apr 2024 14:02:36 +0800
Subject: [PATCH 2/2] Address review comments:

* Rework the subvec size check by introducing a isKnownMultipleOf helper function on TypeSize/ElementCount. We can then reason that the subvector is exactly vector register sized if it is a multiple of the vector register size.
* Rename variables to clarify we are checking the subvec size
* Update wording of comment
---
 llvm/include/llvm/Support/TypeSize.h        | 12 +++++
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 14 +++---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 50 +++++++--------------
 llvm/lib/Target/RISCV/RISCVSubtarget.h      | 11 +++++
 4 files changed, 46 insertions(+), 41 deletions(-)

diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h
index 68dbe1ea3062ab..c6779e258be7cb 100644
--- a/llvm/include/llvm/Support/TypeSize.h
+++ b/llvm/include/llvm/Support/TypeSize.h
@@ -181,6 +181,18 @@ template <typename LeafTy, typename ValueTy> class FixedOrScalableQuantity {
     return getKnownMinValue() % RHS == 0;
   }
 
+  /// Returns whether or not the callee is known to be a multiple of RHS.
+  constexpr bool isKnownMultipleOf(const FixedOrScalableQuantity &RHS) const {
+    // x % y == 0 => x % y == 0
+    // x % y == 0 => (vscale * x) % y == 0
+    // x % y == 0 => (vscale * x) % (vscale * y) == 0
+    // but
+    // x % y == 0 !=> x % (vscale * y) == 0
+    if (!isScalable() && RHS.isScalable())
+      return false;
+    return getKnownMinValue() % RHS.getKnownMinValue() == 0;
+  }
+
   // Return the minimum value with the assumption that the count is exact.
   // Use in places where a scalable count doesn't make sense (e.g. non-vector
   // types, or vectors in backends which don't support scalable vectors).
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 3fea03ec892dc2..169dde057c402b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2064,13 +2064,13 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     // Establish the correct scalable-vector types for any fixed-length type.
     if (SubVecVT.isFixedLengthVector()) {
       SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT);
-      bool AlignedToVecReg = false;
-      if (auto VLen = Subtarget->getRealVLen();
-          VLen && SubVecVT.getSizeInBits() ==
-                      SubVecContainerVT.getSizeInBits().getKnownMinValue() *
-                          (*VLen / RISCV::RVVBitsPerBlock))
-        AlignedToVecReg = true;
-      assert(Idx == 0 && (AlignedToVecReg || V.isUndef()));
+#ifndef NDEBUG
+      TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
+      bool ExactlyVecRegSized =
+          Subtarget->expandVScale(SubVecVT.getSizeInBits())
+              .isKnownMultipleOf(Subtarget->expandVScale(VecRegSize));
+      assert(Idx == 0 && (ExactlyVecRegSized || V.isUndef()));
+#endif
     }
     MVT ContainerVT = VT;
     if (VT.isFixedLengthVector())
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 42302b86775ad2..073db13e0ef8e9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9596,21 +9596,6 @@ SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op,
                            Vec, Mask, VL, DL, DAG, Subtarget);
 }
 
-/// Returns true if \p LHS is known to be equal to \p RHS, taking into account
-/// if VLEN is exactly known by \p Subtarget and thus vscale when handling
-/// scalable quantities.
-static bool isKnownEQ(ElementCount LHS, ElementCount RHS,
-                      const RISCVSubtarget &Subtarget) {
-  if (auto VLen = Subtarget.getRealVLen()) {
-    const unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock;
-    if (LHS.isScalable())
-      LHS = ElementCount::getFixed(LHS.getKnownMinValue() * Vscale);
-    if (RHS.isScalable())
-      RHS = ElementCount::getFixed(RHS.getKnownMinValue() * Vscale);
-  }
-  return LHS == RHS;
-}
-
 SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                    SelectionDAG &DAG) const {
   SDValue Vec = Op.getOperand(0);
@@ -9748,29 +9733,25 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     RemIdx = ElementCount::getScalable(Decompose.second);
   }
 
-  RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(ContainerSubVecVT);
-  bool IsSubVecPartReg = SubVecLMUL == RISCVII::VLMUL::LMUL_F2 ||
-                         SubVecLMUL == RISCVII::VLMUL::LMUL_F4 ||
-                         SubVecLMUL == RISCVII::VLMUL::LMUL_F8;
-  bool AlignedToVecReg = !IsSubVecPartReg;
-  if (SubVecVT.isFixedLengthVector())
-    AlignedToVecReg &= SubVecVT.getSizeInBits() ==
-                       ContainerSubVecVT.getSizeInBits().getKnownMinValue() *
-                           (*VLen / RISCV::RVVBitsPerBlock);
+  TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
+  bool ExactlyVecRegSized =
+      Subtarget.expandVScale(SubVecVT.getSizeInBits())
+          .isKnownMultipleOf(Subtarget.expandVScale(VecRegSize));
 
   // 1. If the Idx has been completely eliminated and this subvector's size is
   // a vector register or a multiple thereof, or the surrounding elements are
   // undef, then this is a subvector insert which naturally aligns to a vector
   // register. These can easily be handled using subregister manipulation.
-  // 2. If the subvector isn't exactly aligned to a vector register group, then
-  // the insertion must preserve the undisturbed elements of the register. We do
-  // this by lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector
-  // type (which resolves to a subregister copy), performing a VSLIDEUP to place
-  // the subvector within the vector register, and an INSERT_SUBVECTOR of that
-  // LMUL=1 type back into the larger vector (resolving to another subregister
-  // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type
-  // to avoid allocating a large register group to hold our subvector.
-  if (RemIdx.isZero() && (AlignedToVecReg || Vec.isUndef())) {
+  // 2. If the subvector isn't an exact multiple of a valid register group size,
+  // then the insertion must preserve the undisturbed elements of the register.
+  // We do this by lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1
+  // vector type (which resolves to a subregister copy), performing a VSLIDEUP
+  // to place the subvector within the vector register, and an INSERT_SUBVECTOR
+  // of that LMUL=1 type back into the larger vector (resolving to another
+  // subregister operation). See below for how our VSLIDEUP works. We go via a
+  // LMUL=1 type to avoid allocating a large register group to hold our
+  // subvector.
+  if (RemIdx.isZero() && (ExactlyVecRegSized || Vec.isUndef())) {
     if (SubVecVT.isFixedLengthVector()) {
       // We may get NoSubRegister if inserting at index 0 and the subvec
       // container is the same as the vector, e.g. vec=v4i32,subvec=v4i32,idx=0
@@ -9817,7 +9798,8 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
 
   // Use tail agnostic policy if we're inserting over InterSubVT's tail.
   unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
-  if (isKnownEQ(EndIndex, InterSubVT.getVectorElementCount(), Subtarget))
+  if (Subtarget.expandVScale(EndIndex) ==
+      Subtarget.expandVScale(InterSubVT.getVectorElementCount()))
     Policy = RISCVII::TAIL_AGNOSTIC;
 
   // If we're inserting into the lowest elements, use a tail undisturbed
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index ba108912d93400..d7e788e8b502a6 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -200,6 +200,17 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
     return Min;
   }
 
+  /// If the ElementCount or TypeSize \p X is scalable and VScale (VLEN) is
+  /// exactly known, returns \p X converted to a fixed quantity. Otherwise
+  /// returns \p X unmodified.
+  template <typename Quantity> Quantity expandVScale(Quantity X) const {
+    if (auto VLen = getRealVLen(); VLen && X.isScalable()) {
+      const unsigned VScale = *VLen / RISCV::RVVBitsPerBlock;
+      X = Quantity::getFixed(X.getKnownMinValue() * VScale);
+    }
+    return X;
+  }
+
   RISCVABI::ABI getTargetABI() const { return TargetABI; }
   bool isSoftFPABI() const {
     return TargetABI == RISCVABI::ABI_LP64 ||