[llvm] [RISCV] Handle fixed length vectors with exact VLEN in lowerINSERT_SUBVECTOR (PR #84107)

Luke Lau via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 30 05:31:21 PDT 2024


https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/84107

>From 0bb35d87f2e076de5c498b5667e4db75c8e47d2c Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 26 Feb 2024 14:42:00 +0800
Subject: [PATCH 1/5] [RISCV] Handle fixed length vectors with exact VLEN in
 lowerINSERT_SUBVECTOR

This is the insert_subvector equivalent to #79949, where we can avoid sliding up
by the full LMUL amount if we know the exact subregister the subvector will be
inserted into.
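
As a rough illustration of the index arithmetic this enables (a sketch with
hypothetical variable names, not the code added here; the real lowering goes
through decomposeSubvectorInsertExtractToSubRegs):

  // With an exactly known VLEN, a fixed-length insert index maps to a specific
  // vector register of the LMUL group plus an offset inside that register.
  // E.g. VLEN = 128, VecVT = <vscale x 8 x i32> (LMUL=4), SubVecVT = <2 x i32>:
  unsigned EltsPerVecReg = VLen / VecVT.getScalarSizeInBits(); // 128/32 = 4
  unsigned VecRegInGroup = OrigIdx / EltsPerVecReg; // idx 6 -> register 1
  unsigned OffsetInReg   = OrigIdx % EltsPerVecReg; // idx 6 -> offset 2
  // OffsetInReg == 0 means the subvector starts on a register boundary, so a
  // subregister insert suffices; otherwise only an LMUL=1 vslideup within that
  // single register is needed rather than sliding up the whole register group.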

This mirrors the lowerEXTRACT_SUBVECTOR changes in that we handle this in two
parts:

- We handle fixed-length subvector types by converting the subvector to a
  scalable vector. Unlike EXTRACT_SUBVECTOR, we may need to convert the
  vector being inserted into as well.

- Whenever we don't need a vslideup because either the subvector aligns to a
  vector register group *or* the vector is undef, we need to emit an
  insert_subreg ourselves, because RISCVISelDAGToDAG::Select doesn't correctly
  handle fixed-length subvectors yet (see d7a28f7ad). A trimmed sketch of this
  path follows the list.
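
A trimmed sketch of that aligned path, condensed from the lowerINSERT_SUBVECTOR
change in this patch (the NoSubRegister special case is omitted):

  if (RemIdx.isZero() && (AlignedToVecReg || Vec.isUndef())) {
    if (SubVecVT.isFixedLengthVector()) {
      // The subvector lands on an exact subregister of the register group, so
      // emit the insert_subreg here instead of leaving it to ISel.
      SDValue Insert =
          DAG.getTargetInsertSubreg(SubRegIdx, DL, ContainerVecVT, Vec, SubVec);
      if (VecVT.isFixedLengthVector())
        Insert = convertFromScalableVector(VecVT, Insert, DAG, Subtarget);
      return Insert;
    }
    return Op;
  }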

I've left RISCVISelDAGToDAG::Select untouched for now (minus relaxing an
invariant), so that the insert_subvector and extract_subvector code paths are
the same.

We should teach it to properly handle fixed-length subvectors in a follow-up
patch, so that the "exact subregister" logic is handled in one place instead of
being spread across both RISCVISelDAGToDAG.cpp and RISCVISelLowering.cpp.
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp   |   8 +-
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 130 ++++++++---
 .../rvv/fixed-vectors-insert-subvector.ll     | 163 +++++++++-----
 .../RISCV/rvv/fixed-vectors-shuffle-concat.ll | 205 ++++++++++--------
 4 files changed, 334 insertions(+), 172 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index da1543bd7112a2..cc8d4bb0c7dab3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2099,8 +2099,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     MVT SubVecContainerVT = SubVecVT;
     // Establish the correct scalable-vector types for any fixed-length type.
     if (SubVecVT.isFixedLengthVector()) {
-      assert(Idx == 0 && V.isUndef());
       SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT);
+      bool AlignedToVecReg = false;
+      if (auto VLen = Subtarget->getRealVLen();
+          VLen && SubVecVT.getSizeInBits() ==
+                      SubVecContainerVT.getSizeInBits().getKnownMinValue() *
+                          (*VLen / RISCV::RVVBitsPerBlock))
+        AlignedToVecReg = true;
+      assert(Idx == 0 && (AlignedToVecReg || V.isUndef()));
     }
     MVT ContainerVT = VT;
     if (VT.isFixedLengthVector())
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index fe8edcf39681db..9bb984632124ea 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9723,6 +9723,21 @@ SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op,
                            Vec, Mask, VL, DL, DAG, Subtarget);
 }
 
+/// Returns true if \p LHS is known to be equal to \p RHS, taking into account
+/// if VLEN is exactly known by \p Subtarget and thus vscale when handling
+/// scalable quantities.
+static bool isKnownEQ(ElementCount LHS, ElementCount RHS,
+                      const RISCVSubtarget &Subtarget) {
+  if (auto VLen = Subtarget.getRealVLen()) {
+    const unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock;
+    if (LHS.isScalable())
+      LHS = ElementCount::getFixed(LHS.getKnownMinValue() * Vscale);
+    if (RHS.isScalable())
+      RHS = ElementCount::getFixed(RHS.getKnownMinValue() * Vscale);
+  }
+  return LHS == RHS;
+}
+
 SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                    SelectionDAG &DAG) const {
   SDValue Vec = Op.getOperand(0);
@@ -9772,12 +9787,13 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     }
   }
 
-  // If the subvector vector is a fixed-length type, we cannot use subregister
-  // manipulation to simplify the codegen; we don't know which register of a
-  // LMUL group contains the specific subvector as we only know the minimum
-  // register size. Therefore we must slide the vector group up the full
-  // amount.
-  if (SubVecVT.isFixedLengthVector()) {
+  // If the subvector vector is a fixed-length type and we don't know VLEN
+  // exactly, we cannot use subregister manipulation to simplify the codegen; we
+  // don't know which register of a LMUL group contains the specific subvector
+  // as we only know the minimum register size. Therefore we must slide the
+  // vector group up the full amount.
+  const auto VLen = Subtarget.getRealVLen();
+  if (SubVecVT.isFixedLengthVector() && !VLen) {
     if (OrigIdx == 0 && Vec.isUndef() && !VecVT.isFixedLengthVector())
       return Op;
     MVT ContainerVT = VecVT;
@@ -9825,41 +9841,92 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     return DAG.getBitcast(Op.getValueType(), SubVec);
   }
 
-  unsigned SubRegIdx, RemIdx;
-  std::tie(SubRegIdx, RemIdx) =
-      RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
-          VecVT, SubVecVT, OrigIdx, TRI);
+  MVT ContainerVecVT = VecVT;
+  if (VecVT.isFixedLengthVector()) {
+    ContainerVecVT = getContainerForFixedLengthVector(VecVT);
+    Vec = convertToScalableVector(ContainerVecVT, Vec, DAG, Subtarget);
+  }
 
-  RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
+  MVT ContainerSubVecVT = SubVecVT;
+  if (SubVecVT.isFixedLengthVector()) {
+    ContainerSubVecVT = getContainerForFixedLengthVector(SubVecVT);
+    SubVec = convertToScalableVector(ContainerSubVecVT, SubVec, DAG, Subtarget);
+  }
+
+  unsigned SubRegIdx;
+  ElementCount RemIdx;
+  // insert_subvector scales the index by vscale if the subvector is scalable,
+  // and decomposeSubvectorInsertExtractToSubRegs takes this into account. So if
+  // we have a fixed length subvector, we need to adjust the index by 1/vscale.
+  if (SubVecVT.isFixedLengthVector()) {
+    assert(VLen);
+    unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock;
+    auto Decompose =
+        RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+            ContainerVecVT, ContainerSubVecVT, OrigIdx / Vscale, TRI);
+    SubRegIdx = Decompose.first;
+    RemIdx = ElementCount::getFixed((Decompose.second * Vscale) +
+                                    (OrigIdx % Vscale));
+  } else {
+    auto Decompose =
+        RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+            ContainerVecVT, ContainerSubVecVT, OrigIdx, TRI);
+    SubRegIdx = Decompose.first;
+    RemIdx = ElementCount::getScalable(Decompose.second);
+  }
+
+  RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(ContainerSubVecVT);
   bool IsSubVecPartReg = SubVecLMUL == RISCVII::VLMUL::LMUL_F2 ||
                          SubVecLMUL == RISCVII::VLMUL::LMUL_F4 ||
                          SubVecLMUL == RISCVII::VLMUL::LMUL_F8;
+  bool AlignedToVecReg = !IsSubVecPartReg;
+  if (SubVecVT.isFixedLengthVector())
+    AlignedToVecReg &= SubVecVT.getSizeInBits() ==
+                       ContainerSubVecVT.getSizeInBits().getKnownMinValue() *
+                           (*VLen / RISCV::RVVBitsPerBlock);
 
   // 1. If the Idx has been completely eliminated and this subvector's size is
   // a vector register or a multiple thereof, or the surrounding elements are
   // undef, then this is a subvector insert which naturally aligns to a vector
   // register. These can easily be handled using subregister manipulation.
-  // 2. If the subvector is smaller than a vector register, then the insertion
-  // must preserve the undisturbed elements of the register. We do this by
-  // lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type
-  // (which resolves to a subregister copy), performing a VSLIDEUP to place the
-  // subvector within the vector register, and an INSERT_SUBVECTOR of that
+  // 2. If the subvector isn't exactly aligned to a vector register group, then
+  // the insertion must preserve the undisturbed elements of the register. We do
+  // this by lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector
+  // type (which resolves to a subregister copy), performing a VSLIDEUP to place
+  // the subvector within the vector register, and an INSERT_SUBVECTOR of that
   // LMUL=1 type back into the larger vector (resolving to another subregister
   // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type
   // to avoid allocating a large register group to hold our subvector.
-  if (RemIdx == 0 && (!IsSubVecPartReg || Vec.isUndef()))
+  if (RemIdx.isZero() && (AlignedToVecReg || Vec.isUndef())) {
+    if (SubVecVT.isFixedLengthVector()) {
+      // We may get NoSubRegister if inserting at index 0 and the subvec
+      // container is the same as the vector, e.g. vec=v4i32,subvec=v4i32,idx=0
+      if (SubRegIdx == RISCV::NoSubRegister) {
+        assert(OrigIdx == 0);
+        return Op;
+      }
+
+      SDValue Insert =
+          DAG.getTargetInsertSubreg(SubRegIdx, DL, ContainerVecVT, Vec, SubVec);
+      if (VecVT.isFixedLengthVector())
+        Insert = convertFromScalableVector(VecVT, Insert, DAG, Subtarget);
+      return Insert;
+    }
     return Op;
+  }
 
   // VSLIDEUP works by leaving elements 0<i<OFFSET undisturbed, elements
   // OFFSET<=i<VL set to the "subvector" and vl<=i<VLMAX set to the tail policy
   // (in our case undisturbed). This means we can set up a subvector insertion
   // where OFFSET is the insertion offset, and the VL is the OFFSET plus the
   // size of the subvector.
-  MVT InterSubVT = VecVT;
+  MVT InterSubVT = ContainerVecVT;
   SDValue AlignedExtract = Vec;
-  unsigned AlignedIdx = OrigIdx - RemIdx;
-  if (VecVT.bitsGT(getLMUL1VT(VecVT))) {
-    InterSubVT = getLMUL1VT(VecVT);
+  unsigned AlignedIdx = OrigIdx - RemIdx.getKnownMinValue();
+  if (SubVecVT.isFixedLengthVector())
+    AlignedIdx /= *VLen / RISCV::RVVBitsPerBlock;
+  if (ContainerVecVT.bitsGT(getLMUL1VT(ContainerVecVT))) {
+    InterSubVT = getLMUL1VT(ContainerVecVT);
     // Extract a subvector equal to the nearest full vector register type. This
     // should resolve to a EXTRACT_SUBREG instruction.
     AlignedExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InterSubVT, Vec,
@@ -9870,25 +9937,23 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                        DAG.getUNDEF(InterSubVT), SubVec,
                        DAG.getVectorIdxConstant(0, DL));
 
-  auto [Mask, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget);
+  auto [Mask, VL] = getDefaultVLOps(VecVT, ContainerVecVT, DL, DAG, Subtarget);
 
-  ElementCount EndIndex =
-      ElementCount::getScalable(RemIdx) + SubVecVT.getVectorElementCount();
-  VL = computeVLMax(SubVecVT, DL, DAG);
+  ElementCount EndIndex = RemIdx + SubVecVT.getVectorElementCount();
+  VL = DAG.getElementCount(DL, XLenVT, SubVecVT.getVectorElementCount());
 
   // Use tail agnostic policy if we're inserting over InterSubVT's tail.
   unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
-  if (EndIndex == InterSubVT.getVectorElementCount())
+  if (isKnownEQ(EndIndex, InterSubVT.getVectorElementCount(), Subtarget))
     Policy = RISCVII::TAIL_AGNOSTIC;
 
   // If we're inserting into the lowest elements, use a tail undisturbed
   // vmv.v.v.
-  if (RemIdx == 0) {
+  if (RemIdx.isZero()) {
     SubVec = DAG.getNode(RISCVISD::VMV_V_V_VL, DL, InterSubVT, AlignedExtract,
                          SubVec, VL);
   } else {
-    SDValue SlideupAmt =
-        DAG.getVScale(DL, XLenVT, APInt(XLenVT.getSizeInBits(), RemIdx));
+    SDValue SlideupAmt = DAG.getElementCount(DL, XLenVT, RemIdx);
 
     // Construct the vector length corresponding to RemIdx + length(SubVecVT).
     VL = DAG.getNode(ISD::ADD, DL, XLenVT, SlideupAmt, VL);
@@ -9899,10 +9964,13 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
 
   // If required, insert this subvector back into the correct vector register.
   // This should resolve to an INSERT_SUBREG instruction.
-  if (VecVT.bitsGT(InterSubVT))
-    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, Vec, SubVec,
+  if (ContainerVecVT.bitsGT(InterSubVT))
+    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVecVT, Vec, SubVec,
                          DAG.getVectorIdxConstant(AlignedIdx, DL));
 
+  if (VecVT.isFixedLengthVector())
+    SubVec = convertFromScalableVector(VecVT, SubVec, DAG, Subtarget);
+
   // We might have bitcast from a mask type: cast back to the original type if
   // required.
   return DAG.getBitcast(Op.getSimpleValueType(), SubVec);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index ab6df1d3e883fd..53de1a87553553 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -9,39 +9,63 @@
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV64VLS %s
 
 define <vscale x 8 x i32> @insert_nxv8i32_v2i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
-; CHECK-LABEL: insert_nxv8i32_v2i32_0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m4, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_nxv8i32_v2i32_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v12, (a0)
+; VLA-NEXT:    vsetivli zero, 2, e32, m4, tu, ma
+; VLA-NEXT:    vmv.v.v v8, v12
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v2i32_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v12, (a0)
+; VLS-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
+; VLS-NEXT:    vmv.v.v v8, v12
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 0)
   ret <vscale x 8 x i32> %v
 }
 
 define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, ptr %svp) {
-; CHECK-LABEL: insert_nxv8i32_v2i32_2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 2
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_nxv8i32_v2i32_2:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v12, (a0)
+; VLA-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 2
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v2i32_2:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v12, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v8, v12, 2
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 2)
   ret <vscale x 8 x i32> %v
 }
 
 define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %svp) {
-; CHECK-LABEL: insert_nxv8i32_v2i32_6:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 6
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_nxv8i32_v2i32_6:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v12, (a0)
+; VLA-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 6
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v2i32_6:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v12, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v9, v12, 2
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 6)
   ret <vscale x 8 x i32> %v
@@ -58,9 +82,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, ptr %
 ;
 ; VLS-LABEL: insert_nxv8i32_v8i32_0:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vl2re32.v v12, (a0)
-; VLS-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; VLS-NEXT:    vmv.v.v v8, v12
+; VLS-NEXT:    vl2re32.v v8, (a0)
 ; VLS-NEXT:    ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 0)
@@ -78,9 +100,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v8i32_8(<vscale x 8 x i32> %vec, ptr %
 ;
 ; VLS-LABEL: insert_nxv8i32_v8i32_8:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vl2re32.v v12, (a0)
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v12, 8
+; VLS-NEXT:    vl2re32.v v10, (a0)
 ; VLS-NEXT:    ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 8)
@@ -98,6 +118,31 @@ define <vscale x 8 x i32> @insert_nxv8i32_undef_v2i32_0(ptr %svp) {
   ret <vscale x 8 x i32> %v
 }
 
+define <vscale x 2 x i32> @insert_nxv8i32_v4i32_0(<vscale x 2 x i32> %vec, <4 x i32> %subvec) {
+; VLA-LABEL: insert_nxv8i32_v4i32_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, tu, ma
+; VLA-NEXT:    vmv.v.v v8, v9
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v4i32_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv1r.v v8, v9
+; VLS-NEXT:    ret
+  %v = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v4i32(<vscale x 2 x i32> %vec, <4 x i32> %subvec, i64 0)
+  ret <vscale x 2 x i32> %v
+}
+
+
+define <4 x i32> @insert_v4i32_v4i32_0(<4 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: insert_v4i32_v4i32_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %v = call <4 x i32> @llvm.vector.insert.v4i32.v4i32(<4 x i32> %vec, <4 x i32> %subvec, i64 0)
+  ret <4 x i32> %v
+}
+
 define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) {
 ; VLA-LABEL: insert_v4i32_v2i32_0:
 ; VLA:       # %bb.0:
@@ -175,6 +220,31 @@ define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr %svp) {
   ret void
 }
 
+; This tests the code path in RISCVISelDAGToDAG::Select where we select an
+; insert_subvector with a fixed vector and fixed subvector type. The phi here is
+; used to prevent the fixed insert_subvector from being combined away into a
+; scalable insert_subvector.
+define <4 x i32> @insert_v4i32_undef_v2i32_0_phi(<2 x i32> %subvec, i1 %cond) {
+; CHECK-LABEL: insert_v4i32_undef_v2i32_0_phi:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    andi a0, a0, 1
+; CHECK-NEXT:    bnez a0, .LBB11_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:  .LBB11_2: # %bar
+; CHECK-NEXT:    ret
+entry:
+  br i1 %cond, label %foo, label %bar
+foo:
+  %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %subvec, i64 0)
+  br label %bar
+bar:
+  %w = phi <4 x i32> [%v, %foo], [zeroinitializer, %entry]
+  ret <4 x i32> %w
+}
+
+
 define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
 ; VLA-LABEL: insert_v8i32_v2i32_0:
 ; VLA:       # %bb.0:
@@ -193,7 +263,7 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; VLS-NEXT:    vle32.v v8, (a1)
 ; VLS-NEXT:    vl2re32.v v10, (a0)
-; VLS-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; VLS-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
 ; VLS-NEXT:    vmv.v.v v10, v8
 ; VLS-NEXT:    vs2r.v v10, (a0)
 ; VLS-NEXT:    ret
@@ -220,11 +290,11 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
 ; VLS-LABEL: insert_v8i32_v2i32_2:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; VLS-NEXT:    vl2re32.v v8, (a0)
-; VLS-NEXT:    vle32.v v10, (a1)
-; VLS-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v10, 2
-; VLS-NEXT:    vs2r.v v8, (a0)
+; VLS-NEXT:    vle32.v v8, (a1)
+; VLS-NEXT:    vl2re32.v v10, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v10, v8, 2
+; VLS-NEXT:    vs2r.v v10, (a0)
 ; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <8 x i32>, ptr %vp
@@ -247,11 +317,11 @@ define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) {
 ; VLS-LABEL: insert_v8i32_v2i32_6:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; VLS-NEXT:    vl2re32.v v8, (a0)
-; VLS-NEXT:    vle32.v v10, (a1)
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v10, 6
-; VLS-NEXT:    vs2r.v v8, (a0)
+; VLS-NEXT:    vle32.v v8, (a1)
+; VLS-NEXT:    vl2re32.v v10, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v11, v8, 2
+; VLS-NEXT:    vs2r.v v10, (a0)
 ; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <8 x i32>, ptr %vp
@@ -274,9 +344,9 @@ define void @insert_v8i32_undef_v2i32_6(ptr %vp, ptr %svp) {
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; VLS-NEXT:    vle32.v v8, (a1)
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v10, v8, 6
-; VLS-NEXT:    vs2r.v v10, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v9, v8, 2
+; VLS-NEXT:    vs2r.v v8, (a0)
 ; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> undef, <2 x i32> %sv, i64 6)
@@ -542,9 +612,7 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, ptr %out) {
 ; VLS-LABEL: insert_v2i64_nxv16i64:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vl1re64.v v8, (a0)
-; VLS-NEXT:    vl1re64.v v16, (a1)
-; VLS-NEXT:    vsetivli zero, 6, e64, m8, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 4
+; VLS-NEXT:    vl1re64.v v10, (a1)
 ; VLS-NEXT:    vs8r.v v8, (a2)
 ; VLS-NEXT:    ret
   %sv0 = load <2 x i64>, ptr %psv0
@@ -586,10 +654,8 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) {
 ;
 ; VLS-LABEL: insert_v2i64_nxv16i64_lo2:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vl1re64.v v8, (a0)
-; VLS-NEXT:    vsetivli zero, 4, e64, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v16, v8, 2
-; VLS-NEXT:    vs8r.v v16, (a1)
+; VLS-NEXT:    vl1re64.v v9, (a0)
+; VLS-NEXT:    vs8r.v v8, (a1)
 ; VLS-NEXT:    ret
   %sv = load <2 x i64>, ptr %psv
   %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 2)
@@ -633,7 +699,6 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 80
 ; RV32-NEXT:    ret
-;
 ; RV64-LABEL: insert_v2i64_nxv16i64_hi:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -80
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index 98e6b8f2dd760a..76d6f8931d2840 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -6,25 +6,39 @@
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
 
 define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: concat_2xv4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v9
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_2xv4i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vmv1r.v v10, v9
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_2xv4i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i32> %ab
 }
 
 define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
-; CHECK-LABEL: concat_4xv2i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 2
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_4xv2i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v10, v11, 2
+; VLA-NEXT:    vslideup.vi v8, v9, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_4xv2i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv1r.v v13, v10
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vmv1r.v v12, v8
+; VLS-NEXT:    vslideup.vi v13, v11, 2
+; VLS-NEXT:    vslideup.vi v12, v9, 2
+; VLS-NEXT:    vmv2r.v v8, v12
+; VLS-NEXT:    ret
   %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %abcd = shufflevector <4 x i32> %ab, <4 x i32> %cd, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -32,21 +46,38 @@ define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
 }
 
 define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x i32> %d, <1 x i32> %e, <1 x i32> %f, <1 x i32> %g, <1 x i32> %h) {
-; CHECK-LABEL: concat_8xv1i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v14, v15, 1
-; CHECK-NEXT:    vslideup.vi v12, v13, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 1
-; CHECK-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 4
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_8xv1i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vslideup.vi v14, v15, 1
+; VLA-NEXT:    vslideup.vi v12, v13, 1
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v12, v14, 2
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vslideup.vi v10, v11, 1
+; VLA-NEXT:    vslideup.vi v8, v9, 1
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v10, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 4
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_8xv1i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv1r.v v17, v12
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vslideup.vi v14, v15, 1
+; VLS-NEXT:    vmv1r.v v16, v8
+; VLS-NEXT:    vslideup.vi v17, v13, 1
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v17, v14, 2
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vslideup.vi v10, v11, 1
+; VLS-NEXT:    vslideup.vi v16, v9, 1
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v16, v10, 2
+; VLS-NEXT:    vmv2r.v v8, v16
+; VLS-NEXT:    ret
   %ab = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
   %cd = shufflevector <1 x i32> %c, <1 x i32> %d, <2 x i32> <i32 0, i32 1>
   %abcd = shufflevector <2 x i32> %ab, <2 x i32> %cd, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -58,28 +89,36 @@ define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x
 }
 
 define <16 x i32> @concat_2xv8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: concat_2xv8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv2r.v v12, v10
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_2xv8i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vmv2r.v v12, v10
+; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_2xv8i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    ret
   %v = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i32> %v
 }
 
 define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
-; CHECK-LABEL: concat_4xv4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v14, v11
-; CHECK-NEXT:    vmv1r.v v12, v10
-; CHECK-NEXT:    vmv1r.v v10, v9
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 4
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_4xv4i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vmv1r.v v14, v11
+; VLA-NEXT:    vmv1r.v v12, v10
+; VLA-NEXT:    vmv1r.v v10, v9
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v12, v14, 4
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_4xv4i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %abcd = shufflevector <8 x i32> %ab, <8 x i32> %cd, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -87,21 +126,35 @@ define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 }
 
 define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d, <2 x i32> %e, <2 x i32> %f, <2 x i32> %g, <2 x i32> %h) {
-; CHECK-LABEL: concat_8xv2i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v14, v15, 2
-; CHECK-NEXT:    vslideup.vi v12, v13, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 2
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_8xv2i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v14, v15, 2
+; VLA-NEXT:    vslideup.vi v12, v13, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v12, v14, 4
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v10, v11, 2
+; VLA-NEXT:    vslideup.vi v8, v9, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_8xv2i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv1r.v v19, v14
+; VLS-NEXT:    vmv1r.v v18, v12
+; VLS-NEXT:    vmv1r.v v17, v10
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vmv1r.v v16, v8
+; VLS-NEXT:    vslideup.vi v19, v15, 2
+; VLS-NEXT:    vslideup.vi v18, v13, 2
+; VLS-NEXT:    vslideup.vi v17, v11, 2
+; VLS-NEXT:    vslideup.vi v16, v9, 2
+; VLS-NEXT:    vmv4r.v v8, v16
+; VLS-NEXT:    ret
   %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %abcd = shufflevector <4 x i32> %ab, <4 x i32> %cd, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -123,9 +176,6 @@ define <32 x i32> @concat_2xv16i32(<16 x i32> %a, <16 x i32> %b) {
 ;
 ; VLS-LABEL: concat_2xv16i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv4r.v v16, v12
-; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 16
 ; VLS-NEXT:    ret
   %ab = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   ret <32 x i32> %ab
@@ -147,14 +197,6 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x
 ;
 ; VLS-LABEL: concat_4xv8i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv2r.v v20, v14
-; VLS-NEXT:    vmv2r.v v16, v12
-; VLS-NEXT:    vmv2r.v v12, v10
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v16, v20, 8
-; VLS-NEXT:    vslideup.vi v8, v12, 8
-; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 16
 ; VLS-NEXT:    ret
   %ab = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %cd = shufflevector <8 x i32> %c, <8 x i32> %d, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -189,25 +231,6 @@ define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 ;
 ; VLS-LABEL: concat_8xv4i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv1r.v v18, v15
-; VLS-NEXT:    vmv1r.v v20, v14
-; VLS-NEXT:    vmv1r.v v22, v13
-; VLS-NEXT:    vmv1r.v v16, v12
-; VLS-NEXT:    vmv1r.v v14, v11
-; VLS-NEXT:    vmv1r.v v12, v10
-; VLS-NEXT:    vmv1r.v v10, v9
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v20, v18, 4
-; VLS-NEXT:    vslideup.vi v16, v22, 4
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v16, v20, 8
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v12, v14, 4
-; VLS-NEXT:    vslideup.vi v8, v10, 4
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v12, 8
-; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 16
 ; VLS-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

>From 1a1a661a3bc6bec69436fe928b4f91589fcdbf8a Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 1 Apr 2024 14:02:36 +0800
Subject: [PATCH 2/5] Address review comments:

* Rework the subvec size check by introducing an isKnownMultipleOf helper function on TypeSize/ElementCount. We can then reason that the subvector is exactly vector register sized if it is a multiple of the vector register size (worked example below).
* Rename variables to clarify we are checking the subvec size
* Update wording of comment
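
To make the reworked check concrete, here is the new check annotated with one
set of numbers (assuming an exactly known VLEN of 128, as the VLS RUN lines
pin it):

  TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
  // VLEN = 128 => vscale = 128 / 64 = 2, so expandVScale(VecRegSize) becomes a
  // fixed 128 bits. For SubVecVT = <8 x i32> the left-hand side is a fixed 256
  // bits, which is a known multiple of 128: the subvector fills exactly two
  // vector registers and qualifies for a pure subregister insertion.
  bool ExactlyVecRegSized =
      Subtarget.expandVScale(SubVecVT.getSizeInBits())
          .isKnownMultipleOf(Subtarget.expandVScale(VecRegSize));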
---
 llvm/include/llvm/Support/TypeSize.h        | 12 +++++
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 14 +++---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 50 +++++++--------------
 llvm/lib/Target/RISCV/RISCVSubtarget.h      | 11 +++++
 4 files changed, 46 insertions(+), 41 deletions(-)

diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h
index 68dbe1ea3062ab..c6779e258be7cb 100644
--- a/llvm/include/llvm/Support/TypeSize.h
+++ b/llvm/include/llvm/Support/TypeSize.h
@@ -181,6 +181,18 @@ template <typename LeafTy, typename ValueTy> class FixedOrScalableQuantity {
     return getKnownMinValue() % RHS == 0;
   }
 
+  /// Returns whether or not the callee is known to be a multiple of RHS.
+  constexpr bool isKnownMultipleOf(const FixedOrScalableQuantity &RHS) const {
+    // x % y == 0 => x % y == 0
+    // x % y == 0 => (vscale * x) % y == 0
+    // x % y == 0 => (vscale * x) % (vscale * y) == 0
+    // but
+    // x % y == 0 !=> x % (vscale * y) == 0
+    if (!isScalable() && RHS.isScalable())
+      return false;
+    return getKnownMinValue() % RHS.getKnownMinValue() == 0;
+  }
+
   // Return the minimum value with the assumption that the count is exact.
   // Use in places where a scalable count doesn't make sense (e.g. non-vector
   // types, or vectors in backends which don't support scalable vectors).
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index cc8d4bb0c7dab3..278476c5330386 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2100,13 +2100,13 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     // Establish the correct scalable-vector types for any fixed-length type.
     if (SubVecVT.isFixedLengthVector()) {
       SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT);
-      bool AlignedToVecReg = false;
-      if (auto VLen = Subtarget->getRealVLen();
-          VLen && SubVecVT.getSizeInBits() ==
-                      SubVecContainerVT.getSizeInBits().getKnownMinValue() *
-                          (*VLen / RISCV::RVVBitsPerBlock))
-        AlignedToVecReg = true;
-      assert(Idx == 0 && (AlignedToVecReg || V.isUndef()));
+#ifndef NDEBUG
+      TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
+      bool ExactlyVecRegSized =
+          Subtarget->expandVScale(SubVecVT.getSizeInBits())
+              .isKnownMultipleOf(Subtarget->expandVScale(VecRegSize));
+      assert(Idx == 0 && (ExactlyVecRegSized || V.isUndef()));
+#endif
     }
     MVT ContainerVT = VT;
     if (VT.isFixedLengthVector())
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 9bb984632124ea..122c4752670488 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9723,21 +9723,6 @@ SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op,
                            Vec, Mask, VL, DL, DAG, Subtarget);
 }
 
-/// Returns true if \p LHS is known to be equal to \p RHS, taking into account
-/// if VLEN is exactly known by \p Subtarget and thus vscale when handling
-/// scalable quantities.
-static bool isKnownEQ(ElementCount LHS, ElementCount RHS,
-                      const RISCVSubtarget &Subtarget) {
-  if (auto VLen = Subtarget.getRealVLen()) {
-    const unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock;
-    if (LHS.isScalable())
-      LHS = ElementCount::getFixed(LHS.getKnownMinValue() * Vscale);
-    if (RHS.isScalable())
-      RHS = ElementCount::getFixed(RHS.getKnownMinValue() * Vscale);
-  }
-  return LHS == RHS;
-}
-
 SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                    SelectionDAG &DAG) const {
   SDValue Vec = Op.getOperand(0);
@@ -9875,29 +9860,25 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     RemIdx = ElementCount::getScalable(Decompose.second);
   }
 
-  RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(ContainerSubVecVT);
-  bool IsSubVecPartReg = SubVecLMUL == RISCVII::VLMUL::LMUL_F2 ||
-                         SubVecLMUL == RISCVII::VLMUL::LMUL_F4 ||
-                         SubVecLMUL == RISCVII::VLMUL::LMUL_F8;
-  bool AlignedToVecReg = !IsSubVecPartReg;
-  if (SubVecVT.isFixedLengthVector())
-    AlignedToVecReg &= SubVecVT.getSizeInBits() ==
-                       ContainerSubVecVT.getSizeInBits().getKnownMinValue() *
-                           (*VLen / RISCV::RVVBitsPerBlock);
+  TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
+  bool ExactlyVecRegSized =
+      Subtarget.expandVScale(SubVecVT.getSizeInBits())
+          .isKnownMultipleOf(Subtarget.expandVScale(VecRegSize));
 
   // 1. If the Idx has been completely eliminated and this subvector's size is
   // a vector register or a multiple thereof, or the surrounding elements are
   // undef, then this is a subvector insert which naturally aligns to a vector
   // register. These can easily be handled using subregister manipulation.
-  // 2. If the subvector isn't exactly aligned to a vector register group, then
-  // the insertion must preserve the undisturbed elements of the register. We do
-  // this by lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector
-  // type (which resolves to a subregister copy), performing a VSLIDEUP to place
-  // the subvector within the vector register, and an INSERT_SUBVECTOR of that
-  // LMUL=1 type back into the larger vector (resolving to another subregister
-  // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type
-  // to avoid allocating a large register group to hold our subvector.
-  if (RemIdx.isZero() && (AlignedToVecReg || Vec.isUndef())) {
+  // 2. If the subvector isn't an exact multiple of a valid register group size,
+  // then the insertion must preserve the undisturbed elements of the register.
+  // We do this by lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1
+  // vector type (which resolves to a subregister copy), performing a VSLIDEUP
+  // to place the subvector within the vector register, and an INSERT_SUBVECTOR
+  // of that LMUL=1 type back into the larger vector (resolving to another
+  // subregister operation). See below for how our VSLIDEUP works. We go via a
+  // LMUL=1 type to avoid allocating a large register group to hold our
+  // subvector.
+  if (RemIdx.isZero() && (ExactlyVecRegSized || Vec.isUndef())) {
     if (SubVecVT.isFixedLengthVector()) {
       // We may get NoSubRegister if inserting at index 0 and the subvec
       // container is the same as the vector, e.g. vec=v4i32,subvec=v4i32,idx=0
@@ -9944,7 +9925,8 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
 
   // Use tail agnostic policy if we're inserting over InterSubVT's tail.
   unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
-  if (isKnownEQ(EndIndex, InterSubVT.getVectorElementCount(), Subtarget))
+  if (Subtarget.expandVScale(EndIndex) ==
+      Subtarget.expandVScale(InterSubVT.getVectorElementCount()))
     Policy = RISCVII::TAIL_AGNOSTIC;
 
   // If we're inserting into the lowest elements, use a tail undisturbed
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 85f8f5f654fe7c..c880c9e921e0ea 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -200,6 +200,17 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
     return Min;
   }
 
+  /// If the ElementCount or TypeSize \p X is scalable and VScale (VLEN) is
+  /// exactly known, returns \p X converted to a fixed quantity. Otherwise
+  /// returns \p X unmodified.
+  template <typename Quantity> Quantity expandVScale(Quantity X) const {
+    if (auto VLen = getRealVLen(); VLen && X.isScalable()) {
+      const unsigned VScale = *VLen / RISCV::RVVBitsPerBlock;
+      X = Quantity::getFixed(X.getKnownMinValue() * VScale);
+    }
+    return X;
+  }
+
   RISCVABI::ABI getTargetABI() const { return TargetABI; }
   bool isSoftFPABI() const {
     return TargetABI == RISCVABI::ABI_LP64 ||

>From 3cee43edb7d311c678becd37f5568f5c295dc012 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 9 Apr 2024 07:13:57 +0800
Subject: [PATCH 3/5] Add asserts that the subvector size is a power of 2

This invariant is needed if we want to check that the subvector exactly fills a whole number of vector registers by checking whether its size is a multiple of VLEN.
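
For example, with VLEN = 128 a hypothetical 384-bit subvector would pass a
plain "multiple of VLEN" check (three registers' worth of bits) even though
three registers is not a legal LMUL grouping (LMUL is 1, 2, 4 or 8). With the
size restricted to a power of 2, size % VLEN == 0 implies size / VLEN is also
a power of 2, i.e. a whole, legally sized register group, which is what the
ExactlyVecRegSized check is meant to establish.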
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 2 ++
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 278476c5330386..42d523ae4788c3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2105,6 +2105,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
       bool ExactlyVecRegSized =
           Subtarget->expandVScale(SubVecVT.getSizeInBits())
               .isKnownMultipleOf(Subtarget->expandVScale(VecRegSize));
+      assert(isPowerOf2_64(Subtarget->expandVScale(SubVecVT.getSizeInBits())
+                               .getKnownMinValue()));
       assert(Idx == 0 && (ExactlyVecRegSized || V.isUndef()));
 #endif
     }
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 122c4752670488..bcda9ec781fc0a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9861,6 +9861,8 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
   }
 
   TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
+  assert(isPowerOf2_64(
+      Subtarget.expandVScale(SubVecVT.getSizeInBits()).getKnownMinValue()));
   bool ExactlyVecRegSized =
       Subtarget.expandVScale(SubVecVT.getSizeInBits())
           .isKnownMultipleOf(Subtarget.expandVScale(VecRegSize));

>From 8d1f95f69213884471429ad912d28eea6ccc2cc5 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 25 Apr 2024 15:56:31 +0800
Subject: [PATCH 4/5] Add [[maybe_unused]], remove unused check prefix

An unused check prefix apparently causes an error? Not sure if it was
always like that.
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp               | 2 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll     | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 42d523ae4788c3..f41eb4d4ffb276 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2102,7 +2102,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
       SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT);
 #ifndef NDEBUG
       TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
-      bool ExactlyVecRegSized =
+      [[maybe_unused]] bool ExactlyVecRegSized =
           Subtarget->expandVScale(SubVecVT.getSizeInBits())
               .isKnownMultipleOf(Subtarget->expandVScale(VecRegSize));
       assert(isPowerOf2_64(Subtarget->expandVScale(SubVecVT.getSizeInBits())
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index 76d6f8931d2840..609b4e98248926 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck -check-prefix=VLA %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck -check-prefix=VLA %s
 
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefix=VLS %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefix=VLS %s
 
 define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) {
 ; VLA-LABEL: concat_2xv4i32:

>From 29ed9cd819ee3fdc88edae07fdb46fa8c70e1a7b Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 30 Apr 2024 20:30:03 +0800
Subject: [PATCH 5/5] Remove NDEBUG gating around assertions

---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index f41eb4d4ffb276..dc3ad5ac5908c2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2100,7 +2100,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     // Establish the correct scalable-vector types for any fixed-length type.
     if (SubVecVT.isFixedLengthVector()) {
       SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT);
-#ifndef NDEBUG
       TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
       [[maybe_unused]] bool ExactlyVecRegSized =
           Subtarget->expandVScale(SubVecVT.getSizeInBits())
@@ -2108,7 +2107,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
       assert(isPowerOf2_64(Subtarget->expandVScale(SubVecVT.getSizeInBits())
                                .getKnownMinValue()));
       assert(Idx == 0 && (ExactlyVecRegSized || V.isUndef()));
-#endif
     }
     MVT ContainerVT = VT;
     if (VT.isFixedLengthVector())


