[llvm] [RISCV] Handle fixed length vectors with exact VLEN in lowerINSERT_SUBVECTOR (PR #84107)

Thu Apr 25 00:57:29 PDT 2024

https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/84107

>From 62febf32d7942a9facb8726cdf5be0e2ce319bc3 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 26 Feb 2024 14:42:00 +0800
Subject: [PATCH 1/4] [RISCV] Handle fixed length vectors with exact VLEN in
 lowerINSERT_SUBVECTOR

This is the insert_subvector equivalent to #79949, where we can avoid sliding up
by the full LMUL amount if we know the exact subregister the subvector will be
inserted into.

This mirrors the lowerEXTRACT_SUBVECTOR changes in that we handle this in two
parts:

- We handle fixed length subvector types by converting the subvector to a
  scalable vector. But unlike EXTRACT_SUBVECTOR, we may also need to convert the
  vector being inserted into too.

- Whenever we don't need a vslideup because either the subvector aligns to a
  vector register group *or* the vector is undef, we need to emit an
  insert_subreg ourselves because RISCVISelDAGToDAG::Select doesn't correctly
  handle fixed length subvectors yet: see d7a28f7ad

I've left RISCVISelDAGToDAG::Select untouched for now (minus relaxing an
invariant), so that the insert_subvector and extract_subvector code paths are
the same.

We should teach it to properly handle fixed length subvectors in a follow-up
patch, so that the "exact subregsiter" logic is handled in one place instead of
being spread across both RISCVISelDAGToDAG.cpp and RISCVISelLowering.cpp.
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp   |   8 +-
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 130 ++++++++---
 .../rvv/fixed-vectors-insert-subvector.ll     | 163 +++++++++-----
 .../RISCV/rvv/fixed-vectors-shuffle-concat.ll | 205 ++++++++++--------
 4 files changed, 334 insertions(+), 172 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index b0568297a470a7..26d9407d7312fb 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2099,8 +2099,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     MVT SubVecContainerVT = SubVecVT;
     // Establish the correct scalable-vector types for any fixed-length type.
     if (SubVecVT.isFixedLengthVector()) {
-      assert(Idx == 0 && V.isUndef());
       SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT);
+      bool AlignedToVecReg = false;
+      if (auto VLen = Subtarget->getRealVLen();
+          VLen && SubVecVT.getSizeInBits() ==
+                      SubVecContainerVT.getSizeInBits().getKnownMinValue() *
+                          (*VLen / RISCV::RVVBitsPerBlock))
+        AlignedToVecReg = true;
+      assert(Idx == 0 && (AlignedToVecReg || V.isUndef()));
     }
     MVT ContainerVT = VT;
     if (VT.isFixedLengthVector())
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 6529ab7a84a133..d4f36747536b6f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9720,6 +9720,21 @@ SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op,
                            Vec, Mask, VL, DL, DAG, Subtarget);
 }
 
+/// Returns true if \p LHS is known to be equal to \p RHS, taking into account
+/// if VLEN is exactly known by \p Subtarget and thus vscale when handling
+/// scalable quantities.
+static bool isKnownEQ(ElementCount LHS, ElementCount RHS,
+                      const RISCVSubtarget &Subtarget) {
+  if (auto VLen = Subtarget.getRealVLen()) {
+    const unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock;
+    if (LHS.isScalable())
+      LHS = ElementCount::getFixed(LHS.getKnownMinValue() * Vscale);
+    if (RHS.isScalable())
+      RHS = ElementCount::getFixed(RHS.getKnownMinValue() * Vscale);
+  }
+  return LHS == RHS;
+}
+
 SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                    SelectionDAG &DAG) const {
   SDValue Vec = Op.getOperand(0);
@@ -9769,12 +9784,13 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     }
   }
 
-  // If the subvector vector is a fixed-length type, we cannot use subregister
-  // manipulation to simplify the codegen; we don't know which register of a
-  // LMUL group contains the specific subvector as we only know the minimum
-  // register size. Therefore we must slide the vector group up the full
-  // amount.
-  if (SubVecVT.isFixedLengthVector()) {
+  // If the subvector vector is a fixed-length type and we don't know VLEN
+  // exactly, we cannot use subregister manipulation to simplify the codegen; we
+  // don't know which register of a LMUL group contains the specific subvector
+  // as we only know the minimum register size. Therefore we must slide the
+  // vector group up the full amount.
+  const auto VLen = Subtarget.getRealVLen();
+  if (SubVecVT.isFixedLengthVector() && !VLen) {
     if (OrigIdx == 0 && Vec.isUndef() && !VecVT.isFixedLengthVector())
       return Op;
     MVT ContainerVT = VecVT;
@@ -9822,41 +9838,92 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     return DAG.getBitcast(Op.getValueType(), SubVec);
   }
 
-  unsigned SubRegIdx, RemIdx;
-  std::tie(SubRegIdx, RemIdx) =
-      RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
-          VecVT, SubVecVT, OrigIdx, TRI);
+  MVT ContainerVecVT = VecVT;
+  if (VecVT.isFixedLengthVector()) {
+    ContainerVecVT = getContainerForFixedLengthVector(VecVT);
+    Vec = convertToScalableVector(ContainerVecVT, Vec, DAG, Subtarget);
+  }
 
-  RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
+  MVT ContainerSubVecVT = SubVecVT;
+  if (SubVecVT.isFixedLengthVector()) {
+    ContainerSubVecVT = getContainerForFixedLengthVector(SubVecVT);
+    SubVec = convertToScalableVector(ContainerSubVecVT, SubVec, DAG, Subtarget);
+  }
+
+  unsigned SubRegIdx;
+  ElementCount RemIdx;
+  // insert_subvector scales the index by vscale if the subvector is scalable,
+  // and decomposeSubvectorInsertExtractToSubRegs takes this into account. So if
+  // we have a fixed length subvector, we need to adjust the index by 1/vscale.
+  if (SubVecVT.isFixedLengthVector()) {
+    assert(VLen);
+    unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock;
+    auto Decompose =
+        RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+            ContainerVecVT, ContainerSubVecVT, OrigIdx / Vscale, TRI);
+    SubRegIdx = Decompose.first;
+    RemIdx = ElementCount::getFixed((Decompose.second * Vscale) +
+                                    (OrigIdx % Vscale));
+  } else {
+    auto Decompose =
+        RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+            ContainerVecVT, ContainerSubVecVT, OrigIdx, TRI);
+    SubRegIdx = Decompose.first;
+    RemIdx = ElementCount::getScalable(Decompose.second);
+  }
+
+  RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(ContainerSubVecVT);
   bool IsSubVecPartReg = SubVecLMUL == RISCVII::VLMUL::LMUL_F2 ||
                          SubVecLMUL == RISCVII::VLMUL::LMUL_F4 ||
                          SubVecLMUL == RISCVII::VLMUL::LMUL_F8;
+  bool AlignedToVecReg = !IsSubVecPartReg;
+  if (SubVecVT.isFixedLengthVector())
+    AlignedToVecReg &= SubVecVT.getSizeInBits() ==
+                       ContainerSubVecVT.getSizeInBits().getKnownMinValue() *
+                           (*VLen / RISCV::RVVBitsPerBlock);
 
   // 1. If the Idx has been completely eliminated and this subvector's size is
   // a vector register or a multiple thereof, or the surrounding elements are
   // undef, then this is a subvector insert which naturally aligns to a vector
   // register. These can easily be handled using subregister manipulation.
-  // 2. If the subvector is smaller than a vector register, then the insertion
-  // must preserve the undisturbed elements of the register. We do this by
-  // lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type
-  // (which resolves to a subregister copy), performing a VSLIDEUP to place the
-  // subvector within the vector register, and an INSERT_SUBVECTOR of that
+  // 2. If the subvector isn't exactly aligned to a vector register group, then
+  // the insertion must preserve the undisturbed elements of the register. We do
+  // this by lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector
+  // type (which resolves to a subregister copy), performing a VSLIDEUP to place
+  // the subvector within the vector register, and an INSERT_SUBVECTOR of that
   // LMUL=1 type back into the larger vector (resolving to another subregister
   // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type
   // to avoid allocating a large register group to hold our subvector.
-  if (RemIdx == 0 && (!IsSubVecPartReg || Vec.isUndef()))
+  if (RemIdx.isZero() && (AlignedToVecReg || Vec.isUndef())) {
+    if (SubVecVT.isFixedLengthVector()) {
+      // We may get NoSubRegister if inserting at index 0 and the subvec
+      // container is the same as the vector, e.g. vec=v4i32,subvec=v4i32,idx=0
+      if (SubRegIdx == RISCV::NoSubRegister) {
+        assert(OrigIdx == 0);
+        return Op;
+      }
+
+      SDValue Insert =
+          DAG.getTargetInsertSubreg(SubRegIdx, DL, ContainerVecVT, Vec, SubVec);
+      if (VecVT.isFixedLengthVector())
+        Insert = convertFromScalableVector(VecVT, Insert, DAG, Subtarget);
+      return Insert;
+    }
     return Op;
+  }
 
   // VSLIDEUP works by leaving elements 0<i<OFFSET undisturbed, elements
   // OFFSET<=i<VL set to the "subvector" and vl<=i<VLMAX set to the tail policy
   // (in our case undisturbed). This means we can set up a subvector insertion
   // where OFFSET is the insertion offset, and the VL is the OFFSET plus the
   // size of the subvector.
-  MVT InterSubVT = VecVT;
+  MVT InterSubVT = ContainerVecVT;
   SDValue AlignedExtract = Vec;
-  unsigned AlignedIdx = OrigIdx - RemIdx;
-  if (VecVT.bitsGT(getLMUL1VT(VecVT))) {
-    InterSubVT = getLMUL1VT(VecVT);
+  unsigned AlignedIdx = OrigIdx - RemIdx.getKnownMinValue();
+  if (SubVecVT.isFixedLengthVector())
+    AlignedIdx /= *VLen / RISCV::RVVBitsPerBlock;
+  if (ContainerVecVT.bitsGT(getLMUL1VT(ContainerVecVT))) {
+    InterSubVT = getLMUL1VT(ContainerVecVT);
     // Extract a subvector equal to the nearest full vector register type. This
     // should resolve to a EXTRACT_SUBREG instruction.
     AlignedExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InterSubVT, Vec,
@@ -9867,25 +9934,23 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                        DAG.getUNDEF(InterSubVT), SubVec,
                        DAG.getVectorIdxConstant(0, DL));
 
-  auto [Mask, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget);
+  auto [Mask, VL] = getDefaultVLOps(VecVT, ContainerVecVT, DL, DAG, Subtarget);
 
-  ElementCount EndIndex =
-      ElementCount::getScalable(RemIdx) + SubVecVT.getVectorElementCount();
-  VL = computeVLMax(SubVecVT, DL, DAG);
+  ElementCount EndIndex = RemIdx + SubVecVT.getVectorElementCount();
+  VL = DAG.getElementCount(DL, XLenVT, SubVecVT.getVectorElementCount());
 
   // Use tail agnostic policy if we're inserting over InterSubVT's tail.
   unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
-  if (EndIndex == InterSubVT.getVectorElementCount())
+  if (isKnownEQ(EndIndex, InterSubVT.getVectorElementCount(), Subtarget))
     Policy = RISCVII::TAIL_AGNOSTIC;
 
   // If we're inserting into the lowest elements, use a tail undisturbed
   // vmv.v.v.
-  if (RemIdx == 0) {
+  if (RemIdx.isZero()) {
     SubVec = DAG.getNode(RISCVISD::VMV_V_V_VL, DL, InterSubVT, AlignedExtract,
                          SubVec, VL);
   } else {
-    SDValue SlideupAmt =
-        DAG.getVScale(DL, XLenVT, APInt(XLenVT.getSizeInBits(), RemIdx));
+    SDValue SlideupAmt = DAG.getElementCount(DL, XLenVT, RemIdx);
 
     // Construct the vector length corresponding to RemIdx + length(SubVecVT).
     VL = DAG.getNode(ISD::ADD, DL, XLenVT, SlideupAmt, VL);
@@ -9896,10 +9961,13 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
 
   // If required, insert this subvector back into the correct vector register.
   // This should resolve to an INSERT_SUBREG instruction.
-  if (VecVT.bitsGT(InterSubVT))
-    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, Vec, SubVec,
+  if (ContainerVecVT.bitsGT(InterSubVT))
+    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVecVT, Vec, SubVec,
                          DAG.getVectorIdxConstant(AlignedIdx, DL));
 
+  if (VecVT.isFixedLengthVector())
+    SubVec = convertFromScalableVector(VecVT, SubVec, DAG, Subtarget);
+
   // We might have bitcast from a mask type: cast back to the original type if
   // required.
   return DAG.getBitcast(Op.getSimpleValueType(), SubVec);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index ab6df1d3e883fd..53de1a87553553 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -9,39 +9,63 @@
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV64VLS %s
 
 define <vscale x 8 x i32> @insert_nxv8i32_v2i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
-; CHECK-LABEL: insert_nxv8i32_v2i32_0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m4, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_nxv8i32_v2i32_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v12, (a0)
+; VLA-NEXT:    vsetivli zero, 2, e32, m4, tu, ma
+; VLA-NEXT:    vmv.v.v v8, v12
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v2i32_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v12, (a0)
+; VLS-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
+; VLS-NEXT:    vmv.v.v v8, v12
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 0)
   ret <vscale x 8 x i32> %v
 }
 
 define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, ptr %svp) {
-; CHECK-LABEL: insert_nxv8i32_v2i32_2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 2
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_nxv8i32_v2i32_2:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v12, (a0)
+; VLA-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 2
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v2i32_2:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v12, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v8, v12, 2
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 2)
   ret <vscale x 8 x i32> %v
 }
 
 define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %svp) {
-; CHECK-LABEL: insert_nxv8i32_v2i32_6:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 6
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_nxv8i32_v2i32_6:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v12, (a0)
+; VLA-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 6
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v2i32_6:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v12, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v9, v12, 2
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 6)
   ret <vscale x 8 x i32> %v
@@ -58,9 +82,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, ptr %
 ;
 ; VLS-LABEL: insert_nxv8i32_v8i32_0:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vl2re32.v v12, (a0)
-; VLS-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; VLS-NEXT:    vmv.v.v v8, v12
+; VLS-NEXT:    vl2re32.v v8, (a0)
 ; VLS-NEXT:    ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 0)
@@ -78,9 +100,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v8i32_8(<vscale x 8 x i32> %vec, ptr %
 ;
 ; VLS-LABEL: insert_nxv8i32_v8i32_8:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vl2re32.v v12, (a0)
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v12, 8
+; VLS-NEXT:    vl2re32.v v10, (a0)
 ; VLS-NEXT:    ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 8)
@@ -98,6 +118,31 @@ define <vscale x 8 x i32> @insert_nxv8i32_undef_v2i32_0(ptr %svp) {
   ret <vscale x 8 x i32> %v
 }
 
+define <vscale x 2 x i32> @insert_nxv8i32_v4i32_0(<vscale x 2 x i32> %vec, <4 x i32> %subvec) {
+; VLA-LABEL: insert_nxv8i32_v4i32_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, tu, ma
+; VLA-NEXT:    vmv.v.v v8, v9
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v4i32_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv1r.v v8, v9
+; VLS-NEXT:    ret
+  %v = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v4i32(<vscale x 2 x i32> %vec, <4 x i32> %subvec, i64 0)
+  ret <vscale x 2 x i32> %v
+}
+
+
+define <4 x i32> @insert_v4i32_v4i32_0(<4 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: insert_v4i32_v4i32_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %v = call <4 x i32> @llvm.vector.insert.v4i32.v4i32(<4 x i32> %vec, <4 x i32> %subvec, i64 0)
+  ret <4 x i32> %v
+}
+
 define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) {
 ; VLA-LABEL: insert_v4i32_v2i32_0:
 ; VLA:       # %bb.0:
@@ -175,6 +220,31 @@ define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr %svp) {
   ret void
 }
 
+; This tests the code path in RISCVISelDAGToDAG::Select where we select an
+; insert_subvector with a fixed vector and fixed subvector type. The phi here is
+; used to prevent the fixed insert_subvector from being combined away into a
+; scalable insert_subvector.
+define <4 x i32> @insert_v4i32_undef_v2i32_0_phi(<2 x i32> %subvec, i1 %cond) {
+; CHECK-LABEL: insert_v4i32_undef_v2i32_0_phi:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    andi a0, a0, 1
+; CHECK-NEXT:    bnez a0, .LBB11_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:  .LBB11_2: # %bar
+; CHECK-NEXT:    ret
+entry:
+  br i1 %cond, label %foo, label %bar
+foo:
+  %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %subvec, i64 0)
+  br label %bar
+bar:
+  %w = phi <4 x i32> [%v, %foo], [zeroinitializer, %entry]
+  ret <4 x i32> %w
+}
+
+
 define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
 ; VLA-LABEL: insert_v8i32_v2i32_0:
 ; VLA:       # %bb.0:
@@ -193,7 +263,7 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; VLS-NEXT:    vle32.v v8, (a1)
 ; VLS-NEXT:    vl2re32.v v10, (a0)
-; VLS-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; VLS-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
 ; VLS-NEXT:    vmv.v.v v10, v8
 ; VLS-NEXT:    vs2r.v v10, (a0)
 ; VLS-NEXT:    ret
@@ -220,11 +290,11 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
 ; VLS-LABEL: insert_v8i32_v2i32_2:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; VLS-NEXT:    vl2re32.v v8, (a0)
-; VLS-NEXT:    vle32.v v10, (a1)
-; VLS-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v10, 2
-; VLS-NEXT:    vs2r.v v8, (a0)
+; VLS-NEXT:    vle32.v v8, (a1)
+; VLS-NEXT:    vl2re32.v v10, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v10, v8, 2
+; VLS-NEXT:    vs2r.v v10, (a0)
 ; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <8 x i32>, ptr %vp
@@ -247,11 +317,11 @@ define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) {
 ; VLS-LABEL: insert_v8i32_v2i32_6:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; VLS-NEXT:    vl2re32.v v8, (a0)
-; VLS-NEXT:    vle32.v v10, (a1)
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v10, 6
-; VLS-NEXT:    vs2r.v v8, (a0)
+; VLS-NEXT:    vle32.v v8, (a1)
+; VLS-NEXT:    vl2re32.v v10, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v11, v8, 2
+; VLS-NEXT:    vs2r.v v10, (a0)
 ; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <8 x i32>, ptr %vp
@@ -274,9 +344,9 @@ define void @insert_v8i32_undef_v2i32_6(ptr %vp, ptr %svp) {
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; VLS-NEXT:    vle32.v v8, (a1)
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v10, v8, 6
-; VLS-NEXT:    vs2r.v v10, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v9, v8, 2
+; VLS-NEXT:    vs2r.v v8, (a0)
 ; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> undef, <2 x i32> %sv, i64 6)
@@ -542,9 +612,7 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, ptr %out) {
 ; VLS-LABEL: insert_v2i64_nxv16i64:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vl1re64.v v8, (a0)
-; VLS-NEXT:    vl1re64.v v16, (a1)
-; VLS-NEXT:    vsetivli zero, 6, e64, m8, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 4
+; VLS-NEXT:    vl1re64.v v10, (a1)
 ; VLS-NEXT:    vs8r.v v8, (a2)
 ; VLS-NEXT:    ret
   %sv0 = load <2 x i64>, ptr %psv0
@@ -586,10 +654,8 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) {
 ;
 ; VLS-LABEL: insert_v2i64_nxv16i64_lo2:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vl1re64.v v8, (a0)
-; VLS-NEXT:    vsetivli zero, 4, e64, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v16, v8, 2
-; VLS-NEXT:    vs8r.v v16, (a1)
+; VLS-NEXT:    vl1re64.v v9, (a0)
+; VLS-NEXT:    vs8r.v v8, (a1)
 ; VLS-NEXT:    ret
   %sv = load <2 x i64>, ptr %psv
   %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 2)
@@ -633,7 +699,6 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 80
 ; RV32-NEXT:    ret
-;
 ; RV64-LABEL: insert_v2i64_nxv16i64_hi:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -80
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index 98e6b8f2dd760a..76d6f8931d2840 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -6,25 +6,39 @@
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
 
 define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: concat_2xv4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v9
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_2xv4i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vmv1r.v v10, v9
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_2xv4i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i32> %ab
 }
 
 define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
-; CHECK-LABEL: concat_4xv2i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 2
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_4xv2i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v10, v11, 2
+; VLA-NEXT:    vslideup.vi v8, v9, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_4xv2i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv1r.v v13, v10
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vmv1r.v v12, v8
+; VLS-NEXT:    vslideup.vi v13, v11, 2
+; VLS-NEXT:    vslideup.vi v12, v9, 2
+; VLS-NEXT:    vmv2r.v v8, v12
+; VLS-NEXT:    ret
   %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %abcd = shufflevector <4 x i32> %ab, <4 x i32> %cd, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -32,21 +46,38 @@ define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
 }
 
 define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x i32> %d, <1 x i32> %e, <1 x i32> %f, <1 x i32> %g, <1 x i32> %h) {
-; CHECK-LABEL: concat_8xv1i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v14, v15, 1
-; CHECK-NEXT:    vslideup.vi v12, v13, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 1
-; CHECK-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 4
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_8xv1i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vslideup.vi v14, v15, 1
+; VLA-NEXT:    vslideup.vi v12, v13, 1
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v12, v14, 2
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vslideup.vi v10, v11, 1
+; VLA-NEXT:    vslideup.vi v8, v9, 1
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v10, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 4
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_8xv1i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv1r.v v17, v12
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vslideup.vi v14, v15, 1
+; VLS-NEXT:    vmv1r.v v16, v8
+; VLS-NEXT:    vslideup.vi v17, v13, 1
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v17, v14, 2
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vslideup.vi v10, v11, 1
+; VLS-NEXT:    vslideup.vi v16, v9, 1
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v16, v10, 2
+; VLS-NEXT:    vmv2r.v v8, v16
+; VLS-NEXT:    ret
   %ab = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
   %cd = shufflevector <1 x i32> %c, <1 x i32> %d, <2 x i32> <i32 0, i32 1>
   %abcd = shufflevector <2 x i32> %ab, <2 x i32> %cd, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -58,28 +89,36 @@ define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x
 }
 
 define <16 x i32> @concat_2xv8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: concat_2xv8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv2r.v v12, v10
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_2xv8i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vmv2r.v v12, v10
+; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_2xv8i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    ret
   %v = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i32> %v
 }
 
 define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
-; CHECK-LABEL: concat_4xv4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v14, v11
-; CHECK-NEXT:    vmv1r.v v12, v10
-; CHECK-NEXT:    vmv1r.v v10, v9
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 4
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_4xv4i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vmv1r.v v14, v11
+; VLA-NEXT:    vmv1r.v v12, v10
+; VLA-NEXT:    vmv1r.v v10, v9
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v12, v14, 4
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_4xv4i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %abcd = shufflevector <8 x i32> %ab, <8 x i32> %cd, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -87,21 +126,35 @@ define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 }
 
 define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d, <2 x i32> %e, <2 x i32> %f, <2 x i32> %g, <2 x i32> %h) {
-; CHECK-LABEL: concat_8xv2i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v14, v15, 2
-; CHECK-NEXT:    vslideup.vi v12, v13, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 2
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_8xv2i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v14, v15, 2
+; VLA-NEXT:    vslideup.vi v12, v13, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v12, v14, 4
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v10, v11, 2
+; VLA-NEXT:    vslideup.vi v8, v9, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_8xv2i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv1r.v v19, v14
+; VLS-NEXT:    vmv1r.v v18, v12
+; VLS-NEXT:    vmv1r.v v17, v10
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vmv1r.v v16, v8
+; VLS-NEXT:    vslideup.vi v19, v15, 2
+; VLS-NEXT:    vslideup.vi v18, v13, 2
+; VLS-NEXT:    vslideup.vi v17, v11, 2
+; VLS-NEXT:    vslideup.vi v16, v9, 2
+; VLS-NEXT:    vmv4r.v v8, v16
+; VLS-NEXT:    ret
   %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %abcd = shufflevector <4 x i32> %ab, <4 x i32> %cd, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -123,9 +176,6 @@ define <32 x i32> @concat_2xv16i32(<16 x i32> %a, <16 x i32> %b) {
 ;
 ; VLS-LABEL: concat_2xv16i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv4r.v v16, v12
-; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 16
 ; VLS-NEXT:    ret
   %ab = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   ret <32 x i32> %ab
@@ -147,14 +197,6 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x
 ;
 ; VLS-LABEL: concat_4xv8i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv2r.v v20, v14
-; VLS-NEXT:    vmv2r.v v16, v12
-; VLS-NEXT:    vmv2r.v v12, v10
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v16, v20, 8
-; VLS-NEXT:    vslideup.vi v8, v12, 8
-; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 16
 ; VLS-NEXT:    ret
   %ab = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %cd = shufflevector <8 x i32> %c, <8 x i32> %d, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -189,25 +231,6 @@ define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 ;
 ; VLS-LABEL: concat_8xv4i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv1r.v v18, v15
-; VLS-NEXT:    vmv1r.v v20, v14
-; VLS-NEXT:    vmv1r.v v22, v13
-; VLS-NEXT:    vmv1r.v v16, v12
-; VLS-NEXT:    vmv1r.v v14, v11
-; VLS-NEXT:    vmv1r.v v12, v10
-; VLS-NEXT:    vmv1r.v v10, v9
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v20, v18, 4
-; VLS-NEXT:    vslideup.vi v16, v22, 4
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v16, v20, 8
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v12, v14, 4
-; VLS-NEXT:    vslideup.vi v8, v10, 4
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v12, 8
-; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 16
 ; VLS-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

>From 20fce7f2646a2678ed61ccc0c72a5682fd6f5b4b Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 1 Apr 2024 14:02:36 +0800
Subject: [PATCH 2/4] Address review comments:

* Rework the subvec size check by introducing a isKnownMultipleOf helper function on TypeSize/ElementCount. We can then reason that the subvector is exactly vector register sized if it is a multiple of the vector register size.
* Rename variables to clarify we are checking the subvec size
* Update wording of comment
---
 llvm/include/llvm/Support/TypeSize.h        | 12 +++++
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 14 +++---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 50 +++++++--------------
 llvm/lib/Target/RISCV/RISCVSubtarget.h      | 11 +++++
 4 files changed, 46 insertions(+), 41 deletions(-)

diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h
index 68dbe1ea3062ab..c6779e258be7cb 100644
--- a/llvm/include/llvm/Support/TypeSize.h
+++ b/llvm/include/llvm/Support/TypeSize.h
@@ -181,6 +181,18 @@ template <typename LeafTy, typename ValueTy> class FixedOrScalableQuantity {
     return getKnownMinValue() % RHS == 0;
   }
 
+  /// Returns whether or not the callee is known to be a multiple of RHS.
+  constexpr bool isKnownMultipleOf(const FixedOrScalableQuantity &RHS) const {
+    // x % y == 0 => x % y == 0
+    // x % y == 0 => (vscale * x) % y == 0
+    // x % y == 0 => (vscale * x) % (vscale * y) == 0
+    // but
+    // x % y == 0 !=> x % (vscale * y) == 0
+    if (!isScalable() && RHS.isScalable())
+      return false;
+    return getKnownMinValue() % RHS.getKnownMinValue() == 0;
+  }
+
   // Return the minimum value with the assumption that the count is exact.
   // Use in places where a scalable count doesn't make sense (e.g. non-vector
   // types, or vectors in backends which don't support scalable vectors).
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 26d9407d7312fb..53a2a7ae98e29d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2100,13 +2100,13 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     // Establish the correct scalable-vector types for any fixed-length type.
     if (SubVecVT.isFixedLengthVector()) {
       SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT);
-      bool AlignedToVecReg = false;
-      if (auto VLen = Subtarget->getRealVLen();
-          VLen && SubVecVT.getSizeInBits() ==
-                      SubVecContainerVT.getSizeInBits().getKnownMinValue() *
-                          (*VLen / RISCV::RVVBitsPerBlock))
-        AlignedToVecReg = true;
-      assert(Idx == 0 && (AlignedToVecReg || V.isUndef()));
+#ifndef NDEBUG
+      TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
+      bool ExactlyVecRegSized =
+          Subtarget->expandVScale(SubVecVT.getSizeInBits())
+              .isKnownMultipleOf(Subtarget->expandVScale(VecRegSize));
+      assert(Idx == 0 && (ExactlyVecRegSized || V.isUndef()));
+#endif
     }
     MVT ContainerVT = VT;
     if (VT.isFixedLengthVector())
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d4f36747536b6f..e61b03d976293b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9720,21 +9720,6 @@ SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op,
                            Vec, Mask, VL, DL, DAG, Subtarget);
 }
 
-/// Returns true if \p LHS is known to be equal to \p RHS, taking into account
-/// if VLEN is exactly known by \p Subtarget and thus vscale when handling
-/// scalable quantities.
-static bool isKnownEQ(ElementCount LHS, ElementCount RHS,
-                      const RISCVSubtarget &Subtarget) {
-  if (auto VLen = Subtarget.getRealVLen()) {
-    const unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock;
-    if (LHS.isScalable())
-      LHS = ElementCount::getFixed(LHS.getKnownMinValue() * Vscale);
-    if (RHS.isScalable())
-      RHS = ElementCount::getFixed(RHS.getKnownMinValue() * Vscale);
-  }
-  return LHS == RHS;
-}
-
 SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                    SelectionDAG &DAG) const {
   SDValue Vec = Op.getOperand(0);
@@ -9872,29 +9857,25 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     RemIdx = ElementCount::getScalable(Decompose.second);
   }
 
-  RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(ContainerSubVecVT);
-  bool IsSubVecPartReg = SubVecLMUL == RISCVII::VLMUL::LMUL_F2 ||
-                         SubVecLMUL == RISCVII::VLMUL::LMUL_F4 ||
-                         SubVecLMUL == RISCVII::VLMUL::LMUL_F8;
-  bool AlignedToVecReg = !IsSubVecPartReg;
-  if (SubVecVT.isFixedLengthVector())
-    AlignedToVecReg &= SubVecVT.getSizeInBits() ==
-                       ContainerSubVecVT.getSizeInBits().getKnownMinValue() *
-                           (*VLen / RISCV::RVVBitsPerBlock);
+  TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
+  bool ExactlyVecRegSized =
+      Subtarget.expandVScale(SubVecVT.getSizeInBits())
+          .isKnownMultipleOf(Subtarget.expandVScale(VecRegSize));
 
   // 1. If the Idx has been completely eliminated and this subvector's size is
   // a vector register or a multiple thereof, or the surrounding elements are
   // undef, then this is a subvector insert which naturally aligns to a vector
   // register. These can easily be handled using subregister manipulation.
-  // 2. If the subvector isn't exactly aligned to a vector register group, then
-  // the insertion must preserve the undisturbed elements of the register. We do
-  // this by lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector
-  // type (which resolves to a subregister copy), performing a VSLIDEUP to place
-  // the subvector within the vector register, and an INSERT_SUBVECTOR of that
-  // LMUL=1 type back into the larger vector (resolving to another subregister
-  // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type
-  // to avoid allocating a large register group to hold our subvector.
-  if (RemIdx.isZero() && (AlignedToVecReg || Vec.isUndef())) {
+  // 2. If the subvector isn't an exact multiple of a valid register group size,
+  // then the insertion must preserve the undisturbed elements of the register.
+  // We do this by lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1
+  // vector type (which resolves to a subregister copy), performing a VSLIDEUP
+  // to place the subvector within the vector register, and an INSERT_SUBVECTOR
+  // of that LMUL=1 type back into the larger vector (resolving to another
+  // subregister operation). See below for how our VSLIDEUP works. We go via a
+  // LMUL=1 type to avoid allocating a large register group to hold our
+  // subvector.
+  if (RemIdx.isZero() && (ExactlyVecRegSized || Vec.isUndef())) {
     if (SubVecVT.isFixedLengthVector()) {
       // We may get NoSubRegister if inserting at index 0 and the subvec
       // container is the same as the vector, e.g. vec=v4i32,subvec=v4i32,idx=0
@@ -9941,7 +9922,8 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
 
   // Use tail agnostic policy if we're inserting over InterSubVT's tail.
   unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
-  if (isKnownEQ(EndIndex, InterSubVT.getVectorElementCount(), Subtarget))
+  if (Subtarget.expandVScale(EndIndex) ==
+      Subtarget.expandVScale(InterSubVT.getVectorElementCount()))
     Policy = RISCVII::TAIL_AGNOSTIC;
 
   // If we're inserting into the lowest elements, use a tail undisturbed
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 85f8f5f654fe7c..c880c9e921e0ea 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -200,6 +200,17 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
     return Min;
   }
 
+  /// If the ElementCount or TypeSize \p X is scalable and VScale (VLEN) is
+  /// exactly known, returns \p X converted to a fixed quantity. Otherwise
+  /// returns \p X unmodified.
+  template <typename Quantity> Quantity expandVScale(Quantity X) const {
+    if (auto VLen = getRealVLen(); VLen && X.isScalable()) {
+      const unsigned VScale = *VLen / RISCV::RVVBitsPerBlock;
+      X = Quantity::getFixed(X.getKnownMinValue() * VScale);
+    }
+    return X;
+  }
+
   RISCVABI::ABI getTargetABI() const { return TargetABI; }
   bool isSoftFPABI() const {
     return TargetABI == RISCVABI::ABI_LP64 ||

>From edb66ba149034677c05da5f9407b8bc481726697 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 9 Apr 2024 07:13:57 +0800
Subject: [PATCH 3/4] Add asserts that the subvector size is a power of 2

This is an invariant needed if we want to check that the subvector exactly fills a vector register by checking if it's a multiple of vlen.
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 2 ++
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 53a2a7ae98e29d..9ba741d64add97 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2105,6 +2105,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
       bool ExactlyVecRegSized =
           Subtarget->expandVScale(SubVecVT.getSizeInBits())
               .isKnownMultipleOf(Subtarget->expandVScale(VecRegSize));
+      assert(isPowerOf2_64(Subtarget->expandVScale(SubVecVT.getSizeInBits())
+                               .getKnownMinValue()));
       assert(Idx == 0 && (ExactlyVecRegSized || V.isUndef()));
 #endif
     }
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e61b03d976293b..3925175703e9a2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9858,6 +9858,8 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
   }
 
   TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
+  assert(isPowerOf2_64(
+      Subtarget.expandVScale(SubVecVT.getSizeInBits()).getKnownMinValue()));
   bool ExactlyVecRegSized =
       Subtarget.expandVScale(SubVecVT.getSizeInBits())
           .isKnownMultipleOf(Subtarget.expandVScale(VecRegSize));

>From a4e8d52a67ffb6c5f0b3726bfad6c8d2a8b81f75 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 25 Apr 2024 15:56:31 +0800
Subject: [PATCH 4/4] Add [[maybe_unused]], remove unused check prefix

An unused check prefix apparently causes an error? Not sure if it was
always like that.
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp               | 2 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll     | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 9ba741d64add97..425030575e4e71 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2102,7 +2102,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
       SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT);
 #ifndef NDEBUG
       TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
-      bool ExactlyVecRegSized =
+      [[maybe_unused]] bool ExactlyVecRegSized =
           Subtarget->expandVScale(SubVecVT.getSizeInBits())
               .isKnownMultipleOf(Subtarget->expandVScale(VecRegSize));
       assert(isPowerOf2_64(Subtarget->expandVScale(SubVecVT.getSizeInBits())
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index 76d6f8931d2840..609b4e98248926 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck -check-prefix=VLA %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck -check-prefix=VLA %s
 
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefix=VLS %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefix=VLS %s
 
 define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) {
 ; VLA-LABEL: concat_2xv4i32: