[llvm] c9a93c3 - [RISCV] Lower vector shuffles to vrgather operations

Fraser Cormack via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 19 03:20:32 PDT 2021


Author: Fraser Cormack
Date: 2021-04-19T11:13:13+01:00
New Revision: c9a93c3e0128e3f35620e69d6dad200cd9e37d3c

URL: https://github.com/llvm/llvm-project/commit/c9a93c3e0128e3f35620e69d6dad200cd9e37d3c
DIFF: https://github.com/llvm/llvm-project/commit/c9a93c3e0128e3f35620e69d6dad200cd9e37d3c.diff

LOG: [RISCV] Lower vector shuffles to vrgather operations

This patch extends the lowering of RVV fixed-length vector shuffles to
avoid the default stack expansion and instead lower to vrgather
instructions.

For "permute"-style shuffles where one vector is swizzled, we can lower
to one vrgather. For shuffles involving two vector operands, we lower to
one unmasked vrgather (or splat, where appropriate) followed by a masked
vrgather which blends in the elements taken from the second vector.
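
As a concrete illustration, the vrgather_shuffle_vv_v4i16 test case in
fixed-vectors-int-shuffles.ll (updated by this patch) takes the
two-operand path:

    define <4 x i16> @vrgather_shuffle_vv_v4i16(<4 x i16> %x, <4 x i16> %y) {
      %s = shufflevector <4 x i16> %x, <4 x i16> %y, <4 x i32> <i32 1, i32 2, i32 0, i32 5>
      ret <4 x i16> %s
    }

As the updated CHECK lines further down show, this now becomes an
unmasked vrgather.vv gathering elements 1, 2 and 0 of %x, followed by a
masked vrgather.vv which blends element 1 of %y into lane 3, rather
than spilling every element through the stack.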

On occasion, when it's not possible to create a legal BUILD_VECTOR for
the indices, we use vrgatherei16 instructions with 16-bit index types.
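
For example, in the vrgather_permute_shuffle_vu_v4f64 test from
fixed-vectors-fp-shuffles.ll (also updated by this patch):

    define <4 x double> @vrgather_permute_shuffle_vu_v4f64(<4 x double> %x) {
      %s = shufflevector <4 x double> %x, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 1>
      ret <4 x double> %s
    }

RV32 cannot build a legal v4i64 index vector (i64 is wider than XLEN),
so it loads v4i16 indices from the constant pool and uses
vrgatherei16.vv, while RV64 loads v4i64 indices and uses a plain
vrgather.vv.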

For 8-bit element vectors where we may have indices over 255, we have a
fairly blunt fallback to the stack expansion to avoid custom-splitting
of the vector types.

To enable the selection of masked vrgather instructions, this patch
adds patterns which match a VSELECT_VL of an unmasked VRGATHER node
against the masked vrgather pseudoinstructions, using the other
VSELECT_VL operand as the merge value.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D100549

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 398366452f72a..01f363fb6bedd 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1453,6 +1453,74 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Called by type legalization to handle splat of i64 on RV32.
+// FIXME: We can optimize this when the type has sign or zero bits in one
+// of the halves.
+static SDValue splatSplitI64WithVL(const SDLoc &DL, MVT VT, SDValue Scalar,
+                                   SDValue VL, SelectionDAG &DAG) {
+  SDValue ThirtyTwoV = DAG.getConstant(32, DL, VT);
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar,
+                           DAG.getConstant(0, DL, MVT::i32));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar,
+                           DAG.getConstant(1, DL, MVT::i32));
+
+  // vmv.v.x vX, hi
+  // vsll.vx vX, vX, /*32*/
+  // vmv.v.x vY, lo
+  // vsll.vx vY, vY, /*32*/
+  // vsrl.vx vY, vY, /*32*/
+  // vor.vv vX, vX, vY
+  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
+  SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+  Lo = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Lo, VL);
+  Lo = DAG.getNode(RISCVISD::SHL_VL, DL, VT, Lo, ThirtyTwoV, Mask, VL);
+  Lo = DAG.getNode(RISCVISD::SRL_VL, DL, VT, Lo, ThirtyTwoV, Mask, VL);
+
+  Hi = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Hi, VL);
+  Hi = DAG.getNode(RISCVISD::SHL_VL, DL, VT, Hi, ThirtyTwoV, Mask, VL);
+
+  return DAG.getNode(RISCVISD::OR_VL, DL, VT, Lo, Hi, Mask, VL);
+}
+
+// This function lowers a splat of the scalar operand Scalar with the vector
+// length VL. It ensures the final sequence is type legal, which is useful when
+// lowering a splat after type legalization.
+static SDValue lowerScalarSplat(SDValue Scalar, SDValue VL, MVT VT, SDLoc DL,
+                                SelectionDAG &DAG,
+                                const RISCVSubtarget &Subtarget) {
+  if (VT.isFloatingPoint())
+    return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, VT, Scalar, VL);
+
+  MVT XLenVT = Subtarget.getXLenVT();
+
+  // Simplest case is that the operand needs to be promoted to XLenVT.
+  if (Scalar.getValueType().bitsLE(XLenVT)) {
+    // If the operand is a constant, sign extend to increase our chances
+    // of being able to use a .vi instruction. ANY_EXTEND would become a
+    // zero extend and the simm5 check in isel would fail.
+    // FIXME: Should we ignore the upper bits in isel instead?
+    unsigned ExtOpc =
+        isa<ConstantSDNode>(Scalar) ? ISD::SIGN_EXTEND : ISD::ANY_EXTEND;
+    Scalar = DAG.getNode(ExtOpc, DL, XLenVT, Scalar);
+    return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Scalar, VL);
+  }
+
+  assert(XLenVT == MVT::i32 && Scalar.getValueType() == MVT::i64 &&
+         "Unexpected scalar for splat lowering!");
+
+  // If this is a sign-extended 32-bit constant, we can truncate it and rely
+  // on the instruction to sign-extend since SEW>XLEN.
+  if (auto *CVal = dyn_cast<ConstantSDNode>(Scalar)) {
+    if (isInt<32>(CVal->getSExtValue()))
+      return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT,
+                         DAG.getConstant(CVal->getSExtValue(), DL, MVT::i32),
+                         VL);
+  }
+
+  // Otherwise use the more complicated splatting algorithm.
+  return splatSplitI64WithVL(DL, VT, Scalar, VL, DAG);
+}
+
 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget) {
   SDValue V1 = Op.getOperand(0);
@@ -1463,48 +1531,130 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   unsigned NumElts = VT.getVectorNumElements();
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
 
+  MVT ContainerVT =
+      RISCVTargetLowering::getContainerForFixedLengthVector(DAG, VT, Subtarget);
+
+  SDValue TrueMask, VL;
+  std::tie(TrueMask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
   if (SVN->isSplat()) {
     int Lane = SVN->getSplatIndex();
     if (Lane >= 0) {
-      MVT ContainerVT = RISCVTargetLowering::getContainerForFixedLengthVector(
-          DAG, VT, Subtarget);
-
       V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
       assert(Lane < (int)NumElts && "Unexpected lane!");
-
-      SDValue Mask, VL;
-      std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
       SDValue Gather =
           DAG.getNode(RISCVISD::VRGATHER_VX_VL, DL, ContainerVT, V1,
-                      DAG.getConstant(Lane, DL, XLenVT), Mask, VL);
+                      DAG.getConstant(Lane, DL, XLenVT), TrueMask, VL);
       return convertFromScalableVector(VT, Gather, DAG, Subtarget);
     }
   }
 
-  // Detect shuffles which can be re-expressed as vector selects.
-  SmallVector<SDValue> MaskVals;
-  // By default we preserve the original operand order, and select LHS as true
-  // and RHS as false. However, since RVV vector selects may feature splats but
-  // only on the LHS, we may choose to invert our mask and instead select
-  // between RHS and LHS.
-  bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
-
+  // Detect shuffles which can be re-expressed as vector selects; these are
+  // shuffles in which each element in the destination is taken from an element
+  // at the corresponding index in either source vector.
   bool IsSelect = all_of(enumerate(SVN->getMask()), [&](const auto &MaskIdx) {
     int MaskIndex = MaskIdx.value();
-    bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ SwapOps;
-    MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
     return MaskIndex < 0 || MaskIdx.index() == (unsigned)MaskIndex % NumElts;
   });
 
-  if (IsSelect) {
-    assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle");
-    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
-    SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
-    return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, SwapOps ? V2 : V1,
-                       SwapOps ? V1 : V2);
+  assert(!V1.isUndef() && "Unexpected shuffle canonicalization");
+
+  SmallVector<SDValue> MaskVals;
+  // As a backup, shuffles can be lowered via a vrgather instruction, possibly
+  // merged with a second vrgather.
+  SmallVector<SDValue> GatherIndicesLHS, GatherIndicesRHS;
+
+  // By default we preserve the original operand order, and use a mask to
+  // select LHS as true and RHS as false. However, since RVV vector selects may
+  // feature splats but only on the LHS, we may choose to invert our mask and
+  // instead select between RHS and LHS.
+  bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
+  bool InvertMask = IsSelect == SwapOps;
+
+  // Now construct the mask that will be used by the vselect or blended
+  // vrgather operation. For vrgathers, construct the appropriate indices into
+  // each vector.
+  for (int MaskIndex : SVN->getMask()) {
+    bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ InvertMask;
+    MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
+    if (!IsSelect) {
+      bool IsLHS = MaskIndex < (int)NumElts;
+      // For "undef" elements of -1, shuffle in element 0 instead.
+      GatherIndicesLHS.push_back(
+          DAG.getConstant(IsLHS ? std::max(MaskIndex, 0) : 0, DL, XLenVT));
+      // TODO: If we're masking out unused elements anyway, it might produce
+      // better code if we use the most-common element index instead of 0.
+      GatherIndicesRHS.push_back(
+          DAG.getConstant(IsLHS ? 0 : MaskIndex - NumElts, DL, XLenVT));
+    }
   }
 
-  return SDValue();
+  if (SwapOps) {
+    std::swap(V1, V2);
+    std::swap(GatherIndicesLHS, GatherIndicesRHS);
+  }
+
+  assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle");
+  MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+  SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
+
+  if (IsSelect)
+    return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V1, V2);
+
+  if (VT.getScalarSizeInBits() == 8 && VT.getVectorNumElements() > 256) {
+    // On such a large vector we're unable to use i8 as the index type.
+    // FIXME: We could promote the index to i16 and use vrgatherei16, but that
+    // may involve vector splitting if we're already at LMUL=8, or our
+    // user-supplied maximum fixed-length LMUL.
+    return SDValue();
+  }
+
+  unsigned GatherOpc = RISCVISD::VRGATHER_VV_VL;
+  MVT IndexVT = VT.changeTypeToInteger();
+  // Since we can't introduce illegal index types at this stage, use i16 and
+  // vrgatherei16 if the corresponding index type for plain vrgather is greater
+  // than XLenVT.
+  if (IndexVT.getScalarType().bitsGT(XLenVT)) {
+    GatherOpc = RISCVISD::VRGATHEREI16_VV_VL;
+    IndexVT = IndexVT.changeVectorElementType(MVT::i16);
+  }
+
+  MVT IndexContainerVT =
+      ContainerVT.changeVectorElementType(IndexVT.getScalarType());
+
+  SDValue Gather;
+  // TODO: This doesn't trigger for i64 vectors on RV32, since there we
+  // encounter a bitcasted BUILD_VECTOR with low/high i32 values.
+  if (SDValue SplatValue = DAG.getSplatValue(V1)) {
+    Gather = lowerScalarSplat(SplatValue, VL, ContainerVT, DL, DAG, Subtarget);
+  } else {
+    SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
+    LHSIndices =
+        convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);
+
+    V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
+    Gather =
+        DAG.getNode(GatherOpc, DL, ContainerVT, V1, LHSIndices, TrueMask, VL);
+  }
+
+  // If a second vector operand is used by this shuffle, blend it in with an
+  // additional vrgather.
+  if (!V2.isUndef()) {
+    MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
+    SelectMask =
+        convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
+
+    SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
+    RHSIndices =
+        convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
+
+    V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
+    V2 = DAG.getNode(GatherOpc, DL, ContainerVT, V2, RHSIndices, TrueMask, VL);
+    Gather = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, SelectMask, V2,
+                         Gather, VL);
+  }
+
+  return convertFromScalableVector(VT, Gather, DAG, Subtarget);
 }
 
 static SDValue getRVVFPExtendOrRound(SDValue Op, MVT VT, MVT ContainerVT,
@@ -2778,35 +2928,6 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
   return DAG.getNode(ISD::TRUNCATE, DL, EltVT, Elt0);
 }
 
-// Called by type legalization to handle splat of i64 on RV32.
-// FIXME: We can optimize this when the type has sign or zero bits in one
-// of the halves.
-static SDValue splatSplitI64WithVL(const SDLoc &DL, MVT VT, SDValue Scalar,
-                                   SDValue VL, SelectionDAG &DAG) {
-  SDValue ThirtyTwoV = DAG.getConstant(32, DL, VT);
-  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar,
-                           DAG.getConstant(0, DL, MVT::i32));
-  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar,
-                           DAG.getConstant(1, DL, MVT::i32));
-
-  // vmv.v.x vX, hi
-  // vsll.vx vX, vX, /*32*/
-  // vmv.v.x vY, lo
-  // vsll.vx vY, vY, /*32*/
-  // vsrl.vx vY, vY, /*32*/
-  // vor.vv vX, vX, vY
-  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
-  SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
-  Lo = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Lo, VL);
-  Lo = DAG.getNode(RISCVISD::SHL_VL, DL, VT, Lo, ThirtyTwoV, Mask, VL);
-  Lo = DAG.getNode(RISCVISD::SRL_VL, DL, VT, Lo, ThirtyTwoV, Mask, VL);
-
-  Hi = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Hi, VL);
-  Hi = DAG.getNode(RISCVISD::SHL_VL, DL, VT, Hi, ThirtyTwoV, Mask, VL);
-
-  return DAG.getNode(RISCVISD::OR_VL, DL, VT, Lo, Hi, Mask, VL);
-}
-
 // Some RVV intrinsics may claim that they want an integer operand to be
 // promoted or expanded.
 static SDValue lowerVectorIntrinsicSplats(SDValue Op, SelectionDAG &DAG,
@@ -2904,31 +3025,9 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     assert(Op.getValueType() == XLenVT && "Unexpected VT!");
     return DAG.getNode(RISCVISD::VMV_X_S, DL, Op.getValueType(),
                        Op.getOperand(1));
-  case Intrinsic::riscv_vmv_v_x: {
-    SDValue Scalar = Op.getOperand(1);
-    if (Scalar.getValueType().bitsLE(XLenVT)) {
-      unsigned ExtOpc =
-          isa<ConstantSDNode>(Scalar) ? ISD::SIGN_EXTEND : ISD::ANY_EXTEND;
-      Scalar = DAG.getNode(ExtOpc, DL, XLenVT, Scalar);
-      return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, Op.getValueType(), Scalar,
-                         Op.getOperand(2));
-    }
-
-    assert(Scalar.getValueType() == MVT::i64 && "Unexpected scalar VT!");
-
-    // If this is a sign-extended 32-bit constant, we can truncate it and rely
-    // on the instruction to sign-extend since SEW>XLEN.
-    if (auto *CVal = dyn_cast<ConstantSDNode>(Scalar)) {
-      if (isInt<32>(CVal->getSExtValue()))
-        return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, Op.getValueType(),
-                           DAG.getConstant(CVal->getSExtValue(), DL, MVT::i32),
-                           Op.getOperand(2));
-    }
-
-    // Otherwise use the more complicated splatting algorithm.
-    return splatSplitI64WithVL(DL, Op.getSimpleValueType(), Scalar,
-                               Op.getOperand(2), DAG);
-  }
+  case Intrinsic::riscv_vmv_v_x:
+    return lowerScalarSplat(Op.getOperand(1), Op.getOperand(2),
+                            Op.getSimpleValueType(), DL, DAG, Subtarget);
   case Intrinsic::riscv_vfmv_v_f:
     return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2));

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index 08cdb34538119..bac2e5b6de8fb 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -1089,6 +1089,18 @@ foreach vti = AllIntegerVectors in {
             (!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX)
                  vti.RegClass:$rs2, uimm5:$imm, GPR:$vl, vti.SEW)>;
 
+  def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+                                          (riscv_vrgather_vv_vl
+                                            vti.RegClass:$rs2,
+                                            vti.RegClass:$rs1,
+                                            (vti.Mask true_mask),
+                                            VLOpFrag),
+                                          vti.RegClass:$merge,
+                                          VLOpFrag)),
+            (!cast<Instruction>("PseudoVRGATHER_VV_"# vti.LMul.MX#"_MASK")
+                 vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+                 vti.Mask:$vm, GPR:$vl, vti.SEW)>;
+
   // emul = lmul * 16 / sew
   defvar vlmul = vti.LMul;
   defvar octuple_lmul = octuple_from_str<vlmul.MX>.ret;
@@ -1103,6 +1115,18 @@ foreach vti = AllIntegerVectors in {
                                                     VLOpFrag)),
               (!cast<Instruction>(inst)
                    vti.RegClass:$rs2, ivti.RegClass:$rs1, GPR:$vl, vti.SEW)>;
+
+    def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+                                            (riscv_vrgatherei16_vv_vl
+                                              vti.RegClass:$rs2,
+                                              (ivti.Vector ivti.RegClass:$rs1),
+                                              (vti.Mask true_mask),
+                                              VLOpFrag),
+                                            vti.RegClass:$merge,
+                                            VLOpFrag)),
+              (!cast<Instruction>(inst#"_MASK")
+                   vti.RegClass:$merge, vti.RegClass:$rs2, ivti.RegClass:$rs1,
+                   vti.Mask:$vm, GPR:$vl, vti.SEW)>;
   }
 }
 
@@ -1136,6 +1160,18 @@ foreach vti = AllFloatVectors in {
             (!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX)
                  vti.RegClass:$rs2, uimm5:$imm, GPR:$vl, vti.SEW)>;
 
+  def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+                                          (riscv_vrgather_vv_vl
+                                            vti.RegClass:$rs2,
+                                            (ivti.Vector vti.RegClass:$rs1),
+                                            (vti.Mask true_mask),
+                                            VLOpFrag),
+                                          vti.RegClass:$merge,
+                                          VLOpFrag)),
+            (!cast<Instruction>("PseudoVRGATHER_VV_"# vti.LMul.MX#"_MASK")
+                 vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+                 vti.Mask:$vm, GPR:$vl, vti.SEW)>;
+
   defvar vlmul = vti.LMul;
   defvar octuple_lmul = octuple_from_str<vlmul.MX>.ret;
   defvar octuple_emul = !srl(!mul(octuple_lmul, 16), shift_amount<vti.SEW>.val);
@@ -1149,6 +1185,18 @@ foreach vti = AllFloatVectors in {
                                                     VLOpFrag)),
               (!cast<Instruction>(inst)
                    vti.RegClass:$rs2, ivti.RegClass:$rs1, GPR:$vl, vti.SEW)>;
+
+    def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+                                            (riscv_vrgatherei16_vv_vl
+                                              vti.RegClass:$rs2,
+                                              (ivti.Vector ivti.RegClass:$rs1),
+                                              (vti.Mask true_mask),
+                                              VLOpFrag),
+                                            vti.RegClass:$merge,
+                                            VLOpFrag)),
+              (!cast<Instruction>(inst#"_MASK")
+                   vti.RegClass:$merge, vti.RegClass:$rs2, ivti.RegClass:$rs1,
+                   vti.Mask:$vm, GPR:$vl, vti.SEW)>;
   }
 }
 

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
index 69c76a463716f..7644c04ce0b57 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
@@ -61,76 +61,23 @@ define <4 x double> @shuffle_vf_v4f64(<4 x double> %x) {
 define <4 x double> @vrgather_permute_shuffle_vu_v4f64(<4 x double> %x) {
 ; RV32-LABEL: vrgather_permute_shuffle_vu_v4f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -64
-; RV32-NEXT:    .cfi_def_cfa_offset 64
-; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 64
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -32
-; RV32-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT:    vslidedown.vi v26, v8, 1
-; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT:    vfmv.v.f v25, ft0
-; RV32-NEXT:    vsetvli zero, zero, e64,m2,ta,mu
-; RV32-NEXT:    vfmv.f.s ft1, v8
-; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT:    vfmv.s.f v25, ft1
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vse64.v v25, (a0)
-; RV32-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT:    vslidedown.vi v26, v8, 2
-; RV32-NEXT:    vfmv.f.s ft1, v26
-; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT:    vfmv.v.f v25, ft1
-; RV32-NEXT:    vfmv.s.f v25, ft0
-; RV32-NEXT:    vse64.v v25, (sp)
+; RV32-NEXT:    lui a0, %hi(.LCPI4_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI4_0)
+; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT:    vle16.v v25, (a0)
 ; RV32-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
-; RV32-NEXT:    vle64.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -64
-; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    vrgatherei16.vv v26, v8, v25
+; RV32-NEXT:    vmv2r.v v8, v26
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vrgather_permute_shuffle_vu_v4f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    .cfi_def_cfa_offset 64
-; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 64
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -32
-; RV64-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT:    vslidedown.vi v26, v8, 1
-; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT:    vfmv.v.f v25, ft0
-; RV64-NEXT:    vsetvli zero, zero, e64,m2,ta,mu
-; RV64-NEXT:    vfmv.f.s ft1, v8
-; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT:    vfmv.s.f v25, ft1
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vse64.v v25, (a0)
-; RV64-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT:    vslidedown.vi v26, v8, 2
-; RV64-NEXT:    vfmv.f.s ft1, v26
-; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT:    vfmv.v.f v25, ft1
-; RV64-NEXT:    vfmv.s.f v25, ft0
-; RV64-NEXT:    vse64.v v25, (sp)
-; RV64-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
-; RV64-NEXT:    vle64.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -64
-; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 64
+; RV64-NEXT:    lui a0, %hi(.LCPI4_0)
+; RV64-NEXT:    addi a0, a0, %lo(.LCPI4_0)
+; RV64-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
+; RV64-NEXT:    vle64.v v28, (a0)
+; RV64-NEXT:    vrgather.vv v26, v8, v28
+; RV64-NEXT:    vmv2r.v v8, v26
 ; RV64-NEXT:    ret
   %s = shufflevector <4 x double> %x, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 1>
   ret <4 x double> %s
@@ -139,76 +86,23 @@ define <4 x double> @vrgather_permute_shuffle_vu_v4f64(<4 x double> %x) {
 define <4 x double> @vrgather_permute_shuffle_uv_v4f64(<4 x double> %x) {
 ; RV32-LABEL: vrgather_permute_shuffle_uv_v4f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -64
-; RV32-NEXT:    .cfi_def_cfa_offset 64
-; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 64
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -32
-; RV32-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT:    vslidedown.vi v26, v8, 1
-; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT:    vfmv.v.f v25, ft0
-; RV32-NEXT:    vsetvli zero, zero, e64,m2,ta,mu
-; RV32-NEXT:    vfmv.f.s ft1, v8
-; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT:    vfmv.s.f v25, ft1
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vse64.v v25, (a0)
-; RV32-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT:    vslidedown.vi v26, v8, 2
-; RV32-NEXT:    vfmv.f.s ft1, v26
-; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT:    vfmv.v.f v25, ft1
-; RV32-NEXT:    vfmv.s.f v25, ft0
-; RV32-NEXT:    vse64.v v25, (sp)
+; RV32-NEXT:    lui a0, %hi(.LCPI5_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI5_0)
+; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT:    vle16.v v25, (a0)
 ; RV32-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
-; RV32-NEXT:    vle64.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -64
-; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    vrgatherei16.vv v26, v8, v25
+; RV32-NEXT:    vmv2r.v v8, v26
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vrgather_permute_shuffle_uv_v4f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    .cfi_def_cfa_offset 64
-; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 64
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -32
-; RV64-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT:    vslidedown.vi v26, v8, 1
-; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT:    vfmv.v.f v25, ft0
-; RV64-NEXT:    vsetvli zero, zero, e64,m2,ta,mu
-; RV64-NEXT:    vfmv.f.s ft1, v8
-; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT:    vfmv.s.f v25, ft1
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vse64.v v25, (a0)
-; RV64-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT:    vslidedown.vi v26, v8, 2
-; RV64-NEXT:    vfmv.f.s ft1, v26
-; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT:    vfmv.v.f v25, ft1
-; RV64-NEXT:    vfmv.s.f v25, ft0
-; RV64-NEXT:    vse64.v v25, (sp)
-; RV64-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
-; RV64-NEXT:    vle64.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -64
-; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 64
+; RV64-NEXT:    lui a0, %hi(.LCPI5_0)
+; RV64-NEXT:    addi a0, a0, %lo(.LCPI5_0)
+; RV64-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
+; RV64-NEXT:    vle64.v v28, (a0)
+; RV64-NEXT:    vrgather.vv v26, v8, v28
+; RV64-NEXT:    vmv2r.v v8, v26
 ; RV64-NEXT:    ret
   %s = shufflevector <4 x double> undef, <4 x double> %x, <4 x i32> <i32 5, i32 6, i32 4, i32 5>
   ret <4 x double> %s
@@ -217,84 +111,45 @@ define <4 x double> @vrgather_permute_shuffle_uv_v4f64(<4 x double> %x) {
 define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y) {
 ; RV32-LABEL: vrgather_shuffle_vv_v4f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -64
-; RV32-NEXT:    .cfi_def_cfa_offset 64
-; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 64
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -32
-; RV32-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT:    vslidedown.vi v26, v10, 1
-; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT:    vfmv.v.f v25, ft0
-; RV32-NEXT:    vsetvli zero, zero, e64,m2,ta,mu
-; RV32-NEXT:    vfmv.f.s ft0, v8
-; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT:    vfmv.s.f v25, ft0
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vse64.v v25, (a0)
-; RV32-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT:    vslidedown.vi v26, v8, 2
-; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT:    vfmv.v.f v25, ft0
-; RV32-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT:    vslidedown.vi v26, v8, 1
-; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT:    vfmv.s.f v25, ft0
-; RV32-NEXT:    vse64.v v25, (sp)
+; RV32-NEXT:    addi a0, zero, 1
+; RV32-NEXT:    addi a1, zero, 8
+; RV32-NEXT:    vsetivli a2, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v25, a0
+; RV32-NEXT:    vmv.v.i v28, 0
+; RV32-NEXT:    vsetivli a0, 4, e16,m1,tu,mu
+; RV32-NEXT:    vslideup.vi v28, v25, 3
+; RV32-NEXT:    lui a0, %hi(.LCPI6_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI6_0)
+; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT:    vle16.v v25, (a0)
 ; RV32-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
-; RV32-NEXT:    vle64.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -64
-; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    vrgatherei16.vv v26, v8, v25
+; RV32-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
+; RV32-NEXT:    vrgatherei16.vv v26, v10, v28, v0.t
+; RV32-NEXT:    vmv2r.v v8, v26
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vrgather_shuffle_vv_v4f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    .cfi_def_cfa_offset 64
-; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 64
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -32
-; RV64-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT:    vslidedown.vi v26, v10, 1
-; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT:    vfmv.v.f v25, ft0
-; RV64-NEXT:    vsetvli zero, zero, e64,m2,ta,mu
-; RV64-NEXT:    vfmv.f.s ft0, v8
-; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT:    vfmv.s.f v25, ft0
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vse64.v v25, (a0)
-; RV64-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT:    vslidedown.vi v26, v8, 2
-; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT:    vfmv.v.f v25, ft0
-; RV64-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT:    vslidedown.vi v26, v8, 1
-; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT:    vfmv.s.f v25, ft0
-; RV64-NEXT:    vse64.v v25, (sp)
-; RV64-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
-; RV64-NEXT:    vle64.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -64
-; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 64
+; RV64-NEXT:    addi a0, zero, 1
+; RV64-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
+; RV64-NEXT:    vmv.s.x v26, a0
+; RV64-NEXT:    vmv.v.i v28, 0
+; RV64-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
+; RV64-NEXT:    vslideup.vi v28, v26, 3
+; RV64-NEXT:    addi a0, zero, 8
+; RV64-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    lui a0, %hi(.LCPI6_0)
+; RV64-NEXT:    addi a0, a0, %lo(.LCPI6_0)
+; RV64-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
+; RV64-NEXT:    vle64.v v30, (a0)
+; RV64-NEXT:    vrgather.vv v26, v8, v30
+; RV64-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
+; RV64-NEXT:    vrgather.vv v26, v10, v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v26
 ; RV64-NEXT:    ret
   %s = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 2, i32 0, i32 5>
   ret <4 x double> %s
@@ -303,72 +158,37 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y)
 define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) {
 ; RV32-LABEL: vrgather_shuffle_xv_v4f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -64
-; RV32-NEXT:    .cfi_def_cfa_offset 64
-; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 64
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -32
-; RV32-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT:    vslidedown.vi v26, v8, 1
-; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT:    vfmv.v.f v25, ft0
-; RV32-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT:    vslidedown.vi v26, v8, 2
-; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT:    lui a0, %hi(.LCPI7_0)
-; RV32-NEXT:    fld ft1, %lo(.LCPI7_0)(a0)
-; RV32-NEXT:    vfmv.s.f v25, ft0
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vse64.v v25, (a0)
-; RV32-NEXT:    fsd ft1, 8(sp)
-; RV32-NEXT:    fsd ft1, 0(sp)
+; RV32-NEXT:    addi a0, zero, 12
+; RV32-NEXT:    lui a1, %hi(.LCPI7_0)
+; RV32-NEXT:    fld ft0, %lo(.LCPI7_0)(a1)
+; RV32-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a0
 ; RV32-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
-; RV32-NEXT:    vle64.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -64
-; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    vfmv.v.f v26, ft0
+; RV32-NEXT:    lui a0, %hi(.LCPI7_1)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI7_1)
+; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT:    vle16.v v25, (a0)
+; RV32-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
+; RV32-NEXT:    vrgatherei16.vv v26, v8, v25, v0.t
+; RV32-NEXT:    vmv2r.v v8, v26
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vrgather_shuffle_xv_v4f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    .cfi_def_cfa_offset 64
-; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 64
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -32
-; RV64-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT:    vslidedown.vi v26, v8, 1
-; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT:    vfmv.v.f v25, ft0
-; RV64-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT:    vslidedown.vi v26, v8, 2
-; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
+; RV64-NEXT:    addi a0, zero, 12
+; RV64-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    lui a0, %hi(.LCPI7_0)
-; RV64-NEXT:    fld ft1, %lo(.LCPI7_0)(a0)
-; RV64-NEXT:    vfmv.s.f v25, ft0
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vse64.v v25, (a0)
-; RV64-NEXT:    fsd ft1, 8(sp)
-; RV64-NEXT:    fsd ft1, 0(sp)
-; RV64-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
-; RV64-NEXT:    vle64.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -64
-; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 64
+; RV64-NEXT:    addi a0, a0, %lo(.LCPI7_0)
+; RV64-NEXT:    lui a1, %hi(.LCPI7_1)
+; RV64-NEXT:    fld ft0, %lo(.LCPI7_1)(a1)
+; RV64-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
+; RV64-NEXT:    vle64.v v28, (a0)
+; RV64-NEXT:    vfmv.v.f v26, ft0
+; RV64-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
+; RV64-NEXT:    vrgather.vv v26, v8, v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v26
 ; RV64-NEXT:    ret
   %s = shufflevector <4 x double> <double 2.0, double 2.0, double 2.0, double 2.0>, <4 x double> %x, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
   ret <4 x double> %s
@@ -377,68 +197,40 @@ define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) {
 define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) {
 ; RV32-LABEL: vrgather_shuffle_vx_v4f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -64
-; RV32-NEXT:    .cfi_def_cfa_offset 64
-; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 64
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -32
-; RV32-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT:    vslidedown.vi v26, v8, 3
-; RV32-NEXT:    vfmv.f.s ft0, v26
-; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT:    vfmv.v.f v25, ft0
-; RV32-NEXT:    vsetvli zero, zero, e64,m2,ta,mu
-; RV32-NEXT:    vfmv.f.s ft0, v8
+; RV32-NEXT:    addi a0, zero, 3
+; RV32-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v25, a0
+; RV32-NEXT:    vmv.v.i v28, 0
 ; RV32-NEXT:    lui a0, %hi(.LCPI8_0)
-; RV32-NEXT:    fld ft1, %lo(.LCPI8_0)(a0)
-; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT:    vfmv.s.f v25, ft0
-; RV32-NEXT:    vse64.v v25, (sp)
-; RV32-NEXT:    fsd ft1, 24(sp)
-; RV32-NEXT:    fsd ft1, 16(sp)
+; RV32-NEXT:    fld ft0, %lo(.LCPI8_0)(a0)
+; RV32-NEXT:    vsetivli a0, 2, e16,m1,tu,mu
+; RV32-NEXT:    vslideup.vi v28, v25, 1
 ; RV32-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
-; RV32-NEXT:    vle64.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -64
-; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    vfmv.v.f v26, ft0
+; RV32-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
+; RV32-NEXT:    vrgatherei16.vv v26, v8, v28, v0.t
+; RV32-NEXT:    vmv2r.v v8, v26
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vrgather_shuffle_vx_v4f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    .cfi_def_cfa_offset 64
-; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 64
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -32
-; RV64-NEXT:    vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT:    vslidedown.vi v26, v8, 3
-; RV64-NEXT:    vfmv.f.s ft0, v26
-; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT:    vfmv.v.f v25, ft0
-; RV64-NEXT:    vsetvli zero, zero, e64,m2,ta,mu
-; RV64-NEXT:    vfmv.f.s ft0, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI8_0)
-; RV64-NEXT:    fld ft1, %lo(.LCPI8_0)(a0)
-; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT:    vfmv.s.f v25, ft0
-; RV64-NEXT:    vse64.v v25, (sp)
-; RV64-NEXT:    fsd ft1, 24(sp)
-; RV64-NEXT:    fsd ft1, 16(sp)
+; RV64-NEXT:    addi a0, zero, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
+; RV64-NEXT:    vmv.s.x v26, a0
+; RV64-NEXT:    vmv.v.i v28, 0
+; RV64-NEXT:    vsetivli a1, 2, e64,m2,tu,mu
+; RV64-NEXT:    vslideup.vi v28, v26, 1
+; RV64-NEXT:    lui a1, %hi(.LCPI8_0)
+; RV64-NEXT:    fld ft0, %lo(.LCPI8_0)(a1)
+; RV64-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
-; RV64-NEXT:    vle64.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -64
-; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 64
+; RV64-NEXT:    vfmv.v.f v26, ft0
+; RV64-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
+; RV64-NEXT:    vrgather.vv v26, v8, v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v26
 ; RV64-NEXT:    ret
   %s = shufflevector <4 x double> %x, <4 x double> <double 2.0, double 2.0, double 2.0, double 2.0>, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
   ret <4 x double> %s

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index ff64f94b6ef5e..a3f29a25e13ce 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -57,23 +57,12 @@ define <4 x i16> @shuffle_vx_v4i16(<4 x i16> %x) {
 define <4 x i16> @vrgather_permute_shuffle_vu_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: vrgather_permute_shuffle_vu_v4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    vsetvli zero, zero, e16,m1,ta,mu
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    sh a0, 12(sp)
-; CHECK-NEXT:    vsetivli a0, 1, e16,m1,ta,mu
-; CHECK-NEXT:    vslidedown.vi v25, v8, 1
-; CHECK-NEXT:    vmv.x.s a0, v25
-; CHECK-NEXT:    sh a0, 14(sp)
-; CHECK-NEXT:    vslidedown.vi v25, v8, 2
-; CHECK-NEXT:    vmv.x.s a1, v25
-; CHECK-NEXT:    sh a1, 10(sp)
-; CHECK-NEXT:    sh a0, 8(sp)
-; CHECK-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI4_0)
+; CHECK-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v26, (a0)
+; CHECK-NEXT:    vrgather.vv v25, v8, v26
+; CHECK-NEXT:    vmv1r.v v8, v25
 ; CHECK-NEXT:    ret
   %s = shufflevector <4 x i16> %x, <4 x i16> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 1>
   ret <4 x i16> %s
@@ -82,23 +71,12 @@ define <4 x i16> @vrgather_permute_shuffle_vu_v4i16(<4 x i16> %x) {
 define <4 x i16> @vrgather_permute_shuffle_uv_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: vrgather_permute_shuffle_uv_v4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    vsetvli zero, zero, e16,m1,ta,mu
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    sh a0, 12(sp)
-; CHECK-NEXT:    vsetivli a0, 1, e16,m1,ta,mu
-; CHECK-NEXT:    vslidedown.vi v25, v8, 1
-; CHECK-NEXT:    vmv.x.s a0, v25
-; CHECK-NEXT:    sh a0, 14(sp)
-; CHECK-NEXT:    vslidedown.vi v25, v8, 2
-; CHECK-NEXT:    vmv.x.s a1, v25
-; CHECK-NEXT:    sh a1, 10(sp)
-; CHECK-NEXT:    sh a0, 8(sp)
-; CHECK-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI5_0)
+; CHECK-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v26, (a0)
+; CHECK-NEXT:    vrgather.vv v25, v8, v26
+; CHECK-NEXT:    vmv1r.v v8, v25
 ; CHECK-NEXT:    ret
   %s = shufflevector <4 x i16> undef, <4 x i16> %x, <4 x i32> <i32 5, i32 6, i32 4, i32 5>
   ret <4 x i16> %s
@@ -107,114 +85,64 @@ define <4 x i16> @vrgather_permute_shuffle_uv_v4i16(<4 x i16> %x) {
 define <4 x i16> @vrgather_shuffle_vv_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-LABEL: vrgather_shuffle_vv_v4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    vsetvli zero, zero, e16,m1,ta,mu
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    sh a0, 12(sp)
-; CHECK-NEXT:    vsetivli a0, 1, e16,m1,ta,mu
-; CHECK-NEXT:    vslidedown.vi v25, v9, 1
-; CHECK-NEXT:    vmv.x.s a0, v25
-; CHECK-NEXT:    sh a0, 14(sp)
-; CHECK-NEXT:    vslidedown.vi v25, v8, 2
-; CHECK-NEXT:    vmv.x.s a0, v25
-; CHECK-NEXT:    sh a0, 10(sp)
-; CHECK-NEXT:    vslidedown.vi v25, v8, 1
-; CHECK-NEXT:    vmv.x.s a0, v25
-; CHECK-NEXT:    sh a0, 8(sp)
-; CHECK-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    addi a0, zero, 1
+; CHECK-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; CHECK-NEXT:    vmv.s.x v25, a0
+; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vsetivli a0, 4, e16,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v26, v25, 3
+; CHECK-NEXT:    addi a0, zero, 8
+; CHECK-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    lui a0, %hi(.LCPI6_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI6_0)
+; CHECK-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v27, (a0)
+; CHECK-NEXT:    vrgather.vv v25, v8, v27
+; CHECK-NEXT:    vsetivli a0, 4, e16,m1,tu,mu
+; CHECK-NEXT:    vrgather.vv v25, v9, v26, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v25
 ; CHECK-NEXT:    ret
   %s = shufflevector <4 x i16> %x, <4 x i16> %y, <4 x i32> <i32 1, i32 2, i32 0, i32 5>
   ret <4 x i16> %s
 }
 
 define <4 x i16> @vrgather_shuffle_xv_v4i16(<4 x i16> %x) {
-; RV32-LABEL: vrgather_shuffle_xv_v4i16:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 80
-; RV32-NEXT:    addi a0, a0, 5
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    vsetivli a0, 1, e16,m1,ta,mu
-; RV32-NEXT:    vslidedown.vi v25, v8, 1
-; RV32-NEXT:    vmv.x.s a0, v25
-; RV32-NEXT:    sh a0, 14(sp)
-; RV32-NEXT:    vslidedown.vi v25, v8, 2
-; RV32-NEXT:    vmv.x.s a0, v25
-; RV32-NEXT:    sh a0, 12(sp)
-; RV32-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vle16.v v8, (a0)
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vrgather_shuffle_xv_v4i16:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    lui a0, 80
-; RV64-NEXT:    addiw a0, a0, 5
-; RV64-NEXT:    sw a0, 8(sp)
-; RV64-NEXT:    vsetivli a0, 1, e16,m1,ta,mu
-; RV64-NEXT:    vslidedown.vi v25, v8, 1
-; RV64-NEXT:    vmv.x.s a0, v25
-; RV64-NEXT:    sh a0, 14(sp)
-; RV64-NEXT:    vslidedown.vi v25, v8, 2
-; RV64-NEXT:    vmv.x.s a0, v25
-; RV64-NEXT:    sh a0, 12(sp)
-; RV64-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
-; RV64-NEXT:    addi a0, sp, 8
-; RV64-NEXT:    vle16.v v8, (a0)
-; RV64-NEXT:    addi sp, sp, 16
-; RV64-NEXT:    ret
+; CHECK-LABEL: vrgather_shuffle_xv_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a0, zero, 12
+; CHECK-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    lui a0, %hi(.LCPI7_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI7_0)
+; CHECK-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v26, (a0)
+; CHECK-NEXT:    vmv.v.i v25, 5
+; CHECK-NEXT:    vsetivli a0, 4, e16,m1,tu,mu
+; CHECK-NEXT:    vrgather.vv v25, v8, v26, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v25
+; CHECK-NEXT:    ret
   %s = shufflevector <4 x i16> <i16 5, i16 5, i16 5, i16 5>, <4 x i16> %x, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
   ret <4 x i16> %s
 }
 
 define <4 x i16> @vrgather_shuffle_vx_v4i16(<4 x i16> %x) {
-; RV32-LABEL: vrgather_shuffle_vx_v4i16:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 80
-; RV32-NEXT:    addi a0, a0, 5
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    vsetvli zero, zero, e16,m1,ta,mu
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    sh a0, 8(sp)
-; RV32-NEXT:    vsetivli a0, 1, e16,m1,ta,mu
-; RV32-NEXT:    vslidedown.vi v25, v8, 3
-; RV32-NEXT:    vmv.x.s a0, v25
-; RV32-NEXT:    sh a0, 10(sp)
-; RV32-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vle16.v v8, (a0)
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vrgather_shuffle_vx_v4i16:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    lui a0, 80
-; RV64-NEXT:    addiw a0, a0, 5
-; RV64-NEXT:    sw a0, 12(sp)
-; RV64-NEXT:    vsetvli zero, zero, e16,m1,ta,mu
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    sh a0, 8(sp)
-; RV64-NEXT:    vsetivli a0, 1, e16,m1,ta,mu
-; RV64-NEXT:    vslidedown.vi v25, v8, 3
-; RV64-NEXT:    vmv.x.s a0, v25
-; RV64-NEXT:    sh a0, 10(sp)
-; RV64-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
-; RV64-NEXT:    addi a0, sp, 8
-; RV64-NEXT:    vle16.v v8, (a0)
-; RV64-NEXT:    addi sp, sp, 16
-; RV64-NEXT:    ret
+; CHECK-LABEL: vrgather_shuffle_vx_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a0, zero, 3
+; CHECK-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; CHECK-NEXT:    vmv.s.x v25, a0
+; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vsetivli a1, 2, e16,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v26, v25, 1
+; CHECK-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
+; CHECK-NEXT:    vmv.v.i v25, 5
+; CHECK-NEXT:    vsetivli a0, 4, e16,m1,tu,mu
+; CHECK-NEXT:    vrgather.vv v25, v8, v26, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v25
+; CHECK-NEXT:    ret
   %s = shufflevector <4 x i16> %x, <4 x i16> <i16 5, i16 5, i16 5, i16 5>, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
   ret <4 x i16> %s
 }
@@ -222,96 +150,23 @@ define <4 x i16> @vrgather_shuffle_vx_v4i16(<4 x i16> %x) {
 define <8 x i64> @vrgather_permute_shuffle_vu_v8i64(<8 x i64> %x) {
 ; RV32-LABEL: vrgather_permute_shuffle_vu_v8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetvli zero, zero, e32,m4,ta,mu
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    sw a0, 48(sp)
-; RV32-NEXT:    sw a0, 16(sp)
-; RV32-NEXT:    vsetivli a0, 1, e32,m4,ta,mu
-; RV32-NEXT:    vslidedown.vi v28, v8, 3
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 60(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 2
-; RV32-NEXT:    vmv.x.s a1, v28
-; RV32-NEXT:    sw a1, 56(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 1
-; RV32-NEXT:    vmv.x.s a2, v28
-; RV32-NEXT:    sw a2, 52(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 13
-; RV32-NEXT:    vmv.x.s a3, v28
-; RV32-NEXT:    sw a3, 44(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 12
-; RV32-NEXT:    vmv.x.s a3, v28
-; RV32-NEXT:    sw a3, 40(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 15
-; RV32-NEXT:    vmv.x.s a3, v28
-; RV32-NEXT:    sw a3, 36(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 14
-; RV32-NEXT:    vmv.x.s a3, v28
-; RV32-NEXT:    sw a3, 32(sp)
-; RV32-NEXT:    sw a0, 28(sp)
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a2, 20(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 5
-; RV32-NEXT:    vmv.x.s a2, v28
-; RV32-NEXT:    sw a2, 12(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 4
-; RV32-NEXT:    vmv.x.s a2, v28
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    sw a0, 4(sp)
-; RV32-NEXT:    sw a1, 0(sp)
-; RV32-NEXT:    vsetivli a0, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    lui a0, %hi(.LCPI9_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI9_0)
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT:    vle16.v v25, (a0)
+; RV32-NEXT:    vsetivli a0, 8, e64,m4,ta,mu
+; RV32-NEXT:    vrgatherei16.vv v28, v8, v25
+; RV32-NEXT:    vmv4r.v v8, v28
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vrgather_permute_shuffle_vu_v8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetvli zero, zero, e64,m4,ta,mu
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    sd a0, 48(sp)
-; RV64-NEXT:    sd a0, 16(sp)
-; RV64-NEXT:    vsetivli a0, 1, e64,m4,ta,mu
-; RV64-NEXT:    vslidedown.vi v28, v8, 1
-; RV64-NEXT:    vmv.x.s a0, v28
-; RV64-NEXT:    sd a0, 56(sp)
-; RV64-NEXT:    vslidedown.vi v28, v8, 6
-; RV64-NEXT:    vmv.x.s a1, v28
-; RV64-NEXT:    sd a1, 40(sp)
-; RV64-NEXT:    vslidedown.vi v28, v8, 7
-; RV64-NEXT:    vmv.x.s a1, v28
-; RV64-NEXT:    sd a1, 32(sp)
-; RV64-NEXT:    sd a0, 24(sp)
-; RV64-NEXT:    vslidedown.vi v28, v8, 2
-; RV64-NEXT:    vmv.x.s a1, v28
-; RV64-NEXT:    sd a1, 8(sp)
-; RV64-NEXT:    sd a0, 0(sp)
-; RV64-NEXT:    vsetivli a0, 8, e64,m4,ta,mu
-; RV64-NEXT:    vle64.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    lui a0, %hi(.LCPI9_0)
+; RV64-NEXT:    addi a0, a0, %lo(.LCPI9_0)
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vle64.v v12, (a0)
+; RV64-NEXT:    vrgather.vv v28, v8, v12
+; RV64-NEXT:    vmv4r.v v8, v28
 ; RV64-NEXT:    ret
   %s = shufflevector <8 x i64> %x, <8 x i64> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 1, i32 7, i32 6, i32 0, i32 1>
   ret <8 x i64> %s
@@ -320,96 +175,23 @@ define <8 x i64> @vrgather_permute_shuffle_vu_v8i64(<8 x i64> %x) {
 define <8 x i64> @vrgather_permute_shuffle_uv_v8i64(<8 x i64> %x) {
 ; RV32-LABEL: vrgather_permute_shuffle_uv_v8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetvli zero, zero, e32,m4,ta,mu
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    sw a0, 48(sp)
-; RV32-NEXT:    sw a0, 40(sp)
-; RV32-NEXT:    sw a0, 16(sp)
-; RV32-NEXT:    vsetivli a0, 1, e32,m4,ta,mu
-; RV32-NEXT:    vslidedown.vi v28, v8, 7
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 60(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 6
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 56(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 1
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 52(sp)
-; RV32-NEXT:    sw a0, 44(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 15
-; RV32-NEXT:    vmv.x.s a1, v28
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 14
-; RV32-NEXT:    vmv.x.s a1, v28
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 3
-; RV32-NEXT:    vmv.x.s a1, v28
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v28
-; RV32-NEXT:    sw a2, 24(sp)
-; RV32-NEXT:    sw a0, 20(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 5
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 4
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    sw a2, 0(sp)
-; RV32-NEXT:    vsetivli a0, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    lui a0, %hi(.LCPI10_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI10_0)
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT:    vle16.v v25, (a0)
+; RV32-NEXT:    vsetivli a0, 8, e64,m4,ta,mu
+; RV32-NEXT:    vrgatherei16.vv v28, v8, v25
+; RV32-NEXT:    vmv4r.v v8, v28
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vrgather_permute_shuffle_uv_v8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetvli zero, zero, e64,m4,ta,mu
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    sd a0, 48(sp)
-; RV64-NEXT:    sd a0, 40(sp)
-; RV64-NEXT:    sd a0, 16(sp)
-; RV64-NEXT:    vsetivli a0, 1, e64,m4,ta,mu
-; RV64-NEXT:    vslidedown.vi v28, v8, 3
-; RV64-NEXT:    vmv.x.s a0, v28
-; RV64-NEXT:    sd a0, 56(sp)
-; RV64-NEXT:    vslidedown.vi v28, v8, 7
-; RV64-NEXT:    vmv.x.s a0, v28
-; RV64-NEXT:    sd a0, 32(sp)
-; RV64-NEXT:    vslidedown.vi v28, v8, 1
-; RV64-NEXT:    vmv.x.s a0, v28
-; RV64-NEXT:    sd a0, 24(sp)
-; RV64-NEXT:    vslidedown.vi v28, v8, 2
-; RV64-NEXT:    vmv.x.s a1, v28
-; RV64-NEXT:    sd a1, 8(sp)
-; RV64-NEXT:    sd a0, 0(sp)
-; RV64-NEXT:    vsetivli a0, 8, e64,m4,ta,mu
-; RV64-NEXT:    vle64.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    lui a0, %hi(.LCPI10_0)
+; RV64-NEXT:    addi a0, a0, %lo(.LCPI10_0)
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vle64.v v12, (a0)
+; RV64-NEXT:    vrgather.vv v28, v8, v12
+; RV64-NEXT:    vmv4r.v v8, v28
 ; RV64-NEXT:    ret
   %s = shufflevector <8 x i64> undef, <8 x i64> %x, <8 x i32> <i32 9, i32 10, i32 8, i32 9, i32 15, i32 8, i32 8, i32 11>
   ret <8 x i64> %s
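Every index in this shuffle mask is >= 8, so the result is drawn entirely from the second operand and the effective gather indices are mask[i] - 8 = {1, 2, 0, 1, 7, 0, 0, 3}. The RV64 checks load what is presumably that constant from .LCPI10_0 as 64-bit elements and issue a single vrgather.vv; the RV32 checks instead load a 16-bit index vector and use vrgatherei16.vv, presumably because a 64-bit index constant is awkward to build with 32-bit scalars.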
@@ -418,102 +200,55 @@ define <8 x i64> @vrgather_permute_shuffle_uv_v8i64(<8 x i64> %x) {
 define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) {
 ; RV32-LABEL: vrgather_shuffle_vv_v8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetivli a0, 1, e32,m4,ta,mu
-; RV32-NEXT:    vslidedown.vi v28, v12, 11
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 60(sp)
-; RV32-NEXT:    vslidedown.vi v28, v12, 10
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 56(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 7
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 52(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 6
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 48(sp)
-; RV32-NEXT:    vslidedown.vi v28, v12, 5
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 44(sp)
-; RV32-NEXT:    vslidedown.vi v28, v12, 4
-; RV32-NEXT:    vmv.x.s a1, v28
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 3
-; RV32-NEXT:    vmv.x.s a2, v28
-; RV32-NEXT:    sw a2, 36(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 2
-; RV32-NEXT:    vmv.x.s a3, v28
-; RV32-NEXT:    sw a3, 32(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 11
-; RV32-NEXT:    vmv.x.s a4, v28
-; RV32-NEXT:    sw a4, 28(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 10
-; RV32-NEXT:    vmv.x.s a4, v28
-; RV32-NEXT:    sw a4, 24(sp)
-; RV32-NEXT:    sw a0, 20(sp)
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 5
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 4
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    sw a2, 4(sp)
-; RV32-NEXT:    sw a3, 0(sp)
-; RV32-NEXT:    vsetivli a0, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    addi a0, zero, 5
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v25, a0
+; RV32-NEXT:    addi a0, zero, 36
+; RV32-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vsetivli a0, 8, e16,m1,ta,mu
+; RV32-NEXT:    vmv.v.i v26, 0
+; RV32-NEXT:    vmerge.vim v26, v26, 2, v0
+; RV32-NEXT:    vsetivli a0, 8, e16,m1,tu,mu
+; RV32-NEXT:    vslideup.vi v26, v25, 7
+; RV32-NEXT:    addi a0, zero, 164
+; RV32-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    lui a0, %hi(.LCPI11_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI11_0)
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT:    vle16.v v25, (a0)
+; RV32-NEXT:    vsetivli a0, 8, e64,m4,ta,mu
+; RV32-NEXT:    vrgatherei16.vv v28, v8, v25
+; RV32-NEXT:    vsetivli a0, 8, e64,m4,tu,mu
+; RV32-NEXT:    vrgatherei16.vv v28, v12, v26, v0.t
+; RV32-NEXT:    vmv4r.v v8, v28
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vrgather_shuffle_vv_v8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetivli a0, 1, e64,m4,ta,mu
-; RV64-NEXT:    vslidedown.vi v28, v12, 5
-; RV64-NEXT:    vmv.x.s a0, v28
-; RV64-NEXT:    sd a0, 56(sp)
-; RV64-NEXT:    vslidedown.vi v28, v8, 3
-; RV64-NEXT:    vmv.x.s a0, v28
-; RV64-NEXT:    sd a0, 48(sp)
-; RV64-NEXT:    vslidedown.vi v28, v12, 2
-; RV64-NEXT:    vmv.x.s a0, v28
-; RV64-NEXT:    sd a0, 40(sp)
-; RV64-NEXT:    vslidedown.vi v28, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v28
-; RV64-NEXT:    sd a1, 32(sp)
-; RV64-NEXT:    vslidedown.vi v28, v8, 5
-; RV64-NEXT:    vmv.x.s a2, v28
-; RV64-NEXT:    sd a2, 24(sp)
-; RV64-NEXT:    sd a0, 16(sp)
-; RV64-NEXT:    vslidedown.vi v28, v8, 2
-; RV64-NEXT:    vmv.x.s a0, v28
-; RV64-NEXT:    sd a0, 8(sp)
-; RV64-NEXT:    sd a1, 0(sp)
+; RV64-NEXT:    addi a0, zero, 5
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vmv.s.x v28, a0
+; RV64-NEXT:    addi a0, zero, 36
+; RV64-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    vsetivli a0, 8, e64,m4,ta,mu
-; RV64-NEXT:    vle64.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    vmv.v.i v16, 0
+; RV64-NEXT:    vmerge.vim v16, v16, 2, v0
+; RV64-NEXT:    vsetivli a0, 8, e64,m4,tu,mu
+; RV64-NEXT:    vslideup.vi v16, v28, 7
+; RV64-NEXT:    addi a0, zero, 164
+; RV64-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    lui a0, %hi(.LCPI11_0)
+; RV64-NEXT:    addi a0, a0, %lo(.LCPI11_0)
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vle64.v v20, (a0)
+; RV64-NEXT:    vrgather.vv v28, v8, v20
+; RV64-NEXT:    vsetivli a0, 8, e64,m4,tu,mu
+; RV64-NEXT:    vrgather.vv v28, v12, v16, v0.t
+; RV64-NEXT:    vmv4r.v v8, v28
 ; RV64-NEXT:    ret
   %s = shufflevector <8 x i64> %x, <8 x i64> %y, <8 x i32> <i32 1, i32 2, i32 10, i32 5, i32 1, i32 10, i32 3, i32 13>
   ret <8 x i64> %s
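Here both sources are live: lanes whose shuffle index is below 8 read %x and lanes whose index is 8 or above read %y. The constants in the checks line up with that split: 164 = 0b10100100 sets bits 2, 5 and 7, exactly the lanes with indices 10, 10 and 13, and those lanes are filled by the second, masked vrgather whose index vector is built as zeros merged to 2 under 36 = 0b00100100 plus a final vslideup of 5 into lane 7 (10 - 8 = 2, 13 - 8 = 5). Below is a minimal standalone C++ sketch of that decomposition; it is illustrative only, not the LLVM code, and the names BlendMask, FirstIdx and SecondIdx are invented for the example.

  #include <cstdint>
  #include <cstdio>

  int main() {
    const int NumElts = 8;
    // Shuffle mask from the test above; values >= NumElts refer to %y.
    const int Mask[NumElts] = {1, 2, 10, 5, 1, 10, 3, 13};

    uint32_t BlendMask = 0;       // bit i set => lane i comes from %y
    int FirstIdx[NumElts] = {0};  // indices for the unmasked gather of %x
    int SecondIdx[NumElts] = {0}; // indices for the masked gather of %y
    for (int i = 0; i < NumElts; ++i) {
      if (Mask[i] >= NumElts) {
        BlendMask |= 1u << i;
        SecondIdx[i] = Mask[i] - NumElts;
      } else {
        FirstIdx[i] = Mask[i];
      }
    }
    std::printf("blend mask = %u\n", BlendMask); // prints 164 (0b10100100)
    for (int i = 0; i < NumElts; ++i)
      std::printf("lane %d: src=%s idx=%d\n", i,
                  (BlendMask >> i) & 1 ? "%y" : "%x",
                  (BlendMask >> i) & 1 ? SecondIdx[i] : FirstIdx[i]);
    return 0;
  }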
@@ -522,86 +257,52 @@ define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) {
 define <8 x i64> @vrgather_shuffle_xv_v8i64(<8 x i64> %x) {
 ; RV32-LABEL: vrgather_shuffle_xv_v8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    addi a0, zero, -1
-; RV32-NEXT:    sw a0, 60(sp)
-; RV32-NEXT:    sw a0, 56(sp)
-; RV32-NEXT:    sw a0, 28(sp)
-; RV32-NEXT:    sw a0, 24(sp)
-; RV32-NEXT:    sw a0, 20(sp)
-; RV32-NEXT:    sw a0, 16(sp)
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    vsetvli zero, zero, e32,m4,ta,mu
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    sw a0, 32(sp)
-; RV32-NEXT:    sw a0, 0(sp)
-; RV32-NEXT:    vsetivli a0, 1, e32,m4,ta,mu
-; RV32-NEXT:    vslidedown.vi v28, v8, 13
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 52(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 12
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 48(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 9
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 44(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 8
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 40(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 1
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 36(sp)
-; RV32-NEXT:    sw a0, 4(sp)
+; RV32-NEXT:    addi a0, zero, 6
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v25, a0
+; RV32-NEXT:    addi a0, zero, 4
+; RV32-NEXT:    vmv.s.x v26, a0
+; RV32-NEXT:    vmv.v.i v27, 0
+; RV32-NEXT:    vsetivli a0, 6, e16,m1,tu,mu
+; RV32-NEXT:    vslideup.vi v27, v26, 5
+; RV32-NEXT:    vsetivli a0, 7, e16,m1,tu,mu
+; RV32-NEXT:    vslideup.vi v27, v25, 6
+; RV32-NEXT:    addi a0, zero, 113
+; RV32-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    lui a0, %hi(.LCPI12_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI12_0)
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT:    vle16.v v25, (a0)
 ; RV32-NEXT:    vsetivli a0, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    vmv.v.i v12, -1
+; RV32-NEXT:    vsetivli a0, 8, e64,m4,ta,mu
+; RV32-NEXT:    vrgatherei16.vv v28, v12, v25
+; RV32-NEXT:    vsetivli a0, 8, e64,m4,tu,mu
+; RV32-NEXT:    vrgatherei16.vv v28, v8, v27, v0.t
+; RV32-NEXT:    vmv4r.v v8, v28
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vrgather_shuffle_xv_v8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    addi a0, zero, -1
-; RV64-NEXT:    sd a0, 56(sp)
-; RV64-NEXT:    sd a0, 24(sp)
-; RV64-NEXT:    sd a0, 16(sp)
-; RV64-NEXT:    sd a0, 8(sp)
-; RV64-NEXT:    vsetvli zero, zero, e64,m4,ta,mu
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    sd a0, 32(sp)
-; RV64-NEXT:    sd a0, 0(sp)
-; RV64-NEXT:    vsetivli a0, 1, e64,m4,ta,mu
-; RV64-NEXT:    vslidedown.vi v28, v8, 6
-; RV64-NEXT:    vmv.x.s a0, v28
-; RV64-NEXT:    sd a0, 48(sp)
-; RV64-NEXT:    vslidedown.vi v28, v8, 4
-; RV64-NEXT:    vmv.x.s a0, v28
-; RV64-NEXT:    sd a0, 40(sp)
+; RV64-NEXT:    addi a0, zero, 6
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vmv.s.x v28, a0
+; RV64-NEXT:    addi a0, zero, 4
+; RV64-NEXT:    vmv.s.x v12, a0
+; RV64-NEXT:    vmv.v.i v16, 0
+; RV64-NEXT:    vsetivli a0, 6, e64,m4,tu,mu
+; RV64-NEXT:    vslideup.vi v16, v12, 5
+; RV64-NEXT:    vsetivli a0, 7, e64,m4,tu,mu
+; RV64-NEXT:    vslideup.vi v16, v28, 6
+; RV64-NEXT:    addi a0, zero, 113
+; RV64-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    vsetivli a0, 8, e64,m4,ta,mu
-; RV64-NEXT:    vle64.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    vmv.v.i v28, -1
+; RV64-NEXT:    vsetivli a0, 8, e64,m4,tu,mu
+; RV64-NEXT:    vrgather.vv v28, v8, v16, v0.t
+; RV64-NEXT:    vmv4r.v v8, v28
 ; RV64-NEXT:    ret
   %s = shufflevector <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> %x, <8 x i32> <i32 8, i32 3, i32 6, i32 5, i32 8, i32 12, i32 14, i32 3>
   ret <8 x i64> %s
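The first shuffle operand here is a constant all-ones splat, so the result starts from a vector of -1 and only the lanes that read %x need the masked gather: 113 = 0b01110001 marks lanes 0, 4, 5 and 6, matching the indices 8, 8, 12 and 14, and the index vector built with vmv.v.i and vslideup appears to hold the adjusted values 0, 0, 4 and 6 in those lanes. Under the tu,mu policy the lanes not selected by v0 keep the splatted -1.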
@@ -610,98 +311,39 @@ define <8 x i64> @vrgather_shuffle_xv_v8i64(<8 x i64> %x) {
 define <8 x i64> @vrgather_shuffle_vx_v8i64(<8 x i64> %x) {
 ; RV32-LABEL: vrgather_shuffle_vx_v8i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    sw zero, 60(sp)
-; RV32-NEXT:    addi a0, zero, 5
-; RV32-NEXT:    sw a0, 56(sp)
-; RV32-NEXT:    sw zero, 28(sp)
-; RV32-NEXT:    sw a0, 24(sp)
-; RV32-NEXT:    sw zero, 20(sp)
-; RV32-NEXT:    sw a0, 16(sp)
-; RV32-NEXT:    vsetvli zero, zero, e32,m4,ta,mu
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    sw a0, 0(sp)
-; RV32-NEXT:    vsetivli a0, 1, e32,m4,ta,mu
-; RV32-NEXT:    vslidedown.vi v28, v8, 15
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 52(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 14
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 48(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 3
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 44(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 2
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 40(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 9
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 36(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 8
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 32(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 7
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 6
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    vslidedown.vi v28, v8, 1
-; RV32-NEXT:    vmv.x.s a0, v28
-; RV32-NEXT:    sw a0, 4(sp)
-; RV32-NEXT:    vsetivli a0, 16, e32,m4,ta,mu
-; RV32-NEXT:    vle32.v v8, (sp)
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    addi a0, zero, 140
+; RV32-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    lui a0, %hi(.LCPI13_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI13_0)
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT:    vle16.v v25, (a0)
+; RV32-NEXT:    vsetivli a0, 8, e64,m4,ta,mu
+; RV32-NEXT:    vrgatherei16.vv v28, v8, v25
+; RV32-NEXT:    lui a0, %hi(.LCPI13_1)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI13_1)
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT:    vle16.v v25, (a0)
+; RV32-NEXT:    vsetivli a0, 8, e64,m4,ta,mu
+; RV32-NEXT:    vmv.v.i v8, 5
+; RV32-NEXT:    vsetivli a0, 8, e64,m4,tu,mu
+; RV32-NEXT:    vrgatherei16.vv v28, v8, v25, v0.t
+; RV32-NEXT:    vmv4r.v v8, v28
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vrgather_shuffle_vx_v8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    addi a0, zero, 5
-; RV64-NEXT:    sd a0, 56(sp)
-; RV64-NEXT:    sd a0, 24(sp)
-; RV64-NEXT:    sd a0, 16(sp)
-; RV64-NEXT:    vsetvli zero, zero, e64,m4,ta,mu
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    sd a0, 0(sp)
-; RV64-NEXT:    vsetivli a0, 1, e64,m4,ta,mu
-; RV64-NEXT:    vslidedown.vi v28, v8, 7
-; RV64-NEXT:    vmv.x.s a0, v28
-; RV64-NEXT:    sd a0, 48(sp)
-; RV64-NEXT:    vslidedown.vi v28, v8, 1
-; RV64-NEXT:    vmv.x.s a0, v28
-; RV64-NEXT:    sd a0, 40(sp)
-; RV64-NEXT:    vslidedown.vi v28, v8, 4
-; RV64-NEXT:    vmv.x.s a0, v28
-; RV64-NEXT:    sd a0, 32(sp)
-; RV64-NEXT:    vslidedown.vi v28, v8, 3
-; RV64-NEXT:    vmv.x.s a0, v28
-; RV64-NEXT:    sd a0, 8(sp)
-; RV64-NEXT:    vsetivli a0, 8, e64,m4,ta,mu
-; RV64-NEXT:    vle64.v v8, (sp)
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    addi a0, zero, 115
+; RV64-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
+; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    lui a0, %hi(.LCPI13_0)
+; RV64-NEXT:    addi a0, a0, %lo(.LCPI13_0)
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vle64.v v12, (a0)
+; RV64-NEXT:    vmv.v.i v28, 5
+; RV64-NEXT:    vsetivli a0, 8, e64,m4,tu,mu
+; RV64-NEXT:    vrgather.vv v28, v8, v12, v0.t
+; RV64-NEXT:    vmv4r.v v8, v28
 ; RV64-NEXT:    ret
   %s = shufflevector <8 x i64> %x, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i32> <i32 0, i32 3, i32 10, i32 9, i32 4, i32 1, i32 7, i32 14>
   ret <8 x i64> %s
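Here the second operand is a constant splat of 5, so no real gather of it is required. The RV64 checks splat 5 into the destination and perform a single masked vrgather from %x under 115 = 0b01110011, i.e. lanes 0, 1, 4, 5 and 6, which are exactly the lanes with indices below 8 (0, 3, 4, 1 and 7); the remaining lanes simply keep the 5. The RV32 checks take the opposite order: an unmasked vrgatherei16 from %x followed by a masked vrgatherei16 from the splat-of-5 vector under 140 = 0b10001100, lanes 2, 3 and 7, the lanes whose indices are 10, 9 and 14.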


        

