[llvm] [RISCV] Lower non-power-of-2 vector to nearest power-of-2 vector leng… (PR #106092)

Kito Cheng via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 26 08:30:00 PDT 2024


https://github.com/kito-cheng created https://github.com/llvm/llvm-project/pull/106092

…th with VP intrinsic

This patch is still at an early stage, but I would like to send it out to demonstrate the feasibility of this approach. It is mostly nullified by #104689, but it can see some improvement after more patterns are added, which I will do later.

The idea of this patch is to lower non-power-of-2 vectors to the nearest power-of-2 vector length with VP intrinsics, and to insert vector.insert/vector.extract operations to convert from/to the original vector type.

Example:

```
define void @vls3i8(ptr align 8 %array) {
entry:
  %1 = load <3 x i8>, ptr %array, align 1
  %2 = add <3 x i8> %1, %1
  store <3 x i8> %2, ptr %array, align 1
  ret void
}
```

```
define void @vls3i8(ptr align 8 %array) #0 {
entry:
  %0 = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr %array, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 3)
  %1 = call <3 x i8> @llvm.vector.extract.v3i8.nxv4i8(<vscale x 4 x i8> %0, i64 0)
  %2 = call <vscale x 4 x i8> @llvm.vector.insert.nxv4i8.v3i8(<vscale x 4 x i8> poison, <3 x i8> %1, i64 0)
  %3 = call <vscale x 4 x i8> @llvm.vector.insert.nxv4i8.v3i8(<vscale x 4 x i8> poison, <3 x i8> %1, i64 0)
  %4 = call <vscale x 4 x i8> @llvm.vp.add.nxv4i8(<vscale x 4 x i8> %2, <vscale x 4 x i8> %3, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 3)
  %5 = call <3 x i8> @llvm.vector.extract.v3i8.nxv4i8(<vscale x 4 x i8> %4, i64 0)
  %6 = call <vscale x 4 x i8> @llvm.vector.insert.nxv4i8.v3i8(<vscale x 4 x i8> poison, <3 x i8> %5, i64 0)
  call void @llvm.vp.store.nxv4i8.p0(<vscale x 4 x i8> %6, ptr %array, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 3)
  ret void
}

```
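For reference, the container type above follows the rounding done in the pass's getContainerForFixedLengthVector: round the fixed element count up to the next power of 2, then divide by the function's minimum vscale. Below is a minimal standalone sketch of that computation; the helper names are mine and the patch's extra handling for i1 elements is omitted.

```
#include <algorithm>
#include <cstdint>

// Next power of two strictly greater than Count, mirroring llvm::NextPowerOf2
// for the small element counts involved here. The pass only reaches this for
// non-power-of-2 counts, so the "strictly greater" behavior never matters.
uint64_t nextPowerOf2(uint64_t Count) {
  uint64_t P = 1;
  while (P <= Count)
    P <<= 1;
  return P;
}

// Minimum element count of the scalable container for a fixed-length vector
// of FixedNumElts elements. With vscale >= MinVScale, <vscale x N x elt>
// holds at least N * MinVScale elements, so this N covers the fixed vector.
uint64_t containerMinNumElts(uint64_t FixedNumElts, uint64_t MinVScale) {
  return std::max<uint64_t>(nextPowerOf2(FixedNumElts) / MinVScale, 1);
}
```

With a minimum vscale of 1, containerMinNumElts(3, 1) is 4, which is the <vscale x 4 x i8> container used for <3 x i8> in the example above.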

From a7a504426c28ab95b700f34e6bf5c381574bac25 Mon Sep 17 00:00:00 2001
From: Kito Cheng <kito.cheng at sifive.com>
Date: Mon, 5 Aug 2024 18:13:37 +0800
Subject: [PATCH] [RISCV] Lower non-power-of-2 vector to nearest power-of-2
 vector length with VP intrinsic

This patch is still at an early stage, but I would like to send it out
to demonstrate the feasibility of this approach. It is mostly nullified
by #104689, but it can see some improvement after more patterns are
added, which I will do later.

The idea of this patch is to lower non-power-of-2 vectors to the
nearest power-of-2 vector length with VP intrinsics, and to insert
vector.insert/vector.extract operations to convert from/to the
original vector type.

Example:

```
define void @vls3i8(ptr align 8 %array) {
entry:
  %1 = load <3 x i8>, ptr %array, align 1
  %2 = add <3 x i8> %1, %1
  store <3 x i8> %2, ptr %array, align 1
  ret void
}
```

```
define void @vls3i8(ptr align 8 %array) #0 {
entry:
  %0 = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr %array, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 3)
  %1 = call <3 x i8> @llvm.vector.extract.v3i8.nxv4i8(<vscale x 4 x i8> %0, i64 0)
  %2 = call <vscale x 4 x i8> @llvm.vector.insert.nxv4i8.v3i8(<vscale x 4 x i8> poison, <3 x i8> %1, i64 0)
  %3 = call <vscale x 4 x i8> @llvm.vector.insert.nxv4i8.v3i8(<vscale x 4 x i8> poison, <3 x i8> %1, i64 0)
  %4 = call <vscale x 4 x i8> @llvm.vp.add.nxv4i8(<vscale x 4 x i8> %2, <vscale x 4 x i8> %3, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 3)
  %5 = call <3 x i8> @llvm.vector.extract.v3i8.nxv4i8(<vscale x 4 x i8> %4, i64 0)
  %6 = call <vscale x 4 x i8> @llvm.vector.insert.nxv4i8.v3i8(<vscale x 4 x i8> poison, <3 x i8> %5, i64 0)
  call void @llvm.vp.store.nxv4i8.p0(<vscale x 4 x i8> %6, ptr %array, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 3)
  ret void
}

```
---
 .../SelectionDAG/LegalizeVectorTypes.cpp      |   12 +
 llvm/lib/Target/RISCV/CMakeLists.txt          |    1 +
 llvm/lib/Target/RISCV/RISCV.h                 |    3 +
 .../RISCV/RISCVLegalizeNonPowerOf2Vector.cpp  |  199 +
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  |    2 +
 .../CodeGen/RISCV/rvv/fixed-vectors-abs.ll    |   18 +
 .../RISCV/rvv/fixed-vectors-extract.ll        |   56 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-insert.ll |   53 +-
 .../rvv/fixed-vectors-interleaved-access.ll   | 3818 +++++++++++++----
 .../CodeGen/RISCV/rvv/fixed-vectors-load.ll   |  253 +-
 10 files changed, 3591 insertions(+), 824 deletions(-)
 create mode 100644 llvm/lib/Target/RISCV/RISCVLegalizeNonPowerOf2Vector.cpp

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 475d5806467d98..ca6b600ece6298 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TypeSize.h"
 #include "llvm/Support/raw_ostream.h"
@@ -5686,6 +5687,17 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
   unsigned WidenNumElts = WidenVT.getVectorMinNumElements();
   unsigned InNumElts = InVT.getVectorMinNumElements();
   unsigned VTNumElts = VT.getVectorMinNumElements();
+
+  if (InVT.isScalableVector()) {
+    // For scalable input vectors, InNumElts is only the minimum (vscale = 1)
+    // element count. Scale it by the function's known minimum vscale so the
+    // bound check below reflects the guaranteed runtime element count.
+    unsigned MinVScale =
+        getVScaleRange(&DAG.getMachineFunction().getFunction(), 64)
+            .getUnsignedMin()
+            .getZExtValue();
+    InNumElts = InNumElts * MinVScale;
+  }
   assert(IdxVal % VTNumElts == 0 &&
          "Expected Idx to be a multiple of subvector minimum vector length");
   if (IdxVal % WidenNumElts == 0 && IdxVal + WidenNumElts < InNumElts)
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index cbb4c2cedfb97e..78f3523699f309 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -57,6 +57,7 @@ add_llvm_target(RISCVCodeGen
   RISCVTargetObjectFile.cpp
   RISCVTargetTransformInfo.cpp
   RISCVVectorPeephole.cpp
+  RISCVLegalizeNonPowerOf2Vector.cpp
   GISel/RISCVCallLowering.cpp
   GISel/RISCVInstructionSelector.cpp
   GISel/RISCVLegalizerInfo.cpp
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index 5a94ada8f8dd46..4204ed9c1004ad 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -99,6 +99,9 @@ void initializeRISCVO0PreLegalizerCombinerPass(PassRegistry &);
 
 FunctionPass *createRISCVPreLegalizerCombiner();
 void initializeRISCVPreLegalizerCombinerPass(PassRegistry &);
+
+FunctionPass *createRISCVLegalizeNonPowerOf2Vector();
+void initializeRISCVLegalizeNonPowerOf2VectorPass(PassRegistry &);
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/RISCV/RISCVLegalizeNonPowerOf2Vector.cpp b/llvm/lib/Target/RISCV/RISCVLegalizeNonPowerOf2Vector.cpp
new file mode 100644
index 00000000000000..98bca6b96fa339
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVLegalizeNonPowerOf2Vector.cpp
@@ -0,0 +1,199 @@
+#include "RISCV.h"
+#include "RISCVTargetMachine.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/VectorBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-legalize-non-power-of-2-vector"
+#define PASS_NAME "Legalize non-power-of-2 vector type"
+
+namespace {
+class RISCVLegalizeNonPowerOf2Vector : public FunctionPass {
+  const RISCVSubtarget *ST;
+  unsigned MinVScale;
+
+public:
+  static char ID;
+  RISCVLegalizeNonPowerOf2Vector() : FunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<TargetPassConfig>();
+  }
+
+  bool runOnFunction(Function &F) override;
+  StringRef getPassName() const override { return PASS_NAME; }
+
+private:
+  FixedVectorType *extractUsedFixedVectorType(const Instruction &I) const;
+
+  bool isTargetType(FixedVectorType *VecTy) const;
+
+  ScalableVectorType *
+  getContainerForFixedLengthVector(FixedVectorType *FixedVecTy);
+};
+} // namespace
+
+// Return the fixed-length vector type this instruction produces or stores,
+// or nullptr if it is not one the pass needs to handle.
+FixedVectorType *RISCVLegalizeNonPowerOf2Vector::extractUsedFixedVectorType(
+    const Instruction &I) const {
+  if (auto *VecTy = dyn_cast<FixedVectorType>(I.getType()))
+    return VecTy;
+
+  if (auto *Store = dyn_cast<StoreInst>(&I))
+    return dyn_cast<FixedVectorType>(Store->getValueOperand()->getType());
+
+  return nullptr;
+}
+
+ScalableVectorType *
+RISCVLegalizeNonPowerOf2Vector::getContainerForFixedLengthVector(
+    FixedVectorType *FixedVecTy) {
+  // TODO: Consider vscale_range to pick a better/smaller type.
+  // Round up to a power of 2, then divide by the minimum vscale.
+  uint64_t NumElts = std::max<uint64_t>(
+      NextPowerOf2(FixedVecTy->getNumElements()) / MinVScale, 1);
+
+  Type *ElementType = FixedVecTy->getElementType();
+
+  if (ElementType->isIntegerTy(1))
+    NumElts = std::max<uint64_t>(NumElts, 8);
+
+  return ScalableVectorType::get(ElementType, NumElts);
+}
+
+bool RISCVLegalizeNonPowerOf2Vector::isTargetType(
+    FixedVectorType *VecTy) const {
+  if (isPowerOf2_32(VecTy->getNumElements()))
+    return false;
+
+  Type *EltTy = VecTy->getElementType();
+
+  if (EltTy->isIntegerTy(1))
+    return false;
+
+  if (EltTy->isIntegerTy(64))
+    return ST->hasVInstructionsI64();
+  if (EltTy->isFloatTy())
+    return ST->hasVInstructionsF32();
+  if (EltTy->isDoubleTy())
+    return ST->hasVInstructionsF64();
+  if (EltTy->isHalfTy())
+    return ST->hasVInstructionsF16Minimal();
+  if (EltTy->isBFloatTy())
+    return ST->hasVInstructionsBF16Minimal();
+
+  return (EltTy->isIntegerTy(8) || EltTy->isIntegerTy(16) ||
+          EltTy->isIntegerTy(32));
+}
+
+bool RISCVLegalizeNonPowerOf2Vector::runOnFunction(Function &F) {
+
+  if (skipFunction(F))
+    return false;
+
+  auto &TPC = getAnalysis<TargetPassConfig>();
+  auto &TM = TPC.getTM<RISCVTargetMachine>();
+  ST = &TM.getSubtarget<RISCVSubtarget>(F);
+
+  if (!ST->hasVInstructions())
+    return false;
+
+  // Use the function's vscale_range attribute if present; otherwise derive
+  // the minimum vscale from the subtarget and record it on the function.
+  auto Attr = F.getFnAttribute(Attribute::VScaleRange);
+  if (Attr.isValid()) {
+    MinVScale = Attr.getVScaleRangeMin();
+  } else {
+    unsigned MinVLen = ST->getRealMinVLen();
+    if (MinVLen < RISCV::RVVBitsPerBlock)
+      return false;
+    MinVScale = MinVLen / RISCV::RVVBitsPerBlock;
+    AttrBuilder AB(F.getContext());
+    AB.addVScaleRangeAttr(MinVScale, std::optional<unsigned>());
+    F.addFnAttr(AB.getAttribute(Attribute::VScaleRange));
+  }
+
+  bool Modified = false;
+  std::vector<Instruction *> ToBeRemoved;
+  for (auto &BB : F) {
+    for (auto &I : make_range(BB.rbegin(), BB.rend())) {
+      if (auto *VecTy = extractUsedFixedVectorType(I)) {
+        if (!isTargetType(VecTy)) {
+          continue;
+        }
+
+        Value *I64Zero = ConstantInt::get(Type::getInt64Ty(F.getContext()), 0);
+
+        // Replace fixed length vector with scalable vector
+        IRBuilder<> Builder(&I);
+        VectorBuilder VecBuilder(Builder);
+        VecBuilder.setStaticVL(VecTy->getNumElements());
+        VectorType *NewVecTy = getContainerForFixedLengthVector(VecTy);
+        VecBuilder.setMask(Builder.CreateVectorSplat(
+            NewVecTy->getElementCount(), Builder.getTrue()));
+
+        if (auto *BinOp = dyn_cast<BinaryOperator>(&I)) {
+          Value *Op1 = BinOp->getOperand(0);
+          Value *Op2 = BinOp->getOperand(1);
+          Value *NewOp1 = Builder.CreateInsertVector(
+              NewVecTy, PoisonValue::get(NewVecTy), Op1, I64Zero);
+          Value *NewOp2 = Builder.CreateInsertVector(
+              NewVecTy, PoisonValue::get(NewVecTy), Op2, I64Zero);
+          Value *NewBinOp = VecBuilder.createVectorInstruction(
+              BinOp->getOpcode(), NewVecTy, {NewOp1, NewOp2});
+          Value *FinalResult =
+              Builder.CreateExtractVector(VecTy, NewBinOp, I64Zero);
+          BinOp->replaceAllUsesWith(FinalResult);
+          ToBeRemoved.push_back(BinOp);
+          Modified = true;
+        } else if (auto *StoreOp = dyn_cast<StoreInst>(&I)) {
+          Value *Val = StoreOp->getOperand(0);
+          Value *Addr = StoreOp->getOperand(1);
+          Value *NewVal = Builder.CreateInsertVector(
+              NewVecTy, PoisonValue::get(NewVecTy), Val, I64Zero);
+          Value *NewStoreOp = VecBuilder.createVectorInstruction(
+              StoreOp->getOpcode(), NewVecTy, {NewVal, Addr});
+          StoreOp->replaceAllUsesWith(NewStoreOp);
+          ToBeRemoved.push_back(StoreOp);
+        } else if (auto *LoadOp = dyn_cast<LoadInst>(&I)) {
+          Value *Addr = LoadOp->getOperand(0);
+          Value *NewLoadOp = VecBuilder.createVectorInstruction(
+              LoadOp->getOpcode(), NewVecTy, {Addr});
+          Value *FinalResult =
+              Builder.CreateExtractVector(VecTy, NewLoadOp, I64Zero);
+          LoadOp->replaceAllUsesWith(FinalResult);
+          ToBeRemoved.push_back(LoadOp);
+        }
+      }
+    }
+  }
+  for (Instruction *I : ToBeRemoved)
+    I->eraseFromParent();
+  return Modified || !ToBeRemoved.empty();
+}
+
+char RISCVLegalizeNonPowerOf2Vector::ID = 0;
+
+INITIALIZE_PASS_BEGIN(RISCVLegalizeNonPowerOf2Vector, DEBUG_TYPE, PASS_NAME,
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(RISCVLegalizeNonPowerOf2Vector, DEBUG_TYPE, PASS_NAME,
+                    false, false)
+
+FunctionPass *llvm::createRISCVLegalizeNonPowerOf2Vector() {
+  return new RISCVLegalizeNonPowerOf2Vector();
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 794df2212dfa53..1616269955f9f5 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -128,6 +128,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   initializeRISCVDAGToDAGISelLegacyPass(*PR);
   initializeRISCVMoveMergePass(*PR);
   initializeRISCVPushPopOptPass(*PR);
+  initializeRISCVLegalizeNonPowerOf2VectorPass(*PR);
 }
 
 static StringRef computeDataLayout(const Triple &TT,
@@ -452,6 +453,7 @@ bool RISCVPassConfig::addPreISel() {
 void RISCVPassConfig::addCodeGenPrepare() {
   if (getOptLevel() != CodeGenOptLevel::None)
     addPass(createTypePromotionLegacyPass());
+  addPass(createRISCVLegalizeNonPowerOf2Vector());
   TargetPassConfig::addCodeGenPrepare();
 }
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll
index ac7d3d9109e39c..72d85758f18f9a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll
@@ -39,7 +39,25 @@ define void @abs_v6i16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vslidedown.vi v9, v8, 1
+; CHECK-NEXT:    vmv.x.s a1, v9
+; CHECK-NEXT:    vmv.x.s a2, v8
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a2
+; CHECK-NEXT:    vslide1down.vx v9, v9, a1
+; CHECK-NEXT:    vslidedown.vi v10, v8, 2
+; CHECK-NEXT:    vmv.x.s a1, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a1
+; CHECK-NEXT:    vslidedown.vi v10, v8, 3
+; CHECK-NEXT:    vmv.x.s a1, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a1
+; CHECK-NEXT:    vslidedown.vi v10, v8, 4
+; CHECK-NEXT:    vmv.x.s a1, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a1
+; CHECK-NEXT:    vslidedown.vi v8, v8, 5
+; CHECK-NEXT:    vmv.x.s a1, v8
+; CHECK-NEXT:    vslide1down.vx v8, v9, a1
+; CHECK-NEXT:    vslidedown.vi v8, v8, 2
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vmax.vv v8, v8, v9
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
index d309da6df7dc70..411e20f9980a3c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
@@ -220,7 +220,18 @@ define i64 @extractelt_v3i64(ptr %x) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 3, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 2
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vmv.s.x v10, a0
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
+; RV32-NEXT:    vslideup.vi v8, v10, 4
+; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32-NEXT:    vmv.x.s a0, v10
 ; RV32-NEXT:    vslidedown.vi v8, v8, 5
@@ -567,10 +578,37 @@ define i64 @extractelt_v3i64_idx(ptr %x, i32 zeroext %idx) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 3, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v8, v8
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vx v10, v8, a0
+; RV32-NEXT:    vmv.x.s a2, v10
+; RV32-NEXT:    vmv.x.s a3, v8
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a3
+; RV32-NEXT:    vslide1down.vx v10, v10, a2
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v12, v8, 1
+; RV32-NEXT:    vmv.x.s a2, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vslide1down.vx v10, v10, a2
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vx v12, v12, a0
+; RV32-NEXT:    vmv.x.s a2, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vslide1down.vx v10, v10, a2
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v8
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vslide1down.vx v10, v10, a2
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vslide1down.vx v8, v10, a0
+; RV32-NEXT:    vslidedown.vi v8, v8, 2
 ; RV32-NEXT:    add a1, a1, a1
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v10, v8, a1
 ; RV32-NEXT:    vmv.x.s a0, v10
 ; RV32-NEXT:    addi a1, a1, 1
@@ -582,8 +620,18 @@ define i64 @extractelt_v3i64_idx(ptr %x, i32 zeroext %idx) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 3, e64, m2, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-NEXT:    vadd.vv v8, v8, v8
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v10, v8, 1
+; RV64-NEXT:    vmv.x.s a0, v10
+; RV64-NEXT:    vmv.x.s a2, v8
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a2
+; RV64-NEXT:    vslide1down.vx v10, v10, a0
+; RV64-NEXT:    vslidedown.vi v8, v8, 2
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-NEXT:    vslidedown.vi v8, v8, 1
 ; RV64-NEXT:    vslidedown.vx v8, v8, a1
 ; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
index 776a1e9bab6b26..622e5610da60cd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
@@ -254,13 +254,60 @@ define <3 x i64> @insertelt_v3i64(<3 x i64> %a, i64 %y) {
 define void @insertelt_v3i64_store(ptr %x, i64 %y) {
 ; RV32-LABEL: insertelt_v3i64_store:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    sw a2, 20(a0)
-; RV32-NEXT:    sw a1, 16(a0)
+; RV32-NEXT:    vsetivli zero, 3, e64, m2, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a3, 32
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vx v10, v8, a3
+; RV32-NEXT:    vmv.x.s a4, v10
+; RV32-NEXT:    vmv.x.s a5, v8
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a5
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v12, v8, 1
+; RV32-NEXT:    vmv.x.s a4, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vx v12, v12, a3
+; RV32-NEXT:    vmv.x.s a4, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 2
+; RV32-NEXT:    vmv.x.s a4, v8
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vx v8, v8, a3
+; RV32-NEXT:    vmv.x.s a3, v8
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vslide1down.vx v8, v10, a3
+; RV32-NEXT:    vslidedown.vi v8, v8, 2
+; RV32-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
+; RV32-NEXT:    vslide1down.vx v10, v8, a1
+; RV32-NEXT:    vslide1down.vx v10, v10, a2
+; RV32-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
+; RV32-NEXT:    vslideup.vi v8, v10, 2
+; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: insertelt_v3i64_store:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    sd a1, 16(a0)
+; RV64-NEXT:    vsetivli zero, 3, e64, m2, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v9, v8, 1
+; RV64-NEXT:    vmv.x.s a2, v9
+; RV64-NEXT:    vmv.x.s a3, v8
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a3
+; RV64-NEXT:    vslide1down.vx v8, v8, a2
+; RV64-NEXT:    vslide1down.vx v8, v8, a1
+; RV64-NEXT:    vslidedown.vi v8, v8, 1
+; RV64-NEXT:    vsetivli zero, 3, e64, m2, ta, ma
+; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    ret
   %a = load <3 x i64>, ptr %x, align 8
   %b = insertelement <3 x i64> %a, i64 %y, i32 2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index bc3e135a588a6f..5713046f2354a1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -8,51 +8,33 @@
 
 ; FIXME: This should be widened to a vlseg2 of <4 x i32> with VL set to 3
 define {<3 x i32>, <3 x i32>} @load_factor2_v3(ptr %ptr) {
-; RV32-LABEL: load_factor2_v3:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
-; RV32-NEXT:    vle32.v v10, (a0)
-; RV32-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v9, v10, 2
-; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT:    vwaddu.vv v8, v10, v9
-; RV32-NEXT:    li a0, -1
-; RV32-NEXT:    vwmaccu.vx v8, a0, v9
-; RV32-NEXT:    vmv.v.i v0, 4
-; RV32-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v10, 4
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; RV32-NEXT:    vrgather.vi v8, v12, 0, v0.t
-; RV32-NEXT:    vid.v v9
-; RV32-NEXT:    vadd.vv v9, v9, v9
-; RV32-NEXT:    vadd.vi v11, v9, 1
-; RV32-NEXT:    vrgather.vv v9, v10, v11
-; RV32-NEXT:    vrgather.vi v9, v12, 1, v0.t
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: load_factor2_v3:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
-; RV64-NEXT:    vle32.v v10, (a0)
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vid.v v8
-; RV64-NEXT:    vadd.vv v8, v8, v8
-; RV64-NEXT:    vadd.vi v8, v8, 1
-; RV64-NEXT:    vrgather.vv v9, v10, v8
-; RV64-NEXT:    vmv.v.i v0, 4
-; RV64-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v10, 4
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; RV64-NEXT:    vrgather.vi v9, v12, 1, v0.t
-; RV64-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v11, v10, 2
-; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV64-NEXT:    vwaddu.vv v8, v10, v11
-; RV64-NEXT:    li a0, -1
-; RV64-NEXT:    vwmaccu.vx v8, a0, v11
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; RV64-NEXT:    vrgather.vi v8, v12, 0, v0.t
-; RV64-NEXT:    ret
+; CHECK-LABEL: load_factor2_v3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vslidedown.vi v10, v8, 5
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v10, v8, 3
+; CHECK-NEXT:    vmv.x.s a1, v10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 1
+; CHECK-NEXT:    vmv.x.s a2, v10
+; CHECK-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v10, v8, 4
+; CHECK-NEXT:    vmv.x.s a3, v10
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v9, v8, 2
+; CHECK-NEXT:    vmv.x.s a4, v9
+; CHECK-NEXT:    vmv.x.s a5, v8
+; CHECK-NEXT:    vmv.v.x v8, a5
+; CHECK-NEXT:    vslide1down.vx v8, v8, a4
+; CHECK-NEXT:    vslide1down.vx v8, v8, a3
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vmv.v.x v9, a2
+; CHECK-NEXT:    vslide1down.vx v9, v9, a1
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v9, v9, 1
+; CHECK-NEXT:    ret
   %interleaved.vec = load <6 x i32>, ptr %ptr
   %v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> <i32 0, i32 2, i32 4>
   %v1 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> <i32 1, i32 3, i32 5>
@@ -156,899 +138,3111 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @load_
 define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_factor6_too_big(ptr %ptr) {
 ; RV32-LABEL: load_factor6_too_big:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    addi sp, sp, -560
+; RV32-NEXT:    .cfi_def_cfa_offset 560
+; RV32-NEXT:    sw ra, 556(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 552(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 548(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 544(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 540(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 536(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s6, 532(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 528(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s8, 524(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s9, 520(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s10, 516(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s11, 512(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    .cfi_offset s2, -12
+; RV32-NEXT:    .cfi_offset s3, -16
+; RV32-NEXT:    .cfi_offset s4, -20
+; RV32-NEXT:    .cfi_offset s5, -24
+; RV32-NEXT:    .cfi_offset s6, -28
+; RV32-NEXT:    .cfi_offset s7, -32
+; RV32-NEXT:    .cfi_offset s8, -36
+; RV32-NEXT:    .cfi_offset s9, -40
+; RV32-NEXT:    .cfi_offset s10, -44
+; RV32-NEXT:    .cfi_offset s11, -48
+; RV32-NEXT:    addi s0, sp, 560
+; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 80
+; RV32-NEXT:    lui a3, 1
+; RV32-NEXT:    addi a3, a3, -1736
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 80 * vlenb
-; RV32-NEXT:    addi a3, a1, 256
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vle32.v v16, (a3)
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 6
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, a1, 128
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vslideup.vi v8, v16, 4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs4r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 12
-; RV32-NEXT:    vmv.s.x v0, a4
-; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v16, v16, 16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 56
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv1r.v v3, v0
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vslideup.vi v8, v16, 10, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 44
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs4r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, %hi(.LCPI6_0)
-; RV32-NEXT:    addi a4, a4, %lo(.LCPI6_0)
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vle16.v v8, (a4)
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs4r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, %hi(.LCPI6_1)
-; RV32-NEXT:    addi a4, a4, %lo(.LCPI6_1)
-; RV32-NEXT:    lui a5, 1
-; RV32-NEXT:    vle16.v v8, (a4)
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a6, 24
-; RV32-NEXT:    mul a4, a4, a6
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs4r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vle32.v v8, (a1)
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a4, 72
-; RV32-NEXT:    mul a1, a1, a4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vle32.v v24, (a3)
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 48
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, a5, -64
-; RV32-NEXT:    vmv.s.x v0, a1
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 36
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v16, v8, v4
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v16, v24, v8, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 44
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v8, v16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 44
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    andi sp, sp, -128
+; RV32-NEXT:    sw a0, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a4, a0, 1
+; RV32-NEXT:    li t0, 48
+; RV32-NEXT:    sub a2, t0, a4
+; RV32-NEXT:    sltiu a3, a2, 49
+; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a6, a3, a2
+; RV32-NEXT:    sub a5, a6, a0
+; RV32-NEXT:    sltu a7, a6, a5
+; RV32-NEXT:    addi a7, a7, -1
+; RV32-NEXT:    slli a3, a0, 5
+; RV32-NEXT:    slli a2, a0, 3
+; RV32-NEXT:    sub a3, a3, a2
+; RV32-NEXT:    bltu a4, t0, .LBB6_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:  .LBB6_2:
+; RV32-NEXT:    and a7, a7, a5
+; RV32-NEXT:    add a5, a1, a3
+; RV32-NEXT:    sub t0, a4, a0
+; RV32-NEXT:    sltu t1, a4, t0
+; RV32-NEXT:    addi t1, t1, -1
+; RV32-NEXT:    and t0, t1, t0
+; RV32-NEXT:    add t1, a1, a2
+; RV32-NEXT:    bltu a6, a0, .LBB6_4
+; RV32-NEXT:  # %bb.3:
+; RV32-NEXT:    mv a6, a0
+; RV32-NEXT:  .LBB6_4:
+; RV32-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
+; RV32-NEXT:    vle64.v v8, (a5)
+; RV32-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
+; RV32-NEXT:    vle64.v v16, (t1)
+; RV32-NEXT:    slli a5, a0, 4
+; RV32-NEXT:    add a7, a1, a5
+; RV32-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV32-NEXT:    vle64.v v24, (a7)
+; RV32-NEXT:    bltu a4, a0, .LBB6_6
+; RV32-NEXT:  # %bb.5:
+; RV32-NEXT:    mv a4, a0
+; RV32-NEXT:  .LBB6_6:
+; RV32-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
+; RV32-NEXT:    vle64.v v0, (a1)
+; RV32-NEXT:    li a1, 1304
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1272
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1240
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1208
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1176
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1144
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1112
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1080
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1048
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1016
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 984
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 952
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 920
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 888
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 856
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 824
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 792
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 760
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 728
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 696
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 664
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 632
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 600
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 568
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 536
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 504
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 472
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 440
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 408
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 376
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 344
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 312
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    vs8r.v v24, (a1)
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a1)
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 296
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 280
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 264
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 248
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 232
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 216
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 200
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 184
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 168
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 152
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 136
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 120
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 104
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 88
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 72
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -1800
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vslideup.vi v12, v8, 2
-; RV32-NEXT:    vmv1r.v v8, v3
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -1768
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs1r.v v3, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv1r.v v0, v3
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 56
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    li a0, 1944
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v12, v16, 8, v0.t
-; RV32-NEXT:    lui a1, %hi(.LCPI6_2)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_2)
-; RV32-NEXT:    lui a3, %hi(.LCPI6_3)
-; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_3)
-; RV32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT:    vle16.v v0, (a1)
-; RV32-NEXT:    vle16.v v4, (a3)
-; RV32-NEXT:    lui a1, %hi(.LCPI6_4)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_4)
-; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vle16.v v10, (a1)
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 72
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    li a0, 1912
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v24, v16, v0
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 36
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    li a0, 1560
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 48
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    li a0, 1528
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v24, v16, v4, v0.t
-; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v12, v24
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 36
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -1832
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -1864
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v12, v24, v10
-; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 56
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    li a0, 1880
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v12, v24, 6, v0.t
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a0, 1848
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a1, %hi(.LCPI6_5)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_5)
-; RV32-NEXT:    lui a3, %hi(.LCPI6_6)
-; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_6)
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vle16.v v12, (a1)
-; RV32-NEXT:    vle16.v v8, (a3)
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 12
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    li a0, 1496
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    li a1, 960
-; RV32-NEXT:    vmv.s.x v8, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 72
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    li a0, 1464
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v24, v0, v12
-; RV32-NEXT:    vmv1r.v v3, v8
-; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 12
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -1896
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v24, v16, v8, v0.t
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -1928
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a1, %hi(.LCPI6_7)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_7)
-; RV32-NEXT:    lui a3, %hi(.LCPI6_8)
-; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_8)
-; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vle16.v v8, (a1)
-; RV32-NEXT:    lui a1, %hi(.LCPI6_9)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_9)
-; RV32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT:    vle16.v v4, (a3)
-; RV32-NEXT:    vle16.v v12, (a1)
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    li a0, 1816
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    li a0, 1784
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v12, v24, v8
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    li a0, 1432
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 56
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    li a0, 1400
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vmv4r.v v24, v16
-; RV32-NEXT:    vslideup.vi v12, v16, 4, v0.t
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 12
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -1960
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 72
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -1992
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v4
-; RV32-NEXT:    vmv1r.v v0, v3
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 48
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    li a0, 1752
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    li a0, 1720
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v28, v0.t
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    li a0, 1368
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a1, %hi(.LCPI6_10)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_10)
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vle16.v v12, (a1)
-; RV32-NEXT:    lui a1, 15
-; RV32-NEXT:    vmv.s.x v3, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    li a0, 1336
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v8, v16, 6
-; RV32-NEXT:    vmv1r.v v0, v3
-; RV32-NEXT:    vrgatherei16.vv v8, v24, v12, v0.t
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -2024
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a1, %hi(.LCPI6_11)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_11)
-; RV32-NEXT:    lui a3, %hi(.LCPI6_12)
-; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_12)
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vle16.v v24, (a1)
-; RV32-NEXT:    vle16.v v4, (a3)
-; RV32-NEXT:    li a1, 1008
-; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    li a0, 2040
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 72
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    li a0, 1688
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v24
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 48
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    li a0, 1656
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v4, v0.t
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 188
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    mul a1, a1, a4
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    addi a1, a1, 512
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a1, %hi(.LCPI6_13)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_13)
-; RV32-NEXT:    lui a3, %hi(.LCPI6_14)
-; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_14)
-; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vle16.v v20, (a1)
-; RV32-NEXT:    lui a1, %hi(.LCPI6_15)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_15)
-; RV32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT:    vle16.v v24, (a3)
-; RV32-NEXT:    vle16.v v8, (a1)
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv1r.v v0, v3
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 56
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v16, v8, v20, v0.t
+; RV32-NEXT:    vsetivli zero, 1, e32, m4, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v0, 15
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 184
+; RV32-NEXT:    vsetivli zero, 1, e32, m4, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v0, 14
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 180
+; RV32-NEXT:    vsetivli zero, 1, e32, m4, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v0, 13
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 176
+; RV32-NEXT:    vsetivli zero, 1, e32, m4, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v0, 12
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 172
+; RV32-NEXT:    vsetivli zero, 1, e32, m4, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v0, 11
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 168
+; RV32-NEXT:    vsetivli zero, 1, e32, m4, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v0, 10
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 164
+; RV32-NEXT:    vsetivli zero, 1, e32, m4, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v0, 9
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 160
+; RV32-NEXT:    vsetivli zero, 1, e32, m4, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v0, 8
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 156
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v0, 7
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 152
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v0, 6
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 148
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v0, 5
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 144
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v0, 4
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 140
+; RV32-NEXT:    vslidedown.vi v8, v0, 3
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 136
+; RV32-NEXT:    vslidedown.vi v8, v0, 2
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 132
+; RV32-NEXT:    vslidedown.vi v8, v0, 1
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 128
+; RV32-NEXT:    vse32.v v0, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    li a0, 2008
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    mul a1, a1, a4
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    addi a1, a1, 512
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v20, v8
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 72
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v8, v0, v24
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v24, v4, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 12
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v24, v0
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    li a0, 1976
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    li a0, 1624
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vmv.v.v v28, v0
-; RV32-NEXT:    vmv.v.v v16, v8
-; RV32-NEXT:    addi a1, a0, 320
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vse32.v v16, (a1)
-; RV32-NEXT:    addi a1, a0, 256
-; RV32-NEXT:    vse32.v v28, (a1)
-; RV32-NEXT:    addi a1, a0, 192
-; RV32-NEXT:    vse32.v v24, (a1)
-; RV32-NEXT:    addi a1, a0, 128
-; RV32-NEXT:    vse32.v v20, (a1)
-; RV32-NEXT:    addi a1, a0, 64
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 36
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vse32.v v8, (a1)
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a0, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a0)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a0, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 44
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    li a0, 1592
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    vs8r.v v0, (a1)
+; RV32-NEXT:    add a5, a1, a5
+; RV32-NEXT:    vs8r.v v24, (a5)
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    add a3, a1, a3
+; RV32-NEXT:    vs8r.v v8, (a3)
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 80
+; RV32-NEXT:    li a1, 1304
 ; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: load_factor6_too_big:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 74
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    sub sp, sp, a2
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xca, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 74 * vlenb
-; RV64-NEXT:    addi a2, a1, 256
-; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vle64.v v16, (a2)
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 25
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a2, a1, 128
-; RV64-NEXT:    vle64.v v8, (a1)
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a3, a1, 6
-; RV64-NEXT:    add a1, a3, a1
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vrgather.vi v12, v16, 4
-; RV64-NEXT:    li a1, 128
-; RV64-NEXT:    vmv.s.x v8, a1
-; RV64-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; RV64-NEXT:    vslidedown.vi v16, v16, 8
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 49
-; RV64-NEXT:    mul a1, a1, a3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv1r.v v0, v8
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgather.vi v12, v16, 2, v0.t
-; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vid.v v10
-; RV64-NEXT:    li a1, 6
-; RV64-NEXT:    vmul.vx v2, v10, a1
-; RV64-NEXT:    li a1, 56
-; RV64-NEXT:    vle64.v v16, (a2)
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 57
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv.s.x v7, a1
-; RV64-NEXT:    vadd.vi v10, v2, -16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 6
-; RV64-NEXT:    add a1, a2, a1
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v16, v24, v2
-; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v16, v24, v10, v0.t
-; RV64-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v12, v16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 21
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 25
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgather.vi v12, v16, 5
-; RV64-NEXT:    vmv1r.v v0, v8
-; RV64-NEXT:    vmv1r.v v6, v8
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 49
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vi v12, v16, 3, v0.t
-; RV64-NEXT:    vmv.v.v v28, v12
-; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v24, v2, 1
-; RV64-NEXT:    vadd.vi v26, v2, -15
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 6
-; RV64-NEXT:    add a1, a2, a1
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v16, v8, v24
-; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v16, v8, v26, v0.t
-; RV64-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v28, v16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 4
-; RV64-NEXT:    add a1, a2, a1
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v28, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    lui a1, 16
-; RV64-NEXT:    addi a1, a1, 7
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.i v9, 6
-; RV64-NEXT:    vmv.v.x v10, a1
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 25
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v12, v16, v9
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 45
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vrgatherei16.vv v12, v16, v10
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 41
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv4r.v v8, v16
-; RV64-NEXT:    vrgather.vi v12, v16, 2
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 37
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vrgather.vi v12, v16, 3
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 5
-; RV64-NEXT:    add a1, a2, a1
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    li a1, 24
-; RV64-NEXT:    vmv.s.x v1, a1
-; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v24, v2, 2
-; RV64-NEXT:    vadd.vi v4, v2, -14
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 6
-; RV64-NEXT:    add a1, a2, a1
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v8, v16, v24
-; RV64-NEXT:    vmv1r.v v0, v1
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v24, v4, v0.t
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 25
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv1r.v v0, v6
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 49
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 45
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgather.vi v20, v16, 4, v0.t
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 764(a0)
+; RV32-NEXT:    sw a0, 380(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1272
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 760(a0)
+; RV32-NEXT:    sw a0, 376(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1240
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 756(a0)
+; RV32-NEXT:    sw a0, 372(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1208
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 752(a0)
+; RV32-NEXT:    sw a0, 368(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1176
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 748(a0)
+; RV32-NEXT:    sw a0, 364(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1144
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 744(a0)
+; RV32-NEXT:    sw a0, 360(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1112
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 740(a0)
+; RV32-NEXT:    sw a0, 356(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1080
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 736(a0)
+; RV32-NEXT:    sw a0, 352(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1048
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 732(a0)
+; RV32-NEXT:    sw a0, 348(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 1016
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 728(a0)
+; RV32-NEXT:    sw a0, 344(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 984
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 724(a0)
+; RV32-NEXT:    sw a0, 340(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 952
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 720(a0)
+; RV32-NEXT:    sw a0, 336(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 920
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 716(a0)
+; RV32-NEXT:    sw a0, 332(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 888
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 712(a0)
+; RV32-NEXT:    sw a0, 328(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 856
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 708(a0)
+; RV32-NEXT:    sw a0, 324(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 824
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 704(a0)
+; RV32-NEXT:    sw a0, 320(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 792
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 700(a0)
+; RV32-NEXT:    sw a0, 316(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 760
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 696(a0)
+; RV32-NEXT:    sw a0, 312(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 728
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 692(a0)
+; RV32-NEXT:    sw a0, 308(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 696
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 688(a0)
+; RV32-NEXT:    sw a0, 304(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 664
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 684(a0)
+; RV32-NEXT:    sw a0, 300(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 632
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 680(a0)
+; RV32-NEXT:    sw a0, 296(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 600
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 676(a0)
+; RV32-NEXT:    sw a0, 292(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 568
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 672(a0)
+; RV32-NEXT:    sw a0, 288(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 536
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 668(a0)
+; RV32-NEXT:    sw a0, 284(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 504
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 664(a0)
+; RV32-NEXT:    sw a0, 280(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 472
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 660(a0)
+; RV32-NEXT:    sw a0, 276(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 440
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 656(a0)
+; RV32-NEXT:    sw a0, 272(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 408
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 652(a0)
+; RV32-NEXT:    sw a0, 268(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 376
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 648(a0)
+; RV32-NEXT:    sw a0, 264(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 344
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 644(a0)
+; RV32-NEXT:    sw a0, 260(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 312
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 640(a0)
+; RV32-NEXT:    sw a0, 256(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 296
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 636(a0)
+; RV32-NEXT:    sw a0, 252(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 280
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 632(a0)
+; RV32-NEXT:    sw a0, 248(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 264
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 628(a0)
+; RV32-NEXT:    sw a0, 244(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 248
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 624(a0)
+; RV32-NEXT:    sw a0, 240(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 232
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 620(a0)
+; RV32-NEXT:    sw a0, 236(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 216
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 616(a0)
+; RV32-NEXT:    sw a0, 232(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 200
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 612(a0)
+; RV32-NEXT:    sw a0, 228(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 184
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 608(a0)
+; RV32-NEXT:    sw a0, 224(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 168
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 604(a0)
+; RV32-NEXT:    sw a0, 220(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 152
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 600(a0)
+; RV32-NEXT:    sw a0, 216(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 136
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 596(a0)
+; RV32-NEXT:    sw a0, 212(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 120
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 592(a0)
+; RV32-NEXT:    sw a0, 208(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 104
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 588(a0)
+; RV32-NEXT:    sw a0, 204(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 88
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 584(a0)
+; RV32-NEXT:    sw a0, 200(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 72
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 580(a0)
+; RV32-NEXT:    sw a0, 196(sp)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    lw a0, 576(a0)
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    li a1, 95
+; RV32-NEXT:    sw a0, 192(sp)
+; RV32-NEXT:    mv s2, a2
+; RV32-NEXT:    bltu a2, a1, .LBB6_8
+; RV32-NEXT:  # %bb.7:
+; RV32-NEXT:    li s2, 95
+; RV32-NEXT:  .LBB6_8:
+; RV32-NEXT:    li a0, 94
+; RV32-NEXT:    mv s3, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_10
+; RV32-NEXT:  # %bb.9:
+; RV32-NEXT:    li s3, 94
+; RV32-NEXT:  .LBB6_10:
+; RV32-NEXT:    li a0, 83
+; RV32-NEXT:    mv a4, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_12
+; RV32-NEXT:  # %bb.11:
+; RV32-NEXT:    li a4, 83
+; RV32-NEXT:  .LBB6_12:
+; RV32-NEXT:    li a0, 82
+; RV32-NEXT:    mv s5, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_14
+; RV32-NEXT:  # %bb.13:
+; RV32-NEXT:    li s5, 82
+; RV32-NEXT:  .LBB6_14:
+; RV32-NEXT:    li a0, 71
+; RV32-NEXT:    mv a6, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_16
+; RV32-NEXT:  # %bb.15:
+; RV32-NEXT:    li a6, 71
+; RV32-NEXT:  .LBB6_16:
+; RV32-NEXT:    li a0, 70
+; RV32-NEXT:    mv a7, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_18
+; RV32-NEXT:  # %bb.17:
+; RV32-NEXT:    li a7, 70
+; RV32-NEXT:  .LBB6_18:
+; RV32-NEXT:    li a0, 93
+; RV32-NEXT:    mv t0, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_20
+; RV32-NEXT:  # %bb.19:
+; RV32-NEXT:    li t0, 93
+; RV32-NEXT:  .LBB6_20:
+; RV32-NEXT:    li a0, 92
+; RV32-NEXT:    mv t2, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_22
+; RV32-NEXT:  # %bb.21:
+; RV32-NEXT:    li t2, 92
+; RV32-NEXT:  .LBB6_22:
+; RV32-NEXT:    li a0, 81
+; RV32-NEXT:    mv s9, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_24
+; RV32-NEXT:  # %bb.23:
+; RV32-NEXT:    li s9, 81
+; RV32-NEXT:  .LBB6_24:
+; RV32-NEXT:    li a0, 80
+; RV32-NEXT:    mv s10, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_26
+; RV32-NEXT:  # %bb.25:
+; RV32-NEXT:    li s10, 80
+; RV32-NEXT:  .LBB6_26:
+; RV32-NEXT:    li a0, 69
+; RV32-NEXT:    mv s11, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_28
+; RV32-NEXT:  # %bb.27:
+; RV32-NEXT:    li s11, 69
+; RV32-NEXT:  .LBB6_28:
+; RV32-NEXT:    li a0, 68
+; RV32-NEXT:    mv s4, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_30
+; RV32-NEXT:  # %bb.29:
+; RV32-NEXT:    li s4, 68
+; RV32-NEXT:  .LBB6_30:
+; RV32-NEXT:    li a0, 91
+; RV32-NEXT:    mv ra, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_32
+; RV32-NEXT:  # %bb.31:
+; RV32-NEXT:    li ra, 91
+; RV32-NEXT:  .LBB6_32:
+; RV32-NEXT:    li a0, 90
+; RV32-NEXT:    mv s6, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_34
+; RV32-NEXT:  # %bb.33:
+; RV32-NEXT:    li s6, 90
+; RV32-NEXT:  .LBB6_34:
+; RV32-NEXT:    li a0, 79
+; RV32-NEXT:    mv s7, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_36
+; RV32-NEXT:  # %bb.35:
+; RV32-NEXT:    li s7, 79
+; RV32-NEXT:  .LBB6_36:
+; RV32-NEXT:    li a0, 78
+; RV32-NEXT:    mv t1, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_38
+; RV32-NEXT:  # %bb.37:
+; RV32-NEXT:    li t1, 78
+; RV32-NEXT:  .LBB6_38:
+; RV32-NEXT:    li a0, 67
+; RV32-NEXT:    mv s8, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_40
+; RV32-NEXT:  # %bb.39:
+; RV32-NEXT:    li s8, 67
+; RV32-NEXT:  .LBB6_40:
+; RV32-NEXT:    li a0, 66
+; RV32-NEXT:    mv t3, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_42
+; RV32-NEXT:  # %bb.41:
+; RV32-NEXT:    li t3, 66
+; RV32-NEXT:  .LBB6_42:
+; RV32-NEXT:    li a0, 89
+; RV32-NEXT:    mv a1, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_44
+; RV32-NEXT:  # %bb.43:
+; RV32-NEXT:    li a1, 89
+; RV32-NEXT:  .LBB6_44:
+; RV32-NEXT:    sw a1, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT:    li a0, 88
+; RV32-NEXT:    mv a1, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_46
+; RV32-NEXT:  # %bb.45:
+; RV32-NEXT:    li a1, 88
+; RV32-NEXT:  .LBB6_46:
+; RV32-NEXT:    sw a1, 116(sp) # 4-byte Folded Spill
+; RV32-NEXT:    li a0, 77
+; RV32-NEXT:    mv a1, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_48
+; RV32-NEXT:  # %bb.47:
+; RV32-NEXT:    li a1, 77
+; RV32-NEXT:  .LBB6_48:
+; RV32-NEXT:    li a0, 76
+; RV32-NEXT:    mv t5, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_50
+; RV32-NEXT:  # %bb.49:
+; RV32-NEXT:    li t5, 76
+; RV32-NEXT:  .LBB6_50:
+; RV32-NEXT:    li a0, 65
+; RV32-NEXT:    mv a5, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_52
+; RV32-NEXT:  # %bb.51:
+; RV32-NEXT:    li a5, 65
+; RV32-NEXT:  .LBB6_52:
+; RV32-NEXT:    li a0, 64
+; RV32-NEXT:    mv t4, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_54
+; RV32-NEXT:  # %bb.53:
+; RV32-NEXT:    li t4, 64
+; RV32-NEXT:  .LBB6_54:
+; RV32-NEXT:    li a0, 87
+; RV32-NEXT:    mv a3, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_56
+; RV32-NEXT:  # %bb.55:
+; RV32-NEXT:    li a3, 87
+; RV32-NEXT:  .LBB6_56:
+; RV32-NEXT:    sw a3, 104(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw a5, 112(sp) # 4-byte Folded Spill
+; RV32-NEXT:    li a0, 86
+; RV32-NEXT:    mv a3, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_58
+; RV32-NEXT:  # %bb.57:
+; RV32-NEXT:    li a3, 86
+; RV32-NEXT:  .LBB6_58:
+; RV32-NEXT:    sw t4, 108(sp) # 4-byte Folded Spill
+; RV32-NEXT:    li a0, 75
+; RV32-NEXT:    mv a5, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_60
+; RV32-NEXT:  # %bb.59:
+; RV32-NEXT:    li a5, 75
+; RV32-NEXT:  .LBB6_60:
+; RV32-NEXT:    mv t4, a1
+; RV32-NEXT:    li a0, 74
+; RV32-NEXT:    mv t6, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_62
+; RV32-NEXT:  # %bb.61:
+; RV32-NEXT:    li t6, 74
+; RV32-NEXT:  .LBB6_62:
+; RV32-NEXT:    li a1, 85
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    bltu a2, a1, .LBB6_64
+; RV32-NEXT:  # %bb.63:
+; RV32-NEXT:    li a0, 85
+; RV32-NEXT:  .LBB6_64:
+; RV32-NEXT:    slli s2, s2, 2
+; RV32-NEXT:    slli s3, s3, 2
+; RV32-NEXT:    slli a4, a4, 2
+; RV32-NEXT:    slli s5, s5, 2
+; RV32-NEXT:    slli a6, a6, 2
+; RV32-NEXT:    slli a7, a7, 2
+; RV32-NEXT:    slli t0, t0, 2
+; RV32-NEXT:    slli t2, t2, 2
+; RV32-NEXT:    slli s9, s9, 2
+; RV32-NEXT:    slli s10, s10, 2
+; RV32-NEXT:    slli s11, s11, 2
+; RV32-NEXT:    slli s4, s4, 2
+; RV32-NEXT:    slli ra, ra, 2
+; RV32-NEXT:    slli s6, s6, 2
+; RV32-NEXT:    slli s7, s7, 2
+; RV32-NEXT:    slli t1, t1, 2
+; RV32-NEXT:    sw t1, 100(sp) # 4-byte Folded Spill
+; RV32-NEXT:    slli s8, s8, 2
+; RV32-NEXT:    slli t3, t3, 2
+; RV32-NEXT:    sw t3, 96(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw t1, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT:    slli t1, t1, 2
+; RV32-NEXT:    lw t3, 116(sp) # 4-byte Folded Reload
+; RV32-NEXT:    slli t3, t3, 2
+; RV32-NEXT:    slli t4, t4, 2
+; RV32-NEXT:    slli t5, t5, 2
+; RV32-NEXT:    lw a1, 112(sp) # 4-byte Folded Reload
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    sw a1, 112(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a1, 108(sp) # 4-byte Folded Reload
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    sw a1, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a1, 104(sp) # 4-byte Folded Reload
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    sw a1, 104(sp) # 4-byte Folded Spill
+; RV32-NEXT:    slli a3, a3, 2
+; RV32-NEXT:    slli a5, a5, 2
+; RV32-NEXT:    slli t6, t6, 2
+; RV32-NEXT:    li a1, 84
+; RV32-NEXT:    slli a0, a0, 2
+; RV32-NEXT:    sw a0, 68(sp) # 4-byte Folded Spill
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    bltu a2, a1, .LBB6_66
+; RV32-NEXT:  # %bb.65:
+; RV32-NEXT:    li a0, 84
+; RV32-NEXT:  .LBB6_66:
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    sw a0, 4(sp)
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -1800
+; RV32-NEXT:    mul a1, a1, a0
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add a1, a1, s2
+; RV32-NEXT:    sw a1, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -1768
+; RV32-NEXT:    mul a1, a1, a0
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add a1, a1, s3
+; RV32-NEXT:    sw a1, 116(sp) # 4-byte Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a0, 1944
+; RV32-NEXT:    mul a1, a1, a0
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add a4, a1, a4
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a0, 1912
+; RV32-NEXT:    mul a1, a1, a0
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add a1, a1, s5
+; RV32-NEXT:    sw a1, 108(sp) # 4-byte Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a0, 1560
+; RV32-NEXT:    mul a1, a1, a0
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add a6, a1, a6
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a0, 1528
+; RV32-NEXT:    mul a1, a1, a0
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add a7, a1, a7
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -1832
+; RV32-NEXT:    mul a1, a1, a0
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add t0, a1, t0
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -1864
+; RV32-NEXT:    mul a1, a1, a0
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add t2, a1, t2
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a0, 1880
+; RV32-NEXT:    mul a1, a1, a0
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add a1, a1, s9
+; RV32-NEXT:    sw a1, 88(sp) # 4-byte Folded Spill
+; RV32-NEXT:    csrr s2, vlenb
+; RV32-NEXT:    li a0, 1848
+; RV32-NEXT:    mul s2, s2, a0
+; RV32-NEXT:    add s2, sp, s2
+; RV32-NEXT:    addi s2, s2, 512
+; RV32-NEXT:    add s2, s2, s10
+; RV32-NEXT:    csrr s3, vlenb
+; RV32-NEXT:    li a0, 1496
+; RV32-NEXT:    mul s3, s3, a0
+; RV32-NEXT:    add s3, sp, s3
+; RV32-NEXT:    addi s3, s3, 512
+; RV32-NEXT:    add s3, s3, s11
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a0, 1464
+; RV32-NEXT:    mul a1, a1, a0
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add s4, a1, s4
+; RV32-NEXT:    csrr s5, vlenb
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -1896
+; RV32-NEXT:    mul s5, s5, a0
+; RV32-NEXT:    add s5, sp, s5
+; RV32-NEXT:    addi s5, s5, 512
+; RV32-NEXT:    add s5, s5, ra
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -1928
+; RV32-NEXT:    mul a1, a1, a0
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add s6, a1, s6
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a0, 1816
+; RV32-NEXT:    mul a1, a1, a0
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add s7, a1, s7
+; RV32-NEXT:    csrr s9, vlenb
+; RV32-NEXT:    li a0, 1784
+; RV32-NEXT:    mul s9, s9, a0
+; RV32-NEXT:    add s9, sp, s9
+; RV32-NEXT:    addi s9, s9, 512
+; RV32-NEXT:    lw a1, 100(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add s9, s9, a1
+; RV32-NEXT:    csrr s10, vlenb
+; RV32-NEXT:    li a0, 1432
+; RV32-NEXT:    mul s10, s10, a0
+; RV32-NEXT:    add s10, sp, s10
+; RV32-NEXT:    addi s10, s10, 512
+; RV32-NEXT:    add s10, s10, s8
+; RV32-NEXT:    csrr s11, vlenb
+; RV32-NEXT:    li a0, 1400
+; RV32-NEXT:    mul s11, s11, a0
+; RV32-NEXT:    add s11, sp, s11
+; RV32-NEXT:    addi s11, s11, 512
+; RV32-NEXT:    lw a1, 96(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add s11, s11, a1
+; RV32-NEXT:    csrr ra, vlenb
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -1960
+; RV32-NEXT:    mul ra, ra, a0
+; RV32-NEXT:    add ra, sp, ra
+; RV32-NEXT:    addi ra, ra, 512
+; RV32-NEXT:    add ra, ra, t1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -1992
+; RV32-NEXT:    mul a1, a1, a0
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add a1, a1, t3
+; RV32-NEXT:    sw a1, 84(sp) # 4-byte Folded Spill
+; RV32-NEXT:    csrr t1, vlenb
+; RV32-NEXT:    li a0, 1752
+; RV32-NEXT:    mul t1, t1, a0
+; RV32-NEXT:    add t1, sp, t1
+; RV32-NEXT:    addi t1, t1, 512
+; RV32-NEXT:    add t1, t1, t4
+; RV32-NEXT:    csrr t3, vlenb
+; RV32-NEXT:    li a0, 1720
+; RV32-NEXT:    mul t3, t3, a0
+; RV32-NEXT:    add t3, sp, t3
+; RV32-NEXT:    addi t3, t3, 512
+; RV32-NEXT:    add t3, t3, t5
+; RV32-NEXT:    csrr t4, vlenb
+; RV32-NEXT:    li a0, 1368
+; RV32-NEXT:    mul t4, t4, a0
+; RV32-NEXT:    add t4, sp, t4
+; RV32-NEXT:    addi t4, t4, 512
+; RV32-NEXT:    lw a1, 112(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add t4, t4, a1
+; RV32-NEXT:    csrr t5, vlenb
+; RV32-NEXT:    li a0, 1336
+; RV32-NEXT:    mul t5, t5, a0
+; RV32-NEXT:    add t5, sp, t5
+; RV32-NEXT:    addi t5, t5, 512
+; RV32-NEXT:    lw a1, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add t5, t5, a1
+; RV32-NEXT:    csrr s8, vlenb
+; RV32-NEXT:    lui a0, 1
+; RV32-NEXT:    addi a0, a0, -2024
+; RV32-NEXT:    mul s8, s8, a0
+; RV32-NEXT:    add s8, sp, s8
+; RV32-NEXT:    addi s8, s8, 512
+; RV32-NEXT:    lw a1, 104(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add s8, s8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a0, 2040
+; RV32-NEXT:    mul a1, a1, a0
+; RV32-NEXT:    lw a0, 4(sp)
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add a1, a1, a3
+; RV32-NEXT:    sw a1, 80(sp) # 4-byte Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a3, 1688
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add a1, a1, a5
+; RV32-NEXT:    sw a1, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a3, 1656
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add a1, a1, t6
+; RV32-NEXT:    sw a1, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    csrr t6, vlenb
+; RV32-NEXT:    li a1, 2008
+; RV32-NEXT:    mul t6, t6, a1
+; RV32-NEXT:    add t6, sp, t6
+; RV32-NEXT:    addi t6, t6, 512
+; RV32-NEXT:    lw a1, 68(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add t6, t6, a1
+; RV32-NEXT:    slli a0, a0, 2
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a3, 1976
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 512
+; RV32-NEXT:    add a1, a1, a0
+; RV32-NEXT:    li a5, 32
+; RV32-NEXT:    li a0, 73
+; RV32-NEXT:    mv a3, a2
+; RV32-NEXT:    bltu a2, a0, .LBB6_68
+; RV32-NEXT:  # %bb.67:
+; RV32-NEXT:    li a3, 73
+; RV32-NEXT:  .LBB6_68:
+; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT:    addi a0, sp, 256
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a5, 48
+; RV32-NEXT:    mul a0, a0, a5
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a0, sp, 128
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    slli a3, a3, 2
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a5, 1624
+; RV32-NEXT:    mul a0, a0, a5
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    add a3, a0, a3
+; RV32-NEXT:    lw a0, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a0, 0(a0)
+; RV32-NEXT:    sw a0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a0, 116(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a0, 0(a0)
+; RV32-NEXT:    sw a0, 116(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a0, 0(a4)
+; RV32-NEXT:    sw a0, 112(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a0, 108(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a0, 0(a0)
+; RV32-NEXT:    sw a0, 108(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a0, 0(a6)
+; RV32-NEXT:    sw a0, 104(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a0, 0(a7)
+; RV32-NEXT:    sw a0, 100(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a0, 0(t0)
+; RV32-NEXT:    sw a0, 96(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a0, 0(t2)
+; RV32-NEXT:    sw a0, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lw a0, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a5, 0(a0)
+; RV32-NEXT:    lw a6, 0(s2)
+; RV32-NEXT:    lw a7, 0(s3)
+; RV32-NEXT:    lw a4, 0(s4)
+; RV32-NEXT:    lw s2, 0(s5)
+; RV32-NEXT:    lw s3, 0(s6)
+; RV32-NEXT:    lw s4, 0(s7)
+; RV32-NEXT:    lw s5, 0(s9)
+; RV32-NEXT:    lw s6, 0(s10)
+; RV32-NEXT:    lw s7, 0(s11)
+; RV32-NEXT:    lw t0, 0(ra)
+; RV32-NEXT:    lw a0, 84(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s9, 0(a0)
+; RV32-NEXT:    lw s10, 0(t1)
+; RV32-NEXT:    lw s11, 0(t3)
+; RV32-NEXT:    lw ra, 0(t4)
+; RV32-NEXT:    lw t2, 0(t5)
+; RV32-NEXT:    lw t3, 0(s8)
+; RV32-NEXT:    lw a0, 80(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s8, 0(a0)
+; RV32-NEXT:    lw a0, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw t1, 0(a0)
+; RV32-NEXT:    lw a0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw t4, 0(a0)
+; RV32-NEXT:    lw a0, 0(t6)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    lw t5, 0(a3)
+; RV32-NEXT:    li a3, 72
+; RV32-NEXT:    bltu a2, a3, .LBB6_70
+; RV32-NEXT:  # %bb.69:
+; RV32-NEXT:    li a2, 72
+; RV32-NEXT:  .LBB6_70:
+; RV32-NEXT:    slli a2, a2, 2
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li t6, 1592
+; RV32-NEXT:    mul a3, a3, t6
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 512
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    lui a3, %hi(.LCPI6_0)
+; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_0)
+; RV32-NEXT:    vle16.v v4, (a3)
+; RV32-NEXT:    lui a3, %hi(.LCPI6_1)
+; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_1)
+; RV32-NEXT:    vle16.v v24, (a3)
+; RV32-NEXT:    lw a2, 0(a2)
+; RV32-NEXT:    lui a3, 1
+; RV32-NEXT:    addi a3, a3, -64
+; RV32-NEXT:    vmv.s.x v0, a3
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li t6, 40
+; RV32-NEXT:    mul a3, a3, t6
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 512
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vrgatherei16.vv v16, v8, v4
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li t6, 48
+; RV32-NEXT:    mul a3, a3, t6
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 512
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
+; RV32-NEXT:    vrgatherei16.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a2
+; RV32-NEXT:    vslide1down.vx v12, v12, t5
+; RV32-NEXT:    vslide1down.vx v12, v12, a1
+; RV32-NEXT:    lui a1, %hi(.LCPI6_2)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_2)
+; RV32-NEXT:    lui a2, %hi(.LCPI6_3)
+; RV32-NEXT:    addi a2, a2, %lo(.LCPI6_3)
+; RV32-NEXT:    li a3, 32
+; RV32-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
+; RV32-NEXT:    vle16.v v20, (a1)
+; RV32-NEXT:    vle16.v v4, (a2)
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vslide1down.vx v12, v12, a0
+; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
+; RV32-NEXT:    vmv.v.v v12, v16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 36
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs4r.v v12, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, mu
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v20
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 48
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v4, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v12, t4
+; RV32-NEXT:    vslide1down.vx v12, v12, t1
+; RV32-NEXT:    vslide1down.vx v12, v12, s8
+; RV32-NEXT:    vslide1down.vx v12, v12, t3
+; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
+; RV32-NEXT:    vmv.v.v v12, v8
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs4r.v v12, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a0, %hi(.LCPI6_4)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI6_4)
+; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, mu
+; RV32-NEXT:    vle16.v v4, (a0)
+; RV32-NEXT:    lui a0, %hi(.LCPI6_5)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI6_5)
+; RV32-NEXT:    vle16.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 28
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs4r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    li a0, 960
+; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v4
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 28
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vl4r.v v4, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v4, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v12, t2
+; RV32-NEXT:    vslide1down.vx v12, v12, ra
+; RV32-NEXT:    vslide1down.vx v12, v12, s11
+; RV32-NEXT:    vslide1down.vx v12, v12, s10
+; RV32-NEXT:    vslide1down.vx v12, v12, s9
+; RV32-NEXT:    lui a0, %hi(.LCPI6_6)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI6_6)
+; RV32-NEXT:    lui a1, %hi(.LCPI6_7)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_7)
+; RV32-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
+; RV32-NEXT:    vle16.v v16, (a0)
+; RV32-NEXT:    vle16.v v20, (a1)
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vslide1down.vx v12, v12, t0
+; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
+; RV32-NEXT:    vmv.v.v v12, v8
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 28
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs4r.v v12, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, mu
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 48
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v20, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 20
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a0, %hi(.LCPI6_8)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI6_8)
+; RV32-NEXT:    lui a1, %hi(.LCPI6_9)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_9)
+; RV32-NEXT:    li a2, 1008
+; RV32-NEXT:    lui a3, %hi(.LCPI6_10)
+; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_10)
+; RV32-NEXT:    lui t0, %hi(.LCPI6_11)
+; RV32-NEXT:    addi t0, t0, %lo(.LCPI6_11)
+; RV32-NEXT:    vmv.s.x v0, a2
+; RV32-NEXT:    vle16.v v4, (a0)
+; RV32-NEXT:    vle16.v v8, (a1)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 2
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs4r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vle16.v v8, (a3)
+; RV32-NEXT:    addi a0, sp, 512
+; RV32-NEXT:    vs4r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vle16.v v8, (t0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs4r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v4
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 2
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vl4r.v v4, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v4, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 12
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a0, sp, 512
+; RV32-NEXT:    vl4r.v v4, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v4
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vl4r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v16, v0.t
+; RV32-NEXT:    vmv4r.v v0, v8
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v8, s7
+; RV32-NEXT:    vslide1down.vx v8, v8, s6
+; RV32-NEXT:    vslide1down.vx v8, v8, s5
+; RV32-NEXT:    vslide1down.vx v8, v8, s4
+; RV32-NEXT:    vslide1down.vx v8, v8, s3
+; RV32-NEXT:    vslide1down.vx v12, v8, s2
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 20
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
+; RV32-NEXT:    vmv.v.v v12, v24
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vslide1down.vx v8, v8, a7
+; RV32-NEXT:    vslide1down.vx v8, v8, a6
+; RV32-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-NEXT:    lw a0, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    lw a0, 96(sp) # 4-byte Folded Reload
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 12
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
+; RV32-NEXT:    vmv.v.v v8, v16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    lw a0, 100(sp) # 4-byte Folded Reload
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    lw a0, 104(sp) # 4-byte Folded Reload
+; RV32-NEXT:    vslide1down.vx v16, v16, a0
+; RV32-NEXT:    lw a0, 108(sp) # 4-byte Folded Reload
+; RV32-NEXT:    vslide1down.vx v16, v16, a0
+; RV32-NEXT:    lw a0, 112(sp) # 4-byte Folded Reload
+; RV32-NEXT:    vslide1down.vx v16, v16, a0
+; RV32-NEXT:    lw a0, 116(sp) # 4-byte Folded Reload
+; RV32-NEXT:    vslide1down.vx v16, v16, a0
+; RV32-NEXT:    lw a0, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT:    vslide1down.vx v16, v16, a0
+; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
+; RV32-NEXT:    vmv.v.v v16, v0
+; RV32-NEXT:    lw a1, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi a0, a1, 320
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vse32.v v16, (a0)
+; RV32-NEXT:    addi a0, a1, 256
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, a1, 192
+; RV32-NEXT:    vse32.v v12, (a0)
+; RV32-NEXT:    addi a0, a1, 128
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 28
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 512
+; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    addi a0, a1, 64
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 512
+; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a2, 36
+; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vse32.v v8, (a1)
+; RV32-NEXT:    addi sp, s0, -560
+; RV32-NEXT:    lw ra, 556(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 552(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 548(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 544(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 540(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 536(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 532(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 528(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s8, 524(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s9, 520(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s10, 516(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s11, 512(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 560
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: load_factor6_too_big:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -736
+; RV64-NEXT:    .cfi_def_cfa_offset 736
+; RV64-NEXT:    sd ra, 728(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 720(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 712(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s3, 704(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s4, 696(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s5, 688(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s6, 680(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s7, 672(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s8, 664(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s9, 656(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s10, 648(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s11, 640(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    .cfi_offset s2, -24
+; RV64-NEXT:    .cfi_offset s3, -32
+; RV64-NEXT:    .cfi_offset s4, -40
+; RV64-NEXT:    .cfi_offset s5, -48
+; RV64-NEXT:    .cfi_offset s6, -56
+; RV64-NEXT:    .cfi_offset s7, -64
+; RV64-NEXT:    .cfi_offset s8, -72
+; RV64-NEXT:    .cfi_offset s9, -80
+; RV64-NEXT:    .cfi_offset s10, -88
+; RV64-NEXT:    .cfi_offset s11, -96
+; RV64-NEXT:    addi s0, sp, 736
+; RV64-NEXT:    .cfi_def_cfa s0, 0
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 1152
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    sub sp, sp, a2
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    sd a0, 184(sp) # 8-byte Folded Spill
+; RV64-NEXT:    csrr t6, vlenb
+; RV64-NEXT:    slli a0, t6, 1
+; RV64-NEXT:    li a5, 48
+; RV64-NEXT:    sub a2, a5, a0
+; RV64-NEXT:    sltiu a3, a2, 49
+; RV64-NEXT:    neg a3, a3
+; RV64-NEXT:    and a2, a3, a2
+; RV64-NEXT:    sub a3, a2, t6
+; RV64-NEXT:    sltu a4, a2, a3
+; RV64-NEXT:    addi a4, a4, -1
+; RV64-NEXT:    slli s5, t6, 3
+; RV64-NEXT:    slli a6, t6, 5
+; RV64-NEXT:    sub s6, a6, s5
+; RV64-NEXT:    bltu a0, a5, .LBB6_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a0, 48
+; RV64-NEXT:  .LBB6_2:
+; RV64-NEXT:    and a4, a4, a3
+; RV64-NEXT:    add a3, a1, s6
+; RV64-NEXT:    sub a5, a0, t6
+; RV64-NEXT:    sltu a6, a0, a5
+; RV64-NEXT:    addi a6, a6, -1
+; RV64-NEXT:    and a5, a6, a5
+; RV64-NEXT:    add a6, a1, s5
+; RV64-NEXT:    bltu a2, t6, .LBB6_4
+; RV64-NEXT:  # %bb.3:
+; RV64-NEXT:    mv a2, t6
+; RV64-NEXT:  .LBB6_4:
+; RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
+; RV64-NEXT:    vle64.v v24, (a3)
+; RV64-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
+; RV64-NEXT:    vle64.v v16, (a6)
+; RV64-NEXT:    slli s7, t6, 4
+; RV64-NEXT:    add a3, a1, s7
+; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT:    vle64.v v0, (a3)
+; RV64-NEXT:    bltu a0, t6, .LBB6_6
+; RV64-NEXT:  # %bb.5:
+; RV64-NEXT:    mv a0, t6
+; RV64-NEXT:  .LBB6_6:
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vle64.v v8, (a1)
+; RV64-NEXT:    csrr t1, vlenb
+; RV64-NEXT:    li a0, 736
+; RV64-NEXT:    mul t1, t1, a0
+; RV64-NEXT:    add t1, sp, t1
+; RV64-NEXT:    addi t1, t1, 640
+; RV64-NEXT:    vs8r.v v8, (t1)
+; RV64-NEXT:    add a0, t1, s7
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a0, t1, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    add a0, t1, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 45
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    li a0, 928
+; RV64-NEXT:    mul a1, a1, a0
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v4, v2, 3
-; RV64-NEXT:    vadd.vi v8, v2, -13
+; RV64-NEXT:    addi a1, a1, 640
+; RV64-NEXT:    vs8r.v v8, (a1)
+; RV64-NEXT:    add a0, a1, s7
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a0, a1, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    add a0, a1, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a0, 1120
+; RV64-NEXT:    mul a1, a1, a0
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs2r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a1, a1, 640
+; RV64-NEXT:    vs8r.v v8, (a1)
+; RV64-NEXT:    add a0, a1, s7
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a0, a1, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    add a0, a1, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 6
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    li a0, 704
+; RV64-NEXT:    mul a1, a1, a0
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v8, v16, v4
-; RV64-NEXT:    vmv1r.v v0, v1
+; RV64-NEXT:    addi a1, a1, 640
+; RV64-NEXT:    vs8r.v v8, (a1)
+; RV64-NEXT:    add a0, a1, s7
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a0, a1, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    add a0, a1, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a0, 896
+; RV64-NEXT:    mul a1, a1, a0
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl2r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v24, v16, v0.t
+; RV64-NEXT:    addi a1, a1, 640
+; RV64-NEXT:    vs8r.v v8, (a1)
+; RV64-NEXT:    add a0, a1, s7
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a0, a1, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    add a0, a1, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    li a0, 1088
+; RV64-NEXT:    mul a1, a1, a0
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv1r.v v0, v6
+; RV64-NEXT:    addi a1, a1, 640
+; RV64-NEXT:    vs8r.v v8, (a1)
+; RV64-NEXT:    add a0, a1, s7
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a0, a1, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    add a0, a1, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a7, vlenb
+; RV64-NEXT:    li a0, 672
+; RV64-NEXT:    mul a7, a7, a0
+; RV64-NEXT:    add a7, sp, a7
+; RV64-NEXT:    addi a7, a7, 640
+; RV64-NEXT:    vs8r.v v8, (a7)
+; RV64-NEXT:    add a0, a7, s7
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a0, a7, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    add a0, a7, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr t0, vlenb
+; RV64-NEXT:    li a0, 864
+; RV64-NEXT:    mul t0, t0, a0
+; RV64-NEXT:    add t0, sp, t0
+; RV64-NEXT:    addi t0, t0, 640
+; RV64-NEXT:    vs8r.v v8, (t0)
+; RV64-NEXT:    add a0, t0, s7
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a0, t0, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    add a0, t0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 49
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    li a0, 1056
+; RV64-NEXT:    mul a1, a1, a0
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    addi a1, a1, 640
+; RV64-NEXT:    vs8r.v v8, (a1)
+; RV64-NEXT:    add a0, a1, s7
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a0, a1, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    add a0, a1, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr t4, vlenb
+; RV64-NEXT:    li a0, 640
+; RV64-NEXT:    mul t4, t4, a0
+; RV64-NEXT:    add t4, sp, t4
+; RV64-NEXT:    addi t4, t4, 640
+; RV64-NEXT:    vs8r.v v8, (t4)
+; RV64-NEXT:    add a0, t4, s7
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a0, t4, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    add a0, t4, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr t5, vlenb
+; RV64-NEXT:    li a0, 832
+; RV64-NEXT:    mul t5, t5, a0
+; RV64-NEXT:    add t5, sp, t5
+; RV64-NEXT:    addi t5, t5, 640
+; RV64-NEXT:    vs8r.v v8, (t5)
+; RV64-NEXT:    add a0, t5, s7
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a0, t5, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    add a0, t5, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr s2, vlenb
+; RV64-NEXT:    slli s2, s2, 10
+; RV64-NEXT:    add s2, sp, s2
+; RV64-NEXT:    addi s2, s2, 640
+; RV64-NEXT:    vs8r.v v8, (s2)
+; RV64-NEXT:    add a0, s2, s7
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a0, s2, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    add a0, s2, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr s3, vlenb
+; RV64-NEXT:    li a0, 800
+; RV64-NEXT:    mul s3, s3, a0
+; RV64-NEXT:    add s3, sp, s3
+; RV64-NEXT:    addi s3, s3, 640
+; RV64-NEXT:    vs8r.v v8, (s3)
+; RV64-NEXT:    add a0, s3, s7
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a0, s3, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    add a0, s3, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr s4, vlenb
+; RV64-NEXT:    li a0, 992
+; RV64-NEXT:    mul s4, s4, a0
+; RV64-NEXT:    add s4, sp, s4
+; RV64-NEXT:    addi s4, s4, 640
+; RV64-NEXT:    vs8r.v v8, (s4)
+; RV64-NEXT:    add a0, s4, s7
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a0, s4, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    add a0, s4, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr t3, vlenb
+; RV64-NEXT:    li a0, 768
+; RV64-NEXT:    mul t3, t3, a0
+; RV64-NEXT:    add t3, sp, t3
+; RV64-NEXT:    addi t3, t3, 640
+; RV64-NEXT:    vs8r.v v8, (t3)
+; RV64-NEXT:    add a0, t3, s7
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a0, t3, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    add a0, t3, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 41
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    li a0, 960
+; RV64-NEXT:    mul a1, a1, a0
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgather.vi v8, v24, 5, v0.t
+; RV64-NEXT:    addi a1, a1, 640
+; RV64-NEXT:    vs8r.v v8, (a1)
+; RV64-NEXT:    add a0, a1, s7
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a0, a1, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    add a0, a1, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 160
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a1, a0, s7
+; RV64-NEXT:    vs8r.v v0, (a1)
+; RV64-NEXT:    add a1, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    add a0, a0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 352
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a1, a0, s7
+; RV64-NEXT:    vs8r.v v0, (a1)
+; RV64-NEXT:    add a1, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    add a0, a0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 544
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a1, a0, s7
+; RV64-NEXT:    vs8r.v v0, (a1)
+; RV64-NEXT:    add a1, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    add a0, a0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 48
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a0, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 7
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a1, a0, s7
+; RV64-NEXT:    vs8r.v v0, (a1)
+; RV64-NEXT:    add a1, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    add a0, a0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 320
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a1, a0, s7
+; RV64-NEXT:    vs8r.v v0, (a1)
+; RV64-NEXT:    add a1, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    add a0, a0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 9
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a1, a0, s7
+; RV64-NEXT:    vs8r.v v0, (a1)
+; RV64-NEXT:    add a1, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    add a0, a0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a0, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 288
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a1, a0, s7
+; RV64-NEXT:    vs8r.v v0, (a1)
+; RV64-NEXT:    add a1, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    add a0, a0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 480
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a1, a0, s7
+; RV64-NEXT:    vs8r.v v0, (a1)
+; RV64-NEXT:    add a1, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    add a0, a0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a0, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 112
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a0, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 8
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a1, a0, s7
+; RV64-NEXT:    vs8r.v v0, (a1)
+; RV64-NEXT:    add a1, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    add a0, a0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 448
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a1, a0, s7
+; RV64-NEXT:    vs8r.v v0, (a1)
+; RV64-NEXT:    add a1, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    add a0, a0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    addi a0, sp, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a0, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 96
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a0, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 224
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a1, a0, s7
+; RV64-NEXT:    vs8r.v v0, (a1)
+; RV64-NEXT:    add a1, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    add a0, a0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 416
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a1, a0, s7
+; RV64-NEXT:    vs8r.v v0, (a1)
+; RV64-NEXT:    add a1, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    add a0, a0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 608
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a1, a0, s7
+; RV64-NEXT:    vs8r.v v0, (a1)
+; RV64-NEXT:    add a1, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    add a0, a0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 80
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a0, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 192
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a1, a0, s7
+; RV64-NEXT:    vs8r.v v0, (a1)
+; RV64-NEXT:    add a1, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    add a0, a0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 384
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a1, a0, s7
+; RV64-NEXT:    vs8r.v v0, (a1)
+; RV64-NEXT:    add a1, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    add a0, a0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 576
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add s7, a0, s7
+; RV64-NEXT:    vs8r.v v0, (s7)
+; RV64-NEXT:    add a1, a0, s5
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    add a0, a0, s6
+; RV64-NEXT:    vs8r.v v24, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 6
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 640
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a0, a0, s5
+; RV64-NEXT:    slli t6, t6, 2
+; RV64-NEXT:    addi t6, t6, -1
+; RV64-NEXT:    li a1, 35
+; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    mv s5, t6
+; RV64-NEXT:    bltu t6, a1, .LBB6_8
+; RV64-NEXT:  # %bb.7:
+; RV64-NEXT:    li s5, 35
+; RV64-NEXT:  .LBB6_8:
+; RV64-NEXT:    li a0, 41
+; RV64-NEXT:    mv s6, t6
+; RV64-NEXT:    bltu t6, a0, .LBB6_10
+; RV64-NEXT:  # %bb.9:
+; RV64-NEXT:    li s6, 41
+; RV64-NEXT:  .LBB6_10:
+; RV64-NEXT:    li a0, 47
+; RV64-NEXT:    mv s7, t6
+; RV64-NEXT:    bltu t6, a0, .LBB6_12
+; RV64-NEXT:  # %bb.11:
+; RV64-NEXT:    li s7, 47
+; RV64-NEXT:  .LBB6_12:
+; RV64-NEXT:    li a0, 34
+; RV64-NEXT:    mv s8, t6
+; RV64-NEXT:    bltu t6, a0, .LBB6_14
+; RV64-NEXT:  # %bb.13:
+; RV64-NEXT:    li s8, 34
+; RV64-NEXT:  .LBB6_14:
+; RV64-NEXT:    li a0, 40
+; RV64-NEXT:    mv s9, t6
+; RV64-NEXT:    bltu t6, a0, .LBB6_16
+; RV64-NEXT:  # %bb.15:
+; RV64-NEXT:    li s9, 40
+; RV64-NEXT:  .LBB6_16:
+; RV64-NEXT:    li a0, 46
+; RV64-NEXT:    mv a6, t6
+; RV64-NEXT:    bltu t6, a0, .LBB6_18
+; RV64-NEXT:  # %bb.17:
+; RV64-NEXT:    li a6, 46
+; RV64-NEXT:  .LBB6_18:
+; RV64-NEXT:    li a0, 33
+; RV64-NEXT:    mv s10, t6
+; RV64-NEXT:    bltu t6, a0, .LBB6_20
+; RV64-NEXT:  # %bb.19:
+; RV64-NEXT:    li s10, 33
+; RV64-NEXT:  .LBB6_20:
+; RV64-NEXT:    li a0, 39
+; RV64-NEXT:    mv s11, t6
+; RV64-NEXT:    bltu t6, a0, .LBB6_22
+; RV64-NEXT:  # %bb.21:
+; RV64-NEXT:    li s11, 39
+; RV64-NEXT:  .LBB6_22:
+; RV64-NEXT:    li a0, 45
+; RV64-NEXT:    mv ra, t6
+; RV64-NEXT:    bltu t6, a0, .LBB6_24
+; RV64-NEXT:  # %bb.23:
+; RV64-NEXT:    li ra, 45
+; RV64-NEXT:  .LBB6_24:
+; RV64-NEXT:    li a0, 32
+; RV64-NEXT:    mv a1, t6
+; RV64-NEXT:    bltu t6, a0, .LBB6_26
+; RV64-NEXT:  # %bb.25:
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:  .LBB6_26:
+; RV64-NEXT:    li a0, 38
+; RV64-NEXT:    mv a2, t6
+; RV64-NEXT:    bltu t6, a0, .LBB6_28
+; RV64-NEXT:  # %bb.27:
+; RV64-NEXT:    li a2, 38
+; RV64-NEXT:  .LBB6_28:
+; RV64-NEXT:    li a0, 44
+; RV64-NEXT:    mv a3, t6
+; RV64-NEXT:    bltu t6, a0, .LBB6_30
+; RV64-NEXT:  # %bb.29:
+; RV64-NEXT:    li a3, 44
+; RV64-NEXT:  .LBB6_30:
+; RV64-NEXT:    li a0, 37
+; RV64-NEXT:    mv t2, t6
+; RV64-NEXT:    bltu t6, a0, .LBB6_32
+; RV64-NEXT:  # %bb.31:
+; RV64-NEXT:    li t2, 37
+; RV64-NEXT:  .LBB6_32:
+; RV64-NEXT:    slli a0, s5, 3
+; RV64-NEXT:    slli s6, s6, 3
+; RV64-NEXT:    slli s7, s7, 3
+; RV64-NEXT:    slli a4, s8, 3
+; RV64-NEXT:    slli a5, s9, 3
+; RV64-NEXT:    slli a6, a6, 3
+; RV64-NEXT:    slli s10, s10, 3
+; RV64-NEXT:    slli s11, s11, 3
+; RV64-NEXT:    slli ra, ra, 3
+; RV64-NEXT:    slli s5, a1, 3
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    slli s9, a3, 3
+; RV64-NEXT:    li a1, 43
+; RV64-NEXT:    slli t2, t2, 3
+; RV64-NEXT:    mv s8, t6
+; RV64-NEXT:    bltu t6, a1, .LBB6_34
+; RV64-NEXT:  # %bb.33:
+; RV64-NEXT:    li s8, 43
+; RV64-NEXT:  .LBB6_34:
+; RV64-NEXT:    add a0, t1, a0
+; RV64-NEXT:    csrr t1, vlenb
+; RV64-NEXT:    li a1, 928
+; RV64-NEXT:    mul t1, t1, a1
+; RV64-NEXT:    add t1, sp, t1
+; RV64-NEXT:    addi t1, t1, 640
+; RV64-NEXT:    add t1, t1, s6
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a1, 1120
+; RV64-NEXT:    mul a3, a3, a1
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 640
+; RV64-NEXT:    add a3, a3, s7
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 41
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    li s6, 704
+; RV64-NEXT:    mul a1, a1, s6
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    lui a1, 96
-; RV64-NEXT:    li a2, 192
-; RV64-NEXT:    vmv.s.x v28, a2
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v8, a1
-; RV64-NEXT:    vmv1r.v v0, v28
+; RV64-NEXT:    addi a1, a1, 640
+; RV64-NEXT:    add a4, a1, a4
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 37
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    li s6, 896
+; RV64-NEXT:    mul a1, a1, s6
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v12, v24, v8, v0.t
+; RV64-NEXT:    addi a1, a1, 640
+; RV64-NEXT:    add a5, a1, a5
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 37
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    li s6, 1088
+; RV64-NEXT:    mul a1, a1, s6
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    li a1, 28
-; RV64-NEXT:    vmv.s.x v0, a1
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v30, v2, 4
-; RV64-NEXT:    vadd.vi v6, v2, -12
+; RV64-NEXT:    addi a1, a1, 640
+; RV64-NEXT:    add a6, a1, a6
+; RV64-NEXT:    add a7, a7, s10
+; RV64-NEXT:    add t0, t0, s11
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 6
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    li s6, 1056
+; RV64-NEXT:    mul a1, a1, s6
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v16, v8, v30
+; RV64-NEXT:    addi a1, a1, 640
+; RV64-NEXT:    add a1, a1, ra
+; RV64-NEXT:    add s5, t4, s5
+; RV64-NEXT:    add t5, t5, a2
+; RV64-NEXT:    add s6, s2, s9
+; RV64-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
+; RV64-NEXT:    vslidedown.vi v12, v8, 5
+; RV64-NEXT:    vslidedown.vi v16, v8, 4
+; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v14, v8, 3
+; RV64-NEXT:    vslidedown.vi v18, v8, 2
+; RV64-NEXT:    add s7, s3, t2
+; RV64-NEXT:    slli s8, s8, 3
+; RV64-NEXT:    add s4, s4, s8
+; RV64-NEXT:    li a2, 36
+; RV64-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
+; RV64-NEXT:    vslidedown.vi v20, v8, 7
+; RV64-NEXT:    mv s9, t6
+; RV64-NEXT:    bltu t6, a2, .LBB6_36
+; RV64-NEXT:  # %bb.35:
+; RV64-NEXT:    li s9, 36
+; RV64-NEXT:  .LBB6_36:
+; RV64-NEXT:    ld a0, 0(a0)
+; RV64-NEXT:    sd a0, 176(sp) # 8-byte Folded Spill
+; RV64-NEXT:    ld a0, 0(t1)
+; RV64-NEXT:    sd a0, 168(sp) # 8-byte Folded Spill
+; RV64-NEXT:    ld a0, 0(a3)
+; RV64-NEXT:    sd a0, 160(sp) # 8-byte Folded Spill
+; RV64-NEXT:    ld a0, 0(a4)
+; RV64-NEXT:    sd a0, 152(sp) # 8-byte Folded Spill
+; RV64-NEXT:    ld a0, 0(a5)
+; RV64-NEXT:    sd a0, 144(sp) # 8-byte Folded Spill
+; RV64-NEXT:    ld a0, 0(a6)
+; RV64-NEXT:    sd a0, 136(sp) # 8-byte Folded Spill
+; RV64-NEXT:    ld a0, 0(a7)
+; RV64-NEXT:    sd a0, 128(sp) # 8-byte Folded Spill
+; RV64-NEXT:    ld a0, 0(t0)
+; RV64-NEXT:    sd a0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    ld a0, 0(a1)
+; RV64-NEXT:    sd a0, 96(sp) # 8-byte Folded Spill
+; RV64-NEXT:    ld a0, 0(s5)
+; RV64-NEXT:    sd a0, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    ld a0, 0(t5)
+; RV64-NEXT:    sd a0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    ld s5, 0(s6)
+; RV64-NEXT:    ld s8, 0(s7)
+; RV64-NEXT:    ld s10, 0(s4)
+; RV64-NEXT:    slli s9, s9, 3
+; RV64-NEXT:    add t3, t3, s9
+; RV64-NEXT:    ld a0, 0(t3)
+; RV64-NEXT:    li a1, 42
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v13, v8, 1
+; RV64-NEXT:    bltu t6, a1, .LBB6_38
+; RV64-NEXT:  # %bb.37:
+; RV64-NEXT:    li t6, 42
+; RV64-NEXT:  .LBB6_38:
+; RV64-NEXT:    slli t6, t6, 3
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
+; RV64-NEXT:    li a2, 960
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v16, v8, v6, v0.t
+; RV64-NEXT:    addi a1, a1, 640
+; RV64-NEXT:    add t6, a1, t6
+; RV64-NEXT:    ld a2, 0(t6)
+; RV64-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
+; RV64-NEXT:    vslidedown.vi v24, v8, 6
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a3, 160
+; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    lui a1, 112
-; RV64-NEXT:    addi a1, a1, 1
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v12, a1
-; RV64-NEXT:    vmv1r.v v0, v28
+; RV64-NEXT:    ld a1, 776(a1)
+; RV64-NEXT:    sd a1, 120(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 5
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    li a3, 352
+; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v16, v24, v12, v0.t
+; RV64-NEXT:    ld a1, 824(a1)
+; RV64-NEXT:    sd a1, 104(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 5
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    li a3, 544
+; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    ld a1, 872(a1)
+; RV64-NEXT:    sd a1, 88(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 45
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    li a3, 48
+; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    ld a1, 728(a1)
+; RV64-NEXT:    sd a1, 80(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 25
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    slli a1, a1, 7
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v16, v24
-; RV64-NEXT:    vmv2r.v v8, v2
-; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v12, v2, 5
+; RV64-NEXT:    ld t6, 768(a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 6
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    li a3, 320
+; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v24, v0, v12
-; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v2, v8, -11
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    ld s4, 816(a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    slli a1, a1, 9
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v24, v8, v2, v0.t
+; RV64-NEXT:    ld s6, 864(a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 41
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    slli a1, a1, 5
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    ld s7, 720(a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    li a3, 288
+; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v12, v0
+; RV64-NEXT:    ld s9, 808(a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 37
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    li a3, 480
+; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    ld s11, 856(a1)
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv.v.v v20, v0
+; RV64-NEXT:    ld ra, 712(a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 5
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    li a3, 112
+; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv.v.v v8, v24
-; RV64-NEXT:    addi a1, a0, 320
+; RV64-NEXT:    ld a1, 760(a1)
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 8
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    ld a3, 800(a3)
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    li a5, 448
+; RV64-NEXT:    mul a4, a4, a5
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    ld a4, 848(a4)
+; RV64-NEXT:    ld a5, 704(sp)
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    li a7, 96
+; RV64-NEXT:    mul a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    ld a6, 752(a6)
+; RV64-NEXT:    csrr a7, vlenb
+; RV64-NEXT:    li t0, 224
+; RV64-NEXT:    mul a7, a7, t0
+; RV64-NEXT:    add a7, sp, a7
+; RV64-NEXT:    ld a7, 792(a7)
+; RV64-NEXT:    csrr t0, vlenb
+; RV64-NEXT:    li t1, 416
+; RV64-NEXT:    mul t0, t0, t1
+; RV64-NEXT:    add t0, sp, t0
+; RV64-NEXT:    ld t2, 840(t0)
+; RV64-NEXT:    csrr t0, vlenb
+; RV64-NEXT:    li t1, 608
+; RV64-NEXT:    mul t0, t0, t1
+; RV64-NEXT:    add t0, sp, t0
+; RV64-NEXT:    ld t0, 888(t0)
+; RV64-NEXT:    csrr t1, vlenb
+; RV64-NEXT:    li t3, 80
+; RV64-NEXT:    mul t1, t1, t3
+; RV64-NEXT:    add t1, sp, t1
+; RV64-NEXT:    ld t4, 744(t1)
+; RV64-NEXT:    csrr t1, vlenb
+; RV64-NEXT:    li t3, 192
+; RV64-NEXT:    mul t1, t1, t3
+; RV64-NEXT:    add t1, sp, t1
+; RV64-NEXT:    ld t1, 784(t1)
+; RV64-NEXT:    csrr t3, vlenb
+; RV64-NEXT:    li t5, 384
+; RV64-NEXT:    mul t3, t3, t5
+; RV64-NEXT:    add t3, sp, t3
+; RV64-NEXT:    ld s2, 832(t3)
+; RV64-NEXT:    csrr t3, vlenb
+; RV64-NEXT:    slli t3, t3, 6
+; RV64-NEXT:    add t3, sp, t3
+; RV64-NEXT:    ld s3, 736(t3)
+; RV64-NEXT:    csrr t3, vlenb
+; RV64-NEXT:    li t5, 576
+; RV64-NEXT:    mul t3, t3, t5
+; RV64-NEXT:    add t3, sp, t3
+; RV64-NEXT:    ld t5, 880(t3)
+; RV64-NEXT:    addi t3, sp, 264
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vse64.v v24, (t3)
+; RV64-NEXT:    sd s3, 272(sp)
+; RV64-NEXT:    sd a2, 312(sp)
+; RV64-NEXT:    sd a0, 304(sp)
+; RV64-NEXT:    sd t5, 296(sp)
+; RV64-NEXT:    sd s2, 288(sp)
+; RV64-NEXT:    sd t1, 280(sp)
+; RV64-NEXT:    addi a0, sp, 256
+; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    addi a0, sp, 200
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vse64.v v20, (a0)
+; RV64-NEXT:    sd t4, 208(sp)
+; RV64-NEXT:    sd s10, 248(sp)
+; RV64-NEXT:    sd s8, 240(sp)
+; RV64-NEXT:    sd t0, 232(sp)
+; RV64-NEXT:    sd t2, 224(sp)
+; RV64-NEXT:    sd a7, 216(sp)
+; RV64-NEXT:    addi a0, sp, 192
+; RV64-NEXT:    vse64.v v13, (a0)
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT:    vle64.v v20, (a0)
+; RV64-NEXT:    sd a6, 528(sp)
+; RV64-NEXT:    sd a5, 520(sp)
+; RV64-NEXT:    sd s5, 568(sp)
+; RV64-NEXT:    ld a0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    sd a0, 560(sp)
+; RV64-NEXT:    ld a0, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    sd a0, 552(sp)
+; RV64-NEXT:    sd a4, 544(sp)
+; RV64-NEXT:    sd a3, 536(sp)
+; RV64-NEXT:    addi a0, sp, 512
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vse64.v v18, (a0)
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT:    vle64.v v24, (a0)
+; RV64-NEXT:    sd a1, 464(sp)
+; RV64-NEXT:    sd ra, 456(sp)
+; RV64-NEXT:    ld a0, 96(sp) # 8-byte Folded Reload
+; RV64-NEXT:    sd a0, 504(sp)
+; RV64-NEXT:    ld a0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT:    sd a0, 496(sp)
+; RV64-NEXT:    ld a0, 128(sp) # 8-byte Folded Reload
+; RV64-NEXT:    sd a0, 488(sp)
+; RV64-NEXT:    sd s11, 480(sp)
+; RV64-NEXT:    sd s9, 472(sp)
+; RV64-NEXT:    addi a0, sp, 448
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vse64.v v14, (a0)
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT:    vle64.v v28, (a0)
+; RV64-NEXT:    sd s7, 392(sp)
+; RV64-NEXT:    ld a0, 136(sp) # 8-byte Folded Reload
+; RV64-NEXT:    sd a0, 440(sp)
+; RV64-NEXT:    ld a0, 144(sp) # 8-byte Folded Reload
+; RV64-NEXT:    sd a0, 432(sp)
+; RV64-NEXT:    ld a0, 152(sp) # 8-byte Folded Reload
+; RV64-NEXT:    sd a0, 424(sp)
+; RV64-NEXT:    sd s6, 416(sp)
+; RV64-NEXT:    sd s4, 408(sp)
+; RV64-NEXT:    sd t6, 400(sp)
+; RV64-NEXT:    addi a0, sp, 384
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vse64.v v16, (a0)
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT:    vle64.v v16, (a0)
+; RV64-NEXT:    ld a0, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    sd a0, 328(sp)
+; RV64-NEXT:    ld a0, 160(sp) # 8-byte Folded Reload
+; RV64-NEXT:    sd a0, 376(sp)
+; RV64-NEXT:    ld a0, 168(sp) # 8-byte Folded Reload
+; RV64-NEXT:    sd a0, 368(sp)
+; RV64-NEXT:    ld a0, 176(sp) # 8-byte Folded Reload
+; RV64-NEXT:    sd a0, 360(sp)
+; RV64-NEXT:    ld a0, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    sd a0, 352(sp)
+; RV64-NEXT:    ld a0, 104(sp) # 8-byte Folded Reload
+; RV64-NEXT:    sd a0, 344(sp)
+; RV64-NEXT:    ld a0, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT:    sd a0, 336(sp)
+; RV64-NEXT:    addi a0, sp, 320
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vse64.v v12, (a0)
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT:    vle64.v v12, (a0)
+; RV64-NEXT:    ld a1, 184(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi a0, a1, 320
+; RV64-NEXT:    vse64.v v12, (a0)
+; RV64-NEXT:    addi a0, a1, 256
+; RV64-NEXT:    vse64.v v16, (a0)
+; RV64-NEXT:    addi a0, a1, 192
+; RV64-NEXT:    vse64.v v28, (a0)
+; RV64-NEXT:    addi a0, a1, 128
+; RV64-NEXT:    vse64.v v24, (a0)
+; RV64-NEXT:    addi a0, a1, 64
+; RV64-NEXT:    vse64.v v20, (a0)
 ; RV64-NEXT:    vse64.v v8, (a1)
-; RV64-NEXT:    addi a1, a0, 256
-; RV64-NEXT:    vse64.v v20, (a1)
-; RV64-NEXT:    addi a1, a0, 192
-; RV64-NEXT:    vse64.v v12, (a1)
-; RV64-NEXT:    addi a1, a0, 128
-; RV64-NEXT:    vse64.v v16, (a1)
-; RV64-NEXT:    addi a1, a0, 64
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a3, a2, 4
-; RV64-NEXT:    add a2, a3, a2
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
-; RV64-NEXT:    vse64.v v8, (a1)
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 21
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vse64.v v8, (a0)
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a1, 74
-; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    addi sp, s0, -736
+; RV64-NEXT:    ld ra, 728(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 720(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 712(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s3, 704(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s4, 696(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s5, 688(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s6, 680(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s7, 672(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s8, 664(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s9, 656(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s10, 648(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s11, 640(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 736
 ; RV64-NEXT:    ret
   %interleaved.vec = load <48 x i64>, ptr %ptr
   %v0 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
index 19587438ea9475..d7a696a807a5ef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
@@ -7,6 +7,22 @@ define <5 x i8> @load_v5i8(ptr %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 5, e8, mf2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vslidedown.vi v9, v8, 1
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    vmv.x.s a1, v8
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 2
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 3
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v8, v8, 4
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    vslide1down.vx v8, v9, a0
+; CHECK-NEXT:    vslidedown.vi v8, v8, 3
 ; CHECK-NEXT:    ret
   %x = load <5 x i8>, ptr %p
   ret <5 x i8> %x
@@ -17,6 +33,22 @@ define <5 x i8> @load_v5i8_align1(ptr %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 5, e8, mf2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vslidedown.vi v9, v8, 1
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    vmv.x.s a1, v8
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 2
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 3
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v8, v8, 4
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    vslide1down.vx v8, v9, a0
+; CHECK-NEXT:    vslidedown.vi v8, v8, 3
 ; CHECK-NEXT:    ret
   %x = load <5 x i8>, ptr %p, align 1
   ret <5 x i8> %x
@@ -27,6 +59,25 @@ define <6 x i8> @load_v6i8(ptr %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vslidedown.vi v9, v8, 1
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    vmv.x.s a1, v8
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 2
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 3
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 4
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v8, v8, 5
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    vslide1down.vx v8, v9, a0
+; CHECK-NEXT:    vslidedown.vi v8, v8, 2
 ; CHECK-NEXT:    ret
   %x = load <6 x i8>, ptr %p
   ret <6 x i8> %x
@@ -37,6 +88,48 @@ define <12 x i8> @load_v12i8(ptr %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 12, e8, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vslidedown.vi v9, v8, 1
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    vmv.x.s a1, v8
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 2
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 3
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 4
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 5
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 6
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 7
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 9
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 8
+; CHECK-NEXT:    vmv.x.s a1, v10
+; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vslide1down.vx v10, v10, a0
+; CHECK-NEXT:    vslidedown.vi v11, v8, 10
+; CHECK-NEXT:    vmv.x.s a0, v11
+; CHECK-NEXT:    vslide1down.vx v10, v10, a0
+; CHECK-NEXT:    vslidedown.vi v8, v8, 11
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    vslide1down.vx v8, v10, a0
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
+; CHECK-NEXT:    vslidedown.vi v8, v8, 4
+; CHECK-NEXT:    vslidedown.vi v8, v9, 8, v0.t
 ; CHECK-NEXT:    ret
   %x = load <12 x i8>, ptr %p
   ret <12 x i8> %x
@@ -47,6 +140,25 @@ define <6 x i16> @load_v6i16(ptr %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vslidedown.vi v9, v8, 1
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    vmv.x.s a1, v8
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 2
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 3
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v10, v8, 4
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vi v8, v8, 5
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    vslide1down.vx v8, v9, a0
+; CHECK-NEXT:    vslidedown.vi v8, v8, 2
 ; CHECK-NEXT:    ret
   %x = load <6 x i16>, ptr %p
   ret <6 x i16> %x
@@ -57,6 +169,25 @@ define <6 x half> @load_v6f16(ptr %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vslidedown.vi v9, v8, 1
+; CHECK-NEXT:    vfmv.f.s fa5, v9
+; CHECK-NEXT:    vfmv.f.s fa4, v8
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vfmv.v.f v9, fa4
+; CHECK-NEXT:    vfslide1down.vf v9, v9, fa5
+; CHECK-NEXT:    vslidedown.vi v10, v8, 2
+; CHECK-NEXT:    vfmv.f.s fa5, v10
+; CHECK-NEXT:    vfslide1down.vf v9, v9, fa5
+; CHECK-NEXT:    vslidedown.vi v10, v8, 3
+; CHECK-NEXT:    vfmv.f.s fa5, v10
+; CHECK-NEXT:    vfslide1down.vf v9, v9, fa5
+; CHECK-NEXT:    vslidedown.vi v10, v8, 4
+; CHECK-NEXT:    vfmv.f.s fa5, v10
+; CHECK-NEXT:    vfslide1down.vf v9, v9, fa5
+; CHECK-NEXT:    vslidedown.vi v8, v8, 5
+; CHECK-NEXT:    vfmv.f.s fa5, v8
+; CHECK-NEXT:    vfslide1down.vf v8, v9, fa5
+; CHECK-NEXT:    vslidedown.vi v8, v8, 2
 ; CHECK-NEXT:    ret
   %x = load <6 x half>, ptr %p
   ret <6 x half> %x
@@ -67,17 +198,129 @@ define <6 x float> @load_v6f32(ptr %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v10, v8, 1
+; CHECK-NEXT:    vfmv.f.s fa5, v10
+; CHECK-NEXT:    vfmv.f.s fa4, v8
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfmv.v.f v10, fa4
+; CHECK-NEXT:    vfslide1down.vf v10, v10, fa5
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v12, v8, 2
+; CHECK-NEXT:    vfmv.f.s fa5, v12
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfslide1down.vf v10, v10, fa5
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v12, v8, 3
+; CHECK-NEXT:    vfmv.f.s fa5, v12
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfslide1down.vf v10, v10, fa5
+; CHECK-NEXT:    vslidedown.vi v12, v8, 4
+; CHECK-NEXT:    vfmv.f.s fa5, v12
+; CHECK-NEXT:    vfslide1down.vf v10, v10, fa5
+; CHECK-NEXT:    vslidedown.vi v8, v8, 5
+; CHECK-NEXT:    vfmv.f.s fa5, v8
+; CHECK-NEXT:    vfslide1down.vf v8, v10, fa5
+; CHECK-NEXT:    vslidedown.vi v8, v8, 2
 ; CHECK-NEXT:    ret
   %x = load <6 x float>, ptr %p
   ret <6 x float> %x
 }
 
 define <6 x double> @load_v6f64(ptr %p) {
-; CHECK-LABEL: load_v6f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 6, e64, m4, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    ret
+; RV32-LABEL: load_v6f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -128
+; RV32-NEXT:    .cfi_def_cfa_offset 128
+; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    addi s0, sp, 128
+; RV32-NEXT:    .cfi_def_cfa s0, 0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    vsetivli zero, 6, e64, m4, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    mv a0, sp
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vse64.v v8, (a0)
+; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT:    vslidedown.vi v12, v8, 5
+; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vse64.v v12, (a1)
+; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT:    vslidedown.vi v12, v8, 4
+; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vse64.v v12, (a1)
+; RV32-NEXT:    vslidedown.vi v10, v8, 1
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vse64.v v10, (a1)
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v10, v8, 3
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vse64.v v10, (a1)
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 2
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vse64.v v8, (a1)
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -128
+; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: load_v6f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -128
+; RV64-NEXT:    .cfi_def_cfa_offset 128
+; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    addi s0, sp, 128
+; RV64-NEXT:    .cfi_def_cfa s0, 0
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    vsetivli zero, 6, e64, m4, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    mv a0, sp
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vse64.v v8, (a0)
+; RV64-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
+; RV64-NEXT:    vslidedown.vi v12, v8, 5
+; RV64-NEXT:    addi a1, sp, 40
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vse64.v v12, (a1)
+; RV64-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
+; RV64-NEXT:    vslidedown.vi v12, v8, 4
+; RV64-NEXT:    addi a1, sp, 32
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vse64.v v12, (a1)
+; RV64-NEXT:    vslidedown.vi v10, v8, 1
+; RV64-NEXT:    addi a1, sp, 8
+; RV64-NEXT:    vse64.v v10, (a1)
+; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v10, v8, 3
+; RV64-NEXT:    addi a1, sp, 24
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vse64.v v10, (a1)
+; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v8, v8, 2
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vse64.v v8, (a1)
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -128
+; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    ret
   %x = load <6 x double>, ptr %p
   ret <6 x double> %x
 }
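
A minimal sketch, not taken from the patch, of the VP-style IR this lowering aims at for the `<6 x double>` case above, assuming `<vscale x 4 x double>` is chosen as the container type; the function name `@load_v6f64_vp_sketch` and the `splat (i1 true)` mask shorthand are illustrative only:

```
define <6 x double> @load_v6f64_vp_sketch(ptr %p) {
entry:
  ; Load only the 6 live elements of the scalable container (EVL = 6,
  ; all-true mask) instead of a fixed-length <6 x double> load.
  %wide = call <vscale x 4 x double> @llvm.vp.load.nxv4f64.p0(ptr %p, <vscale x 4 x i1> splat (i1 true), i32 6)
  ; Narrow the scalable result back to the original fixed-length type.
  %res = call <6 x double> @llvm.vector.extract.v6f64.nxv4f64(<vscale x 4 x double> %wide, i64 0)
  ret <6 x double> %res
}

declare <vscale x 4 x double> @llvm.vp.load.nxv4f64.p0(ptr, <vscale x 4 x i1>, i32)
declare <6 x double> @llvm.vector.extract.v6f64.nxv4f64(<vscale x 4 x double>, i64 immarg)
```

Everything besides the vp.load itself is type bookkeeping: the vector.extract narrows the scalable result back to the fixed `<6 x double>` the caller expects.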


