[llvm] d85e347 - [RISCV] Add a pass to recognize VLS strided loads/stores from gather/scatter.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 20 09:39:51 PDT 2021


Author: Craig Topper
Date: 2021-09-20T09:39:44-07:00
New Revision: d85e347a28dc9a329d7029987e4e062428985b41

URL: https://github.com/llvm/llvm-project/commit/d85e347a28dc9a329d7029987e4e062428985b41
DIFF: https://github.com/llvm/llvm-project/commit/d85e347a28dc9a329d7029987e4e062428985b41.diff

LOG: [RISCV] Add a pass to recognize VLS strided loads/stores from gather/scatter.

For strided accesses the loop vectorizer seems to prefer creating a
vector induction variable with a start value of the form
<i32 0, i32 1, i32 2, ...>. This value is incremented on each loop
iteration by a splat constant equal to the length of the vector.
Within the loop, arithmetic with splat values is applied to this
vector induction variable to produce the indices for a vector GEP.
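
As a rough illustration, the kind of loop this targets looks like the
following reduced, 4-element sketch of the @gather test added below
(the in-tree tests use 32-element vectors; value names here are
illustrative):

  vector.body:
    %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %entry ],
                             [ %vec.ind.next, %vector.body ]
    %idx = mul nuw nsw <4 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5>
    %ptrs = getelementptr inbounds i8, i8* %B, <4 x i64> %idx
    %g = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1,
             <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
    ; ... uses of %g, stores, loop exit check ...
    %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>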

This pass attempts to dig through the arithmetic back to the phi
to create a new scalar induction variable and a stride. We push
all of the arithmetic out of the loop by folding it into the start,
step, and stride values. Then we create a scalar GEP to use as the
base pointer for a strided load or store using the computed stride.
Loop strength reduction runs after this pass and can clean up the
scalar GEP and induction variable.
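
Continuing the reduced sketch above, after this pass the loop would use
a scalar induction variable, a scalar GEP, and the new strided-load
intrinsic; with 4 elements and a stride of 5 the scalar step becomes
4 * 5 = 20 (the 32-element tests below show 160):

  vector.body:
    %vec.ind.scalar = phi i64 [ 0, %entry ], [ %vec.ind.next.scalar, %vector.body ]
    %base = getelementptr i8, i8* %B, i64 %vec.ind.scalar
    %g = call <4 x i8> @llvm.riscv.masked.strided.load.v4i8.p0i8.i64(
             <4 x i8> undef, i8* %base, i64 5,
             <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
    ; ... uses of %g, stores, loop exit check ...
    %vec.ind.next.scalar = add i64 %vec.ind.scalar, 20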

Reviewed By: frasercrmck

Differential Revision: https://reviews.llvm.org/D107790

Added: 
    llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
    llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-negative.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll

Modified: 
    llvm/include/llvm/IR/IntrinsicsRISCV.td
    llvm/lib/Target/RISCV/CMakeLists.txt
    llvm/lib/Target/RISCV/RISCV.h
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.h
    llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
    llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
    llvm/tools/opt/opt.cpp

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td
index a46709bf09d1..b3f82c7022a8 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCV.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -1245,4 +1245,15 @@ let TargetPrefix = "riscv" in {
     defm vsuxseg # nf : RISCVISegStore<nf>;
   }
 
+  // Strided loads/stores for fixed vectors.
+  def int_riscv_masked_strided_load
+        : Intrinsic<[llvm_anyvector_ty],
+                    [LLVMMatchType<0>, llvm_anyptr_ty,
+                     llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                    [NoCapture<ArgIndex<1>>, IntrReadMem]>;
+  def int_riscv_masked_strided_store
+        : Intrinsic<[],
+                    [llvm_anyvector_ty, llvm_anyptr_ty,
+                     llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                    [NoCapture<ArgIndex<1>>, IntrWriteMem]>;
 } // TargetPrefix = "riscv"

diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index 961781bec193..8eb3a9abc5fc 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -24,6 +24,7 @@ add_llvm_target(RISCVCodeGen
   RISCVExpandAtomicPseudoInsts.cpp
   RISCVExpandPseudoInsts.cpp
   RISCVFrameLowering.cpp
+  RISCVGatherScatterLowering.cpp
   RISCVInsertVSETVLI.cpp
   RISCVInstrInfo.cpp
   RISCVInstructionSelector.cpp
@@ -50,6 +51,7 @@ add_llvm_target(RISCVCodeGen
   SelectionDAG
   Support
   Target
+  TransformUtils
   GlobalISel
 
   ADD_TO_COMPONENT

diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index ef1f97067e12..b415c9f35e7f 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -37,6 +37,9 @@ bool LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
 
 FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM);
 
+FunctionPass *createRISCVGatherScatterLoweringPass();
+void initializeRISCVGatherScatterLoweringPass(PassRegistry &);
+
 FunctionPass *createRISCVMergeBaseOffsetOptPass();
 void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &);
 

diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
new file mode 100644
index 000000000000..e265bb4d392d
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -0,0 +1,475 @@
+//===- RISCVGatherScatterLowering.cpp - Gather/Scatter lowering -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass custom lowers llvm.masked.gather and llvm.masked.scatter
+// intrinsics to RISCV intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVTargetMachine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-gather-scatter-lowering"
+
+namespace {
+
+class RISCVGatherScatterLowering : public FunctionPass {
+  const RISCVSubtarget *ST = nullptr;
+  const RISCVTargetLowering *TLI = nullptr;
+  LoopInfo *LI = nullptr;
+  const DataLayout *DL = nullptr;
+
+  SmallVector<WeakTrackingVH> MaybeDeadPHIs;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  RISCVGatherScatterLowering() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<TargetPassConfig>();
+    AU.addRequired<LoopInfoWrapperPass>();
+  }
+
+  StringRef getPassName() const override {
+    return "RISCV gather/scatter lowering";
+  }
+
+private:
+  bool isLegalTypeAndAlignment(Type *DataType, Value *AlignOp);
+
+  bool tryCreateStridedLoadStore(IntrinsicInst *II, Type *DataType, Value *Ptr,
+                                 Value *AlignOp);
+
+  std::pair<Value *, Value *> determineBaseAndStride(GetElementPtrInst *GEP,
+                                                     IRBuilder<> &Builder);
+
+  bool matchStridedRecurrence(Value *Index, Loop *L, Value *&Stride,
+                              PHINode *&BasePtr, BinaryOperator *&Inc,
+                              IRBuilder<> &Builder);
+};
+
+} // end anonymous namespace
+
+char RISCVGatherScatterLowering::ID = 0;
+
+INITIALIZE_PASS(RISCVGatherScatterLowering, DEBUG_TYPE,
+                "RISCV gather/scatter lowering pass", false, false)
+
+FunctionPass *llvm::createRISCVGatherScatterLoweringPass() {
+  return new RISCVGatherScatterLowering();
+}
+
+bool RISCVGatherScatterLowering::isLegalTypeAndAlignment(Type *DataType,
+                                                         Value *AlignOp) {
+  Type *ScalarType = DataType->getScalarType();
+  if (!TLI->isLegalElementTypeForRVV(ScalarType))
+    return false;
+
+  MaybeAlign MA = cast<ConstantInt>(AlignOp)->getMaybeAlignValue();
+  if (MA && MA->value() < DL->getTypeStoreSize(ScalarType).getFixedSize())
+    return false;
+
+  // FIXME: Let the backend type legalize by splitting/widening?
+  EVT DataVT = TLI->getValueType(*DL, DataType);
+  if (!TLI->isTypeLegal(DataVT))
+    return false;
+
+  return true;
+}
+
+// TODO: Should we consider the mask when looking for a stride?
+static std::pair<Value *, Value *> matchStridedConstant(Constant *StartC) {
+  unsigned NumElts = cast<FixedVectorType>(StartC->getType())->getNumElements();
+
+  // Check that the start value is a strided constant.
+  auto *StartVal =
+      dyn_cast_or_null<ConstantInt>(StartC->getAggregateElement((unsigned)0));
+  if (!StartVal)
+    return std::make_pair(nullptr, nullptr);
+  APInt StrideVal(StartVal->getValue().getBitWidth(), 0);
+  ConstantInt *Prev = StartVal;
+  for (unsigned i = 1; i != NumElts; ++i) {
+    auto *C = dyn_cast_or_null<ConstantInt>(StartC->getAggregateElement(i));
+    if (!C)
+      return std::make_pair(nullptr, nullptr);
+
+    APInt LocalStride = C->getValue() - Prev->getValue();
+    if (i == 1)
+      StrideVal = LocalStride;
+    else if (StrideVal != LocalStride)
+      return std::make_pair(nullptr, nullptr);
+
+    Prev = C;
+  }
+
+  Value *Stride = ConstantInt::get(StartVal->getType(), StrideVal);
+
+  return std::make_pair(StartVal, Stride);
+}
+
+// Recursively walk up the use-def chain until we find a Phi with a strided
+// start value. Build and update a scalar recurrence as we unwind the recursion.
+// We also update the Stride as we unwind. Our goal is to move all of the
+// arithmetic out of the loop.
+bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L,
+                                                        Value *&Stride,
+                                                        PHINode *&BasePtr,
+                                                        BinaryOperator *&Inc,
+                                                        IRBuilder<> &Builder) {
+  // Our base case is a Phi.
+  if (auto *Phi = dyn_cast<PHINode>(Index)) {
+    // A phi node we want to perform this function on should be from the
+    // loop header.
+    if (Phi->getParent() != L->getHeader())
+      return false;
+
+    Value *Step, *Start;
+    if (!matchSimpleRecurrence(Phi, Inc, Start, Step) ||
+        Inc->getOpcode() != Instruction::Add)
+      return false;
+    assert(Phi->getNumIncomingValues() == 2 && "Expected 2 operand phi.");
+    unsigned IncrementingBlock = Phi->getIncomingValue(0) == Inc ? 0 : 1;
+    assert(Phi->getIncomingValue(IncrementingBlock) == Inc &&
+           "Expected one operand of phi to be Inc");
+
+    // Only proceed if the step is loop invariant.
+    if (!L->isLoopInvariant(Step))
+      return false;
+
+    // Step should be a splat.
+    Step = getSplatValue(Step);
+    if (!Step)
+      return false;
+
+    // Start should be a strided constant.
+    auto *StartC = dyn_cast<Constant>(Start);
+    if (!StartC)
+      return false;
+
+    std::tie(Start, Stride) = matchStridedConstant(StartC);
+    if (!Start)
+      return false;
+    assert(Stride != nullptr);
+
+    // Build scalar phi and increment.
+    BasePtr =
+        PHINode::Create(Start->getType(), 2, Phi->getName() + ".scalar", Phi);
+    Inc = BinaryOperator::CreateAdd(BasePtr, Step, Inc->getName() + ".scalar",
+                                    Inc);
+    BasePtr->addIncoming(Start, Phi->getIncomingBlock(1 - IncrementingBlock));
+    BasePtr->addIncoming(Inc, Phi->getIncomingBlock(IncrementingBlock));
+
+    // Note that this Phi might be eligible for removal.
+    MaybeDeadPHIs.push_back(Phi);
+    return true;
+  }
+
+  // Otherwise look for binary operator.
+  auto *BO = dyn_cast<BinaryOperator>(Index);
+  if (!BO)
+    return false;
+
+  if (BO->getOpcode() != Instruction::Add &&
+      BO->getOpcode() != Instruction::Or &&
+      BO->getOpcode() != Instruction::Mul &&
+      BO->getOpcode() != Instruction::Shl)
+    return false;
+
+  // Only support shift by constant.
+  if (BO->getOpcode() == Instruction::Shl && !isa<Constant>(BO->getOperand(1)))
+    return false;
+
+  // We need to be able to treat Or as Add.
+  if (BO->getOpcode() == Instruction::Or &&
+      !haveNoCommonBitsSet(BO->getOperand(0), BO->getOperand(1), *DL))
+    return false;
+
+  // We should have one operand in the loop and one splat.
+  Value *OtherOp;
+  if (isa<Instruction>(BO->getOperand(0)) &&
+      L->contains(cast<Instruction>(BO->getOperand(0)))) {
+    Index = cast<Instruction>(BO->getOperand(0));
+    OtherOp = BO->getOperand(1);
+  } else if (isa<Instruction>(BO->getOperand(1)) &&
+             L->contains(cast<Instruction>(BO->getOperand(1)))) {
+    Index = cast<Instruction>(BO->getOperand(1));
+    OtherOp = BO->getOperand(0);
+  } else {
+    return false;
+  }
+
+  // Make sure other op is loop invariant.
+  if (!L->isLoopInvariant(OtherOp))
+    return false;
+
+  // Make sure we have a splat.
+  Value *SplatOp = getSplatValue(OtherOp);
+  if (!SplatOp)
+    return false;
+
+  // Recurse up the use-def chain.
+  if (!matchStridedRecurrence(Index, L, Stride, BasePtr, Inc, Builder))
+    return false;
+
+  // Locate the Step and Start values from the recurrence.
+  unsigned StepIndex = Inc->getOperand(0) == BasePtr ? 1 : 0;
+  unsigned StartBlock = BasePtr->getOperand(0) == Inc ? 1 : 0;
+  Value *Step = Inc->getOperand(StepIndex);
+  Value *Start = BasePtr->getOperand(StartBlock);
+
+  // We need to adjust the start value in the preheader.
+  Builder.SetInsertPoint(
+      BasePtr->getIncomingBlock(StartBlock)->getTerminator());
+  Builder.SetCurrentDebugLocation(DebugLoc());
+
+  switch (BO->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode!");
+  case Instruction::Add:
+  case Instruction::Or: {
+    // An add only affects the start value. It's ok to do this for Or because
+    // we already checked that there are no common set bits.
+
+    // If the start value is Zero, just take the SplatOp.
+    if (isa<ConstantInt>(Start) && cast<ConstantInt>(Start)->isZero())
+      Start = SplatOp;
+    else
+      Start = Builder.CreateAdd(Start, SplatOp, "start");
+    BasePtr->setIncomingValue(StartBlock, Start);
+    break;
+  }
+  case Instruction::Mul: {
+    // If the start is zero we don't need to multiply.
+    if (!isa<ConstantInt>(Start) || !cast<ConstantInt>(Start)->isZero())
+      Start = Builder.CreateMul(Start, SplatOp, "start");
+
+    Step = Builder.CreateMul(Step, SplatOp, "step");
+
+    // If the Stride is 1 just take the SplatOp.
+    if (isa<ConstantInt>(Stride) && cast<ConstantInt>(Stride)->isOne())
+      Stride = SplatOp;
+    else
+      Stride = Builder.CreateMul(Stride, SplatOp, "stride");
+    Inc->setOperand(StepIndex, Step);
+    BasePtr->setIncomingValue(StartBlock, Start);
+    break;
+  }
+  case Instruction::Shl: {
+    // If the start is zero we don't need to shift.
+    if (!isa<ConstantInt>(Start) || !cast<ConstantInt>(Start)->isZero())
+      Start = Builder.CreateShl(Start, SplatOp, "start");
+    Step = Builder.CreateShl(Step, SplatOp, "step");
+    Stride = Builder.CreateShl(Stride, SplatOp, "stride");
+    Inc->setOperand(StepIndex, Step);
+    BasePtr->setIncomingValue(StartBlock, Start);
+    break;
+  }
+  }
+
+  return true;
+}
+
+std::pair<Value *, Value *>
+RISCVGatherScatterLowering::determineBaseAndStride(GetElementPtrInst *GEP,
+                                                   IRBuilder<> &Builder) {
+
+  SmallVector<Value *, 2> Ops(GEP->operands());
+
+  // Base pointer needs to be a scalar.
+  if (Ops[0]->getType()->isVectorTy())
+    return std::make_pair(nullptr, nullptr);
+
+  // Make sure we're in a loop and it is in loop simplify form.
+  Loop *L = LI->getLoopFor(GEP->getParent());
+  if (!L || !L->isLoopSimplifyForm())
+    return std::make_pair(nullptr, nullptr);
+
+  Optional<unsigned> VecOperand;
+  unsigned TypeScale = 0;
+
+  // Look for a vector operand and scale.
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (unsigned i = 1, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
+    if (!Ops[i]->getType()->isVectorTy())
+      continue;
+
+    if (VecOperand)
+      return std::make_pair(nullptr, nullptr);
+
+    VecOperand = i;
+
+    TypeSize TS = DL->getTypeAllocSize(GTI.getIndexedType());
+    if (TS.isScalable())
+      return std::make_pair(nullptr, nullptr);
+
+    TypeScale = TS.getFixedSize();
+  }
+
+  // We need to find a vector index to simplify.
+  if (!VecOperand)
+    return std::make_pair(nullptr, nullptr);
+
+  // We can't extract the stride if the arithmetic is done at a different size
+  // than the pointer type. Adding the stride later may not wrap correctly.
+  // Technically we could handle wider indices, but I don't expect that in
+  // practice.
+  Value *VecIndex = Ops[*VecOperand];
+  Type *VecIntPtrTy = DL->getIntPtrType(GEP->getType());
+  if (VecIndex->getType() != VecIntPtrTy)
+    return std::make_pair(nullptr, nullptr);
+
+  Value *Stride;
+  BinaryOperator *Inc;
+  PHINode *BasePhi;
+  if (!matchStridedRecurrence(VecIndex, L, Stride, BasePhi, Inc, Builder))
+    return std::make_pair(nullptr, nullptr);
+
+  assert(BasePhi->getNumIncomingValues() == 2 && "Expected 2 operand phi.");
+  unsigned IncrementingBlock = BasePhi->getOperand(0) == Inc ? 0 : 1;
+  assert(BasePhi->getIncomingValue(IncrementingBlock) == Inc &&
+         "Expected one operand of phi to be Inc");
+
+  Builder.SetInsertPoint(GEP);
+
+  // Replace the vector index with the scalar phi and build a scalar GEP.
+  Ops[*VecOperand] = BasePhi;
+  Type *SourceTy = GEP->getSourceElementType();
+  Value *BasePtr =
+      Builder.CreateGEP(SourceTy, Ops[0], makeArrayRef(Ops).drop_front());
+
+  // Cast the GEP to an i8*.
+  LLVMContext &Ctx = GEP->getContext();
+  Type *I8PtrTy =
+      Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace());
+  if (BasePtr->getType() != I8PtrTy)
+    BasePtr = Builder.CreatePointerCast(BasePtr, I8PtrTy);
+
+  // Final adjustments to stride should go in the start block.
+  Builder.SetInsertPoint(
+      BasePhi->getIncomingBlock(1 - IncrementingBlock)->getTerminator());
+
+  // Convert stride to pointer size if needed.
+  Type *IntPtrTy = DL->getIntPtrType(BasePtr->getType());
+  assert(Stride->getType() == IntPtrTy && "Unexpected type");
+
+  // Scale the stride by the size of the indexed type.
+  if (TypeScale != 1)
+    Stride = Builder.CreateMul(Stride, ConstantInt::get(IntPtrTy, TypeScale));
+
+  return std::make_pair(BasePtr, Stride);
+}
+
+bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II,
+                                                           Type *DataType,
+                                                           Value *Ptr,
+                                                           Value *AlignOp) {
+  // Make sure the operation will be supported by the backend.
+  if (!isLegalTypeAndAlignment(DataType, AlignOp))
+    return false;
+
+  // Pointer should be a GEP.
+  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  if (!GEP)
+    return false;
+
+  IRBuilder<> Builder(GEP);
+
+  Value *BasePtr, *Stride;
+  std::tie(BasePtr, Stride) = determineBaseAndStride(GEP, Builder);
+  if (!BasePtr)
+    return false;
+  assert(Stride != nullptr);
+
+  Builder.SetInsertPoint(II);
+
+  CallInst *Call;
+  if (II->getIntrinsicID() == Intrinsic::masked_gather)
+    Call = Builder.CreateIntrinsic(
+        Intrinsic::riscv_masked_strided_load,
+        {DataType, BasePtr->getType(), Stride->getType()},
+        {II->getArgOperand(3), BasePtr, Stride, II->getArgOperand(2)});
+  else
+    Call = Builder.CreateIntrinsic(
+        Intrinsic::riscv_masked_strided_store,
+        {DataType, BasePtr->getType(), Stride->getType()},
+        {II->getArgOperand(0), BasePtr, Stride, II->getArgOperand(3)});
+
+  Call->takeName(II);
+  II->replaceAllUsesWith(Call);
+  II->eraseFromParent();
+
+  if (GEP->use_empty())
+    RecursivelyDeleteTriviallyDeadInstructions(GEP);
+
+  return true;
+}
+
+bool RISCVGatherScatterLowering::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  auto &TPC = getAnalysis<TargetPassConfig>();
+  auto &TM = TPC.getTM<RISCVTargetMachine>();
+  ST = &TM.getSubtarget<RISCVSubtarget>(F);
+  if (!ST->hasStdExtV() || !ST->useRVVForFixedLengthVectors())
+    return false;
+
+  TLI = ST->getTargetLowering();
+  DL = &F.getParent()->getDataLayout();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+  SmallVector<IntrinsicInst *, 4> Gathers;
+  SmallVector<IntrinsicInst *, 4> Scatters;
+
+  bool Changed = false;
+
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+      if (II && II->getIntrinsicID() == Intrinsic::masked_gather &&
+          isa<FixedVectorType>(II->getType())) {
+        Gathers.push_back(II);
+      } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter &&
+                 isa<FixedVectorType>(II->getArgOperand(0)->getType())) {
+        Scatters.push_back(II);
+      }
+    }
+  }
+
+  // Rewrite gather/scatter to form strided load/store if possible.
+  for (auto *II : Gathers)
+    Changed |= tryCreateStridedLoadStore(
+        II, II->getType(), II->getArgOperand(0), II->getArgOperand(1));
+  for (auto *II : Scatters)
+    Changed |=
+        tryCreateStridedLoadStore(II, II->getArgOperand(0)->getType(),
+                                  II->getArgOperand(1), II->getArgOperand(2));
+
+  // Remove any dead phis.
+  while (!MaybeDeadPHIs.empty()) {
+    if (auto *Phi = dyn_cast_or_null<PHINode>(MaybeDeadPHIs.pop_back_val()))
+      RecursivelyDeleteDeadPHINode(Phi);
+  }
+
+  return Changed;
+}

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f7969bdd27e4..fe5dc60b8b7b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -20,6 +20,7 @@
 #include "RISCVTargetMachine.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -436,6 +437,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     }
 
     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+    setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 
     static unsigned IntegerVPOps[] = {
         ISD::VP_ADD,  ISD::VP_SUB,  ISD::VP_MUL, ISD::VP_SDIV, ISD::VP_UDIV,
@@ -948,6 +950,23 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                  MachineMemOperand::MOVolatile;
     return true;
   }
+  case Intrinsic::riscv_masked_strided_load:
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.ptrVal = I.getArgOperand(1);
+    Info.memVT = MVT::getVT(I.getType()->getScalarType());
+    Info.align = Align(I.getType()->getScalarSizeInBits() / 8);
+    Info.size = MemoryLocation::UnknownSize;
+    Info.flags |= MachineMemOperand::MOLoad;
+    return true;
+  case Intrinsic::riscv_masked_strided_store:
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.ptrVal = I.getArgOperand(1);
+    Info.memVT = MVT::getVT(I.getArgOperand(0)->getType()->getScalarType());
+    Info.align =
+        Align(I.getArgOperand(0)->getType()->getScalarSizeInBits() / 8);
+    Info.size = MemoryLocation::UnknownSize;
+    Info.flags |= MachineMemOperand::MOStore;
+    return true;
   }
 }
 
@@ -1274,6 +1293,24 @@ bool RISCVTargetLowering::mergeStoresAfterLegalization(EVT VT) const {
          (VT.isFixedLengthVector() && VT.getVectorElementType() == MVT::i1);
 }
 
+bool RISCVTargetLowering::isLegalElementTypeForRVV(Type *ScalarTy) const {
+  if (ScalarTy->isPointerTy())
+    return true;
+
+  if (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
+      ScalarTy->isIntegerTy(32) || ScalarTy->isIntegerTy(64))
+    return true;
+
+  if (ScalarTy->isHalfTy())
+    return Subtarget.hasStdExtZfh();
+  if (ScalarTy->isFloatTy())
+    return Subtarget.hasStdExtF();
+  if (ScalarTy->isDoubleTy())
+    return Subtarget.hasStdExtD();
+
+  return false;
+}
+
 static bool useRVVForFixedLengthVectorVT(MVT VT,
                                          const RISCVSubtarget &Subtarget) {
   assert(VT.isFixedLengthVector() && "Expected a fixed length vector type!");
@@ -2336,6 +2373,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::INTRINSIC_W_CHAIN:
     return LowerINTRINSIC_W_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_VOID:
+    return LowerINTRINSIC_VOID(Op, DAG);
   case ISD::BSWAP:
   case ISD::BITREVERSE: {
     // Convert BSWAP/BITREVERSE to GREVI to enable GREVI combinining.
@@ -3887,9 +3926,109 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
 
 SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                     SelectionDAG &DAG) const {
+  unsigned IntNo = Op.getConstantOperandVal(1);
+  switch (IntNo) {
+  default:
+    break;
+  case Intrinsic::riscv_masked_strided_load: {
+    SDLoc DL(Op);
+    MVT XLenVT = Subtarget.getXLenVT();
+
+    // If the mask is known to be all ones, optimize to an unmasked intrinsic;
+    // the selection of the masked intrinsics doesn't do this for us.
+    SDValue Mask = Op.getOperand(5);
+    bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
+
+    MVT VT = Op->getSimpleValueType(0);
+    MVT ContainerVT = getContainerForFixedLengthVector(VT);
+
+    SDValue PassThru = Op.getOperand(2);
+    if (!IsUnmasked) {
+      MVT MaskVT =
+          MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+      Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+      PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget);
+    }
+
+    SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
+
+    SDValue IntID = DAG.getTargetConstant(
+        IsUnmasked ? Intrinsic::riscv_vlse : Intrinsic::riscv_vlse_mask, DL,
+        XLenVT);
+
+    auto *Load = cast<MemIntrinsicSDNode>(Op);
+    SmallVector<SDValue, 8> Ops{Load->getChain(), IntID};
+    if (!IsUnmasked)
+      Ops.push_back(PassThru);
+    Ops.push_back(Op.getOperand(3)); // Ptr
+    Ops.push_back(Op.getOperand(4)); // Stride
+    if (!IsUnmasked)
+      Ops.push_back(Mask);
+    Ops.push_back(VL);
+
+    SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
+    SDValue Result =
+        DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
+                                Load->getMemoryVT(), Load->getMemOperand());
+    SDValue Chain = Result.getValue(1);
+    Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
+    return DAG.getMergeValues({Result, Chain}, DL);
+  }
+  }
+
   return lowerVectorIntrinsicSplats(Op, DAG, Subtarget);
 }
 
+SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  unsigned IntNo = Op.getConstantOperandVal(1);
+  switch (IntNo) {
+  default:
+    break;
+  case Intrinsic::riscv_masked_strided_store: {
+    SDLoc DL(Op);
+    MVT XLenVT = Subtarget.getXLenVT();
+
+    // If the mask is known to be all ones, optimize to an unmasked intrinsic;
+    // the selection of the masked intrinsics doesn't do this for us.
+    SDValue Mask = Op.getOperand(5);
+    bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
+
+    SDValue Val = Op.getOperand(2);
+    MVT VT = Val.getSimpleValueType();
+    MVT ContainerVT = getContainerForFixedLengthVector(VT);
+
+    Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget);
+    if (!IsUnmasked) {
+      MVT MaskVT =
+          MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+      Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+    }
+
+    SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
+
+    SDValue IntID = DAG.getTargetConstant(
+        IsUnmasked ? Intrinsic::riscv_vsse : Intrinsic::riscv_vsse_mask, DL,
+        XLenVT);
+
+    auto *Store = cast<MemIntrinsicSDNode>(Op);
+    SmallVector<SDValue, 8> Ops{Store->getChain(), IntID};
+    Ops.push_back(Val);
+    Ops.push_back(Op.getOperand(3)); // Ptr
+    Ops.push_back(Op.getOperand(4)); // Stride
+    if (!IsUnmasked)
+      Ops.push_back(Mask);
+    Ops.push_back(VL);
+
+    return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Store->getVTList(),
+                                   Ops, Store->getMemoryVT(),
+                                   Store->getMemOperand());
+  }
+  }
+
+  return SDValue();
+}
+
 static MVT getLMUL1VT(MVT VT) {
   assert(VT.getVectorElementType().getSizeInBits() <= 64 &&
          "Unexpected vector MVT");

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 45173fc39d28..64d52089ee1a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -510,6 +510,8 @@ class RISCVTargetLowering : public TargetLowering {
 
   bool shouldRemoveExtendFromGSIndex(EVT VT) const override;
 
+  bool isLegalElementTypeForRVV(Type *ScalarTy) const;
+
 private:
   /// RISCVCCAssignFn - This target-specific function extends the default
   /// CCValAssign with additional information used to lower RISC-V calling
@@ -558,6 +560,7 @@ class RISCVTargetLowering : public TargetLowering {
   SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVectorMaskVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const;

diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index b18ee6009217..71ca065f884e 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -37,6 +37,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
   auto *PR = PassRegistry::getPassRegistry();
   initializeGlobalISel(*PR);
+  initializeRISCVGatherScatterLoweringPass(*PR);
   initializeRISCVMergeBaseOffsetOptPass(*PR);
   initializeRISCVExpandPseudoPass(*PR);
   initializeRISCVInsertVSETVLIPass(*PR);
@@ -149,6 +150,9 @@ TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) {
 
 void RISCVPassConfig::addIRPasses() {
   addPass(createAtomicExpandPass());
+
+  addPass(createRISCVGatherScatterLoweringPass());
+
   TargetPassConfig::addIRPasses();
 }
 

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index b1d0e386f992..04a21893e8c9 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -83,24 +83,6 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
                                          TTI::TargetCostKind CostKind,
                                          const Instruction *I);
 
-  bool isLegalElementTypeForRVV(Type *ScalarTy) const {
-    if (ScalarTy->isPointerTy())
-      return true;
-
-    if (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
-        ScalarTy->isIntegerTy(32) || ScalarTy->isIntegerTy(64))
-      return true;
-
-    if (ScalarTy->isHalfTy())
-      return ST->hasStdExtZfh();
-    if (ScalarTy->isFloatTy())
-      return ST->hasStdExtF();
-    if (ScalarTy->isDoubleTy())
-      return ST->hasStdExtD();
-
-    return false;
-  }
-
   bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
     if (!ST->hasStdExtV())
       return false;
@@ -119,7 +101,7 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
         DL.getTypeStoreSize(DataType->getScalarType()).getFixedSize())
       return false;
 
-    return isLegalElementTypeForRVV(DataType->getScalarType());
+    return TLI->isLegalElementTypeForRVV(DataType->getScalarType());
   }
 
   bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
@@ -147,7 +129,7 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
         DL.getTypeStoreSize(DataType->getScalarType()).getFixedSize())
       return false;
 
-    return isLegalElementTypeForRVV(DataType->getScalarType());
+    return TLI->isLegalElementTypeForRVV(DataType->getScalarType());
   }
 
   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
@@ -174,7 +156,7 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
       return true;
 
     Type *Ty = RdxDesc.getRecurrenceType();
-    if (!isLegalElementTypeForRVV(Ty))
+    if (!TLI->isLegalElementTypeForRVV(Ty))
       return false;
 
     switch (RdxDesc.getRecurrenceKind()) {

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-negative.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-negative.ll
new file mode 100644
index 000000000000..1bc2adb71c72
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-negative.ll
@@ -0,0 +1,155 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt %s -S -riscv-gather-scatter-lowering -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=256 | FileCheck %s
+
+; This contains negative tests for the strided load/store recognition in
+; RISCVGatherScatterLowering.cpp
+
+; Negative test for treating OR as ADD.
+define void @gather_bad_or(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) {
+; CHECK-LABEL: @gather_bad_or(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[I:%.*]] = mul nuw nsw <32 x i64> [[VEC_IND]], <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+; CHECK-NEXT:    [[OR:%.*]] = or <32 x i64> [[I]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], <32 x i64> [[OR]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> [[I1]], i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[I3:%.*]] = bitcast i8* [[I2]] to <32 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[I3]], align 1
+; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[I5:%.*]] = bitcast i8* [[I2]] to <32 x i8>*
+; CHECK-NEXT:    store <32 x i8> [[I4]], <32 x i8>* [[I5]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <32 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
+  %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+  %or = or <32 x i64> %i, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %i1 = getelementptr inbounds i8, i8* %B, <32 x i64> %or
+  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+  %i2 = getelementptr inbounds i8, i8* %A, i64 %index
+  %i3 = bitcast i8* %i2 to <32 x i8>*
+  %wide.load = load <32 x i8>, <32 x i8>* %i3, align 1
+  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
+  %i5 = bitcast i8* %i2 to <32 x i8>*
+  store <32 x i8> %i4, <32 x i8>* %i5, align 1
+  %index.next = add nuw i64 %index, 32
+  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %i6 = icmp eq i64 %index.next, 1024
+  br i1 %i6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+; Don't transform since we might not handle wrap correctly with narrow indices.
+define void @gather_narrow_index(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) {
+; CHECK-LABEL: @gather_narrow_index(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <32 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = mul nuw nsw <32 x i32> [[VEC_IND]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], <32 x i32> [[TMP0]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> [[TMP1]], i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <32 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <32 x i8>*
+; CHECK-NEXT:    store <32 x i8> [[TMP4]], <32 x i8>* [[TMP5]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <32 x i32> [[VEC_IND]], <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <32 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>, %entry ], [ %vec.ind.next, %vector.body ]
+  %0 = mul nuw nsw <32 x i32> %vec.ind, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  %1 = getelementptr inbounds i8, i8* %B, <32 x i32> %0
+  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+  %2 = getelementptr inbounds i8, i8* %A, i64 %index
+  %3 = bitcast i8* %2 to <32 x i8>*
+  %wide.load = load <32 x i8>, <32 x i8>* %3, align 1
+  %4 = add <32 x i8> %wide.load, %wide.masked.gather
+  %5 = bitcast i8* %2 to <32 x i8>*
+  store <32 x i8> %4, <32 x i8>* %5, align 1
+  %index.next = add nuw i64 %index, 32
+  %vec.ind.next = add <32 x i32> %vec.ind, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+  %6 = icmp eq i64 %index.next, 1024
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+; The last element of the start value of the phi has the wrong stride.
+define void @gather_broken_stride(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) {
+; CHECK-LABEL: @gather_broken_stride(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 32>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[I:%.*]] = mul nuw nsw <32 x i64> [[VEC_IND]], <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+; CHECK-NEXT:    [[OR:%.*]] = or <32 x i64> [[I]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], <32 x i64> [[OR]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> [[I1]], i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[I3:%.*]] = bitcast i8* [[I2]] to <32 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[I3]], align 1
+; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[I5:%.*]] = bitcast i8* [[I2]] to <32 x i8>*
+; CHECK-NEXT:    store <32 x i8> [[I4]], <32 x i8>* [[I5]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <32 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 32>, %entry ], [ %vec.ind.next, %vector.body ]
+  %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+  %or = or <32 x i64> %i, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %i1 = getelementptr inbounds i8, i8* %B, <32 x i64> %or
+  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+  %i2 = getelementptr inbounds i8, i8* %A, i64 %index
+  %i3 = bitcast i8* %i2 to <32 x i8>*
+  %wide.load = load <32 x i8>, <32 x i8>* %i3, align 1
+  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
+  %i5 = bitcast i8* %i2 to <32 x i8>*
+  store <32 x i8> %i4, <32 x i8>* %i5, align 1
+  %index.next = add nuw i64 %index, 32
+  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %i6 = icmp eq i64 %index.next, 1024
+  br i1 %i6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+declare <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*>, i32 immarg, <32 x i1>, <32 x i8>)

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll
new file mode 100644
index 000000000000..bd89f91ea897
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll
@@ -0,0 +1,831 @@
+; RUN: opt %s -S -riscv-gather-scatter-lowering -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=256 | FileCheck %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefix=CHECK-ASM
+
+%struct.foo = type { i32, i32, i32, i32 }
+
+; void gather(signed char * __restrict  A, signed char * __restrict B) {
+;   for (int i = 0; i != 1024; ++i)
+;       A[i] += B[i * 5];
+; }
+define void @gather(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) {
+; CHECK-LABEL: @gather(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* [[B:%.*]], i64 [[VEC_IND_SCALAR]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> undef, i8* [[TMP0]], i64 5, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>*
+; CHECK-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+; CHECK-ASM-LABEL: gather:
+; CHECK-ASM:       # %bb.0: # %entry
+; CHECK-ASM-NEXT:    mv a2, zero
+; CHECK-ASM-NEXT:    addi a6, zero, 32
+; CHECK-ASM-NEXT:    addi a4, zero, 5
+; CHECK-ASM-NEXT:    addi a5, zero, 1024
+; CHECK-ASM-NEXT:  .LBB0_1: # %vector.body
+; CHECK-ASM-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-ASM-NEXT:    vsetvli zero, a6, e8, m1, ta, mu
+; CHECK-ASM-NEXT:    vlse8.v v25, (a1), a4
+; CHECK-ASM-NEXT:    add a3, a0, a2
+; CHECK-ASM-NEXT:    vle8.v v26, (a3)
+; CHECK-ASM-NEXT:    vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT:    vse8.v v25, (a3)
+; CHECK-ASM-NEXT:    addi a2, a2, 32
+; CHECK-ASM-NEXT:    addi a1, a1, 160
+; CHECK-ASM-NEXT:    bne a2, a5, .LBB0_1
+; CHECK-ASM-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-ASM-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
+  %0 = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+  %1 = getelementptr inbounds i8, i8* %B, <32 x i64> %0
+  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+  %2 = getelementptr inbounds i8, i8* %A, i64 %index
+  %3 = bitcast i8* %2 to <32 x i8>*
+  %wide.load = load <32 x i8>, <32 x i8>* %3, align 1
+  %4 = add <32 x i8> %wide.load, %wide.masked.gather
+  %5 = bitcast i8* %2 to <32 x i8>*
+  store <32 x i8> %4, <32 x i8>* %5, align 1
+  %index.next = add nuw i64 %index, 32
+  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %6 = icmp eq i64 %index.next, 1024
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+define void @gather_masked(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, <32 x i8> %maskedoff) {
+; CHECK-LABEL: @gather_masked(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* [[B:%.*]], i64 [[VEC_IND_SCALAR]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> [[MASKEDOFF:%.*]], i8* [[TMP0]], i64 5, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>*
+; CHECK-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+; CHECK-ASM-LABEL: gather_masked:
+; CHECK-ASM:       # %bb.0: # %entry
+; CHECK-ASM-NEXT:    mv a2, zero
+; CHECK-ASM-NEXT:    lui a3, 983765
+; CHECK-ASM-NEXT:    addiw a3, a3, 873
+; CHECK-ASM-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
+; CHECK-ASM-NEXT:    vmv.s.x v0, a3
+; CHECK-ASM-NEXT:    addi a6, zero, 32
+; CHECK-ASM-NEXT:    addi a4, zero, 5
+; CHECK-ASM-NEXT:    addi a5, zero, 1024
+; CHECK-ASM-NEXT:  .LBB1_1: # %vector.body
+; CHECK-ASM-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-ASM-NEXT:    vsetvli zero, a6, e8, m1, tu, mu
+; CHECK-ASM-NEXT:    vmv1r.v v25, v8
+; CHECK-ASM-NEXT:    vlse8.v v25, (a1), a4, v0.t
+; CHECK-ASM-NEXT:    add a3, a0, a2
+; CHECK-ASM-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
+; CHECK-ASM-NEXT:    vle8.v v26, (a3)
+; CHECK-ASM-NEXT:    vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT:    vse8.v v25, (a3)
+; CHECK-ASM-NEXT:    addi a2, a2, 32
+; CHECK-ASM-NEXT:    addi a1, a1, 160
+; CHECK-ASM-NEXT:    bne a2, a5, .LBB1_1
+; CHECK-ASM-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-ASM-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
+  %0 = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+  %1 = getelementptr inbounds i8, i8* %B, <32 x i64> %0
+  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
+  %2 = getelementptr inbounds i8, i8* %A, i64 %index
+  %3 = bitcast i8* %2 to <32 x i8>*
+  %wide.load = load <32 x i8>, <32 x i8>* %3, align 1
+  %4 = add <32 x i8> %wide.load, %wide.masked.gather
+  %5 = bitcast i8* %2 to <32 x i8>*
+  store <32 x i8> %4, <32 x i8>* %5, align 1
+  %index.next = add nuw i64 %index, 32
+  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %6 = icmp eq i64 %index.next, 1024
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+define void @gather_negative_stride(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) {
+; CHECK-LABEL: @gather_negative_stride(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 155, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* [[B:%.*]], i64 [[VEC_IND_SCALAR]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> undef, i8* [[TMP0]], i64 -5, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>*
+; CHECK-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+; CHECK-ASM-LABEL: gather_negative_stride:
+; CHECK-ASM:       # %bb.0: # %entry
+; CHECK-ASM-NEXT:    mv a2, zero
+; CHECK-ASM-NEXT:    addi a1, a1, 155
+; CHECK-ASM-NEXT:    addi a6, zero, 32
+; CHECK-ASM-NEXT:    addi a4, zero, -5
+; CHECK-ASM-NEXT:    addi a5, zero, 1024
+; CHECK-ASM-NEXT:  .LBB2_1: # %vector.body
+; CHECK-ASM-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-ASM-NEXT:    vsetvli zero, a6, e8, m1, ta, mu
+; CHECK-ASM-NEXT:    vlse8.v v25, (a1), a4
+; CHECK-ASM-NEXT:    add a3, a0, a2
+; CHECK-ASM-NEXT:    vle8.v v26, (a3)
+; CHECK-ASM-NEXT:    vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT:    vse8.v v25, (a3)
+; CHECK-ASM-NEXT:    addi a2, a2, 32
+; CHECK-ASM-NEXT:    addi a1, a1, 160
+; CHECK-ASM-NEXT:    bne a2, a5, .LBB2_1
+; CHECK-ASM-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-ASM-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <32 x i64> [ <i64 31, i64 30, i64 29, i64 28, i64 27, i64 26, i64 25, i64 24, i64 23, i64 22, i64 21, i64 20, i64 19, i64 18, i64 17, i64 16, i64 15, i64 14, i64 13, i64 12, i64 11, i64 10, i64 9, i64 8, i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, %entry ], [ %vec.ind.next, %vector.body ]
+  %0 = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+  %1 = getelementptr inbounds i8, i8* %B, <32 x i64> %0
+  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+  %2 = getelementptr inbounds i8, i8* %A, i64 %index
+  %3 = bitcast i8* %2 to <32 x i8>*
+  %wide.load = load <32 x i8>, <32 x i8>* %3, align 1
+  %4 = add <32 x i8> %wide.load, %wide.masked.gather
+  %5 = bitcast i8* %2 to <32 x i8>*
+  store <32 x i8> %4, <32 x i8>* %5, align 1
+  %index.next = add nuw i64 %index, 32
+  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %6 = icmp eq i64 %index.next, 1024
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+define void @gather_zero_stride(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) {
+; CHECK-LABEL: @gather_zero_stride(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* [[B:%.*]], i64 [[VEC_IND_SCALAR]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> undef, i8* [[TMP0]], i64 0, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>*
+; CHECK-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+; CHECK-ASM-LABEL: gather_zero_stride:
+; CHECK-ASM:       # %bb.0: # %entry
+; CHECK-ASM-NEXT:    mv a2, zero
+; CHECK-ASM-NEXT:    addi a3, zero, 32
+; CHECK-ASM-NEXT:    addi a4, zero, 1024
+; CHECK-ASM-NEXT:  .LBB3_1: # %vector.body
+; CHECK-ASM-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-ASM-NEXT:    vsetvli zero, a3, e8, m1, ta, mu
+; CHECK-ASM-NEXT:    vlse8.v v25, (a1), zero
+; CHECK-ASM-NEXT:    add a5, a0, a2
+; CHECK-ASM-NEXT:    vle8.v v26, (a5)
+; CHECK-ASM-NEXT:    vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT:    vse8.v v25, (a5)
+; CHECK-ASM-NEXT:    addi a2, a2, 32
+; CHECK-ASM-NEXT:    addi a1, a1, 160
+; CHECK-ASM-NEXT:    bne a2, a4, .LBB3_1
+; CHECK-ASM-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-ASM-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
+  %0 = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+  %1 = getelementptr inbounds i8, i8* %B, <32 x i64> %0
+  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+  %2 = getelementptr inbounds i8, i8* %A, i64 %index
+  %3 = bitcast i8* %2 to <32 x i8>*
+  %wide.load = load <32 x i8>, <32 x i8>* %3, align 1
+  %4 = add <32 x i8> %wide.load, %wide.masked.gather
+  %5 = bitcast i8* %2 to <32 x i8>*
+  store <32 x i8> %4, <32 x i8>* %5, align 1
+  %index.next = add nuw i64 %index, 32
+  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %6 = icmp eq i64 %index.next, 1024
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+;void scatter(signed char * __restrict  A, signed char * __restrict B) {
+;  for (int i = 0; i < 1024; ++i)
+;      A[i * 5] += B[i];
+;}
+define void @scatter(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) {
+; CHECK-LABEL: @scatter(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <32 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* [[A:%.*]], i64 [[VEC_IND_SCALAR]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[A]], i64 [[VEC_IND_SCALAR1]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> undef, i8* [[TMP2]], i64 5, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP4:%.*]] = add <32 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
+; CHECK-NEXT:    call void @llvm.riscv.masked.strided.store.v32i8.p0i8.i64(<32 x i8> [[TMP4]], i8* [[TMP3]], i64 5, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 160
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+; CHECK-ASM-LABEL: scatter:
+; CHECK-ASM:       # %bb.0: # %entry
+; CHECK-ASM-NEXT:    mv a2, zero
+; CHECK-ASM-NEXT:    addi a6, zero, 32
+; CHECK-ASM-NEXT:    addi a4, zero, 5
+; CHECK-ASM-NEXT:    addi a5, zero, 1024
+; CHECK-ASM-NEXT:  .LBB4_1: # %vector.body
+; CHECK-ASM-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-ASM-NEXT:    add a3, a1, a2
+; CHECK-ASM-NEXT:    vsetvli zero, a6, e8, m1, ta, mu
+; CHECK-ASM-NEXT:    vle8.v v25, (a3)
+; CHECK-ASM-NEXT:    vlse8.v v26, (a0), a4
+; CHECK-ASM-NEXT:    vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT:    vsse8.v v25, (a0), a4
+; CHECK-ASM-NEXT:    addi a2, a2, 32
+; CHECK-ASM-NEXT:    addi a0, a0, 160
+; CHECK-ASM-NEXT:    bne a2, a5, .LBB4_1
+; CHECK-ASM-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-ASM-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
+  %0 = getelementptr inbounds i8, i8* %B, i64 %index
+  %1 = bitcast i8* %0 to <32 x i8>*
+  %wide.load = load <32 x i8>, <32 x i8>* %1, align 1
+  %2 = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+  %3 = getelementptr inbounds i8, i8* %A, <32 x i64> %2
+  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %3, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+  %4 = add <32 x i8> %wide.masked.gather, %wide.load
+  call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> %4, <32 x i8*> %3, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  %index.next = add nuw i64 %index, 32
+  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %5 = icmp eq i64 %index.next, 1024
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+define void @scatter_masked(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, <32 x i8> %maskedoff) {
+; CHECK-LABEL: @scatter_masked(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <32 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* [[A:%.*]], i64 [[VEC_IND_SCALAR]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[A]], i64 [[VEC_IND_SCALAR1]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> [[MASKEDOFF:%.*]], i8* [[TMP2]], i64 5, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP4:%.*]] = add <32 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
+; CHECK-NEXT:    call void @llvm.riscv.masked.strided.store.v32i8.p0i8.i64(<32 x i8> [[TMP4]], i8* [[TMP3]], i64 5, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 160
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+; CHECK-ASM-LABEL: scatter_masked:
+; CHECK-ASM:       # %bb.0: # %entry
+; CHECK-ASM-NEXT:    mv a2, zero
+; CHECK-ASM-NEXT:    addi a6, zero, 32
+; CHECK-ASM-NEXT:    lui a4, 983765
+; CHECK-ASM-NEXT:    addiw a4, a4, 873
+; CHECK-ASM-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
+; CHECK-ASM-NEXT:    vmv.s.x v0, a4
+; CHECK-ASM-NEXT:    addi a4, zero, 5
+; CHECK-ASM-NEXT:    addi a5, zero, 1024
+; CHECK-ASM-NEXT:  .LBB5_1: # %vector.body
+; CHECK-ASM-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-ASM-NEXT:    add a3, a1, a2
+; CHECK-ASM-NEXT:    vsetvli zero, a6, e8, m1, ta, mu
+; CHECK-ASM-NEXT:    vle8.v v25, (a3)
+; CHECK-ASM-NEXT:    vsetvli zero, zero, e8, m1, tu, mu
+; CHECK-ASM-NEXT:    vmv1r.v v26, v8
+; CHECK-ASM-NEXT:    vlse8.v v26, (a0), a4, v0.t
+; CHECK-ASM-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
+; CHECK-ASM-NEXT:    vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT:    vsse8.v v25, (a0), a4, v0.t
+; CHECK-ASM-NEXT:    addi a2, a2, 32
+; CHECK-ASM-NEXT:    addi a0, a0, 160
+; CHECK-ASM-NEXT:    bne a2, a5, .LBB5_1
+; CHECK-ASM-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-ASM-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
+  %0 = getelementptr inbounds i8, i8* %B, i64 %index
+  %1 = bitcast i8* %0 to <32 x i8>*
+  %wide.load = load <32 x i8>, <32 x i8>* %1, align 1
+  %2 = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+  %3 = getelementptr inbounds i8, i8* %A, <32 x i64> %2
+  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
+  %4 = add <32 x i8> %wide.masked.gather, %wide.load
+  call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> %4, <32 x i8*> %3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
+  %index.next = add nuw i64 %index, 32
+  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %5 = icmp eq i64 %index.next, 1024
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+; void gather_pow2(signed char * __restrict  A, signed char * __restrict B) {
+;   for (int i = 0; i != 1024; ++i)
+;       A[i] += B[i * 4];
+; }
+define void @gather_pow2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK-LABEL: @gather_pow2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[VEC_IND_SCALAR]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8*
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP1]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
+; CHECK-NEXT:    store <8 x i32> [[TMP4]], <8 x i32>* [[TMP5]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 32
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+; CHECK-ASM-LABEL: gather_pow2:
+; CHECK-ASM:       # %bb.0: # %entry
+; CHECK-ASM-NEXT:    addi a2, zero, 1024
+; CHECK-ASM-NEXT:    addi a3, zero, 16
+; CHECK-ASM-NEXT:    addi a4, zero, 32
+; CHECK-ASM-NEXT:  .LBB6_1: # %vector.body
+; CHECK-ASM-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-ASM-NEXT:    vsetivli zero, 8, e32, m1, ta, mu
+; CHECK-ASM-NEXT:    vlse32.v v25, (a1), a3
+; CHECK-ASM-NEXT:    vsetvli zero, a4, e8, m1, ta, mu
+; CHECK-ASM-NEXT:    vle8.v v26, (a0)
+; CHECK-ASM-NEXT:    vsetivli zero, 8, e32, m1, ta, mu
+; CHECK-ASM-NEXT:    vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT:    vsetvli zero, a4, e8, m1, ta, mu
+; CHECK-ASM-NEXT:    vse8.v v25, (a0)
+; CHECK-ASM-NEXT:    addi a2, a2, -8
+; CHECK-ASM-NEXT:    addi a0, a0, 32
+; CHECK-ASM-NEXT:    addi a1, a1, 128
+; CHECK-ASM-NEXT:    bnez a2, .LBB6_1
+; CHECK-ASM-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-ASM-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
+  %0 = shl nsw <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+  %1 = getelementptr inbounds i32, i32* %B, <8 x i64> %0
+  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %2 = getelementptr inbounds i32, i32* %A, i64 %index
+  %3 = bitcast i32* %2 to <8 x i32>*
+  %wide.load = load <8 x i32>, <8 x i32>* %3, align 1
+  %4 = add <8 x i32> %wide.load, %wide.masked.gather
+  %5 = bitcast i32* %2 to <8 x i32>*
+  store <8 x i32> %4, <8 x i32>* %5, align 1
+  %index.next = add nuw i64 %index, 8
+  %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+  %6 = icmp eq i64 %index.next, 1024
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+;void scatter_pow2(signed char * __restrict  A, signed char * __restrict B) {
+;  for (int i = 0; i < 1024; ++i)
+;      A[i * 4] += B[i];
+;}
+define void @scatter_pow2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK-LABEL: @scatter_pow2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[VEC_IND_SCALAR]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to i8*
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to i8*
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP3]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
+; CHECK-NEXT:    call void @llvm.riscv.masked.strided.store.v8i32.p0i8.i64(<8 x i32> [[TMP6]], i8* [[TMP5]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 32
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP7]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+; CHECK-ASM-LABEL: scatter_pow2:
+; CHECK-ASM:       # %bb.0: # %entry
+; CHECK-ASM-NEXT:    addi a2, zero, 1024
+; CHECK-ASM-NEXT:    addi a3, zero, 32
+; CHECK-ASM-NEXT:    addi a4, zero, 16
+; CHECK-ASM-NEXT:  .LBB7_1: # %vector.body
+; CHECK-ASM-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-ASM-NEXT:    vsetvli zero, a3, e8, m1, ta, mu
+; CHECK-ASM-NEXT:    vle8.v v25, (a1)
+; CHECK-ASM-NEXT:    vsetivli zero, 8, e32, m1, ta, mu
+; CHECK-ASM-NEXT:    vlse32.v v26, (a0), a4
+; CHECK-ASM-NEXT:    vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT:    vsse32.v v25, (a0), a4
+; CHECK-ASM-NEXT:    addi a2, a2, -8
+; CHECK-ASM-NEXT:    addi a1, a1, 32
+; CHECK-ASM-NEXT:    addi a0, a0, 128
+; CHECK-ASM-NEXT:    bnez a2, .LBB7_1
+; CHECK-ASM-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-ASM-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %B, i64 %index
+  %1 = bitcast i32* %0 to <8 x i32>*
+  %wide.load = load <8 x i32>, <8 x i32>* %1, align 1
+  %2 = shl nuw nsw <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+  %3 = getelementptr inbounds i32, i32* %A, <8 x i64> %2
+  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %3, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %4 = add <8 x i32> %wide.masked.gather, %wide.load
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %4, <8 x i32*> %3, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  %index.next = add nuw i64 %index, 8
+  %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+  %5 = icmp eq i64 %index.next, 1024
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+;struct foo {
+;  int a, b, c, d;
+;};
+;
+;void struct_gather(int * __restrict  A, struct foo * __restrict B) {
+;  for (int i = 0; i < 1024; ++i)
+;      A[i] += B[i].b;
+;}
+define void @struct_gather(i32* noalias nocapture %A, %struct.foo* noalias nocapture readonly %B) {
+; CHECK-LABEL: @struct_gather(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 8, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], %struct.foo* [[B:%.*]], i64 [[VEC_IND_SCALAR]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8*
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [[STRUCT_FOO]], %struct.foo* [[B]], i64 [[VEC_IND_SCALAR1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to i8*
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP1]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER9:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP3]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i64 8
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <8 x i32> [[WIDE_LOAD10]], [[WIDE_MASKED_GATHER9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
+; CHECK-NEXT:    store <8 x i32> [[TMP8]], <8 x i32>* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; CHECK-NEXT:    store <8 x i32> [[TMP9]], <8 x i32>* [[TMP11]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 16
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP12]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+; CHECK-ASM-LABEL: struct_gather:
+; CHECK-ASM:       # %bb.0: # %entry
+; CHECK-ASM-NEXT:    addi a0, a0, 32
+; CHECK-ASM-NEXT:    addi a1, a1, 132
+; CHECK-ASM-NEXT:    addi a2, zero, 1024
+; CHECK-ASM-NEXT:    addi a3, zero, 16
+; CHECK-ASM-NEXT:  .LBB8_1: # %vector.body
+; CHECK-ASM-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-ASM-NEXT:    addi a4, a1, -128
+; CHECK-ASM-NEXT:    vsetivli zero, 8, e32, m1, ta, mu
+; CHECK-ASM-NEXT:    vlse32.v v25, (a4), a3
+; CHECK-ASM-NEXT:    vlse32.v v26, (a1), a3
+; CHECK-ASM-NEXT:    addi a4, a0, -32
+; CHECK-ASM-NEXT:    vle32.v v27, (a4)
+; CHECK-ASM-NEXT:    vle32.v v28, (a0)
+; CHECK-ASM-NEXT:    vadd.vv v25, v27, v25
+; CHECK-ASM-NEXT:    vadd.vv v26, v28, v26
+; CHECK-ASM-NEXT:    vse32.v v25, (a4)
+; CHECK-ASM-NEXT:    vse32.v v26, (a0)
+; CHECK-ASM-NEXT:    addi a2, a2, -16
+; CHECK-ASM-NEXT:    addi a0, a0, 64
+; CHECK-ASM-NEXT:    addi a1, a1, 256
+; CHECK-ASM-NEXT:    bnez a2, .LBB8_1
+; CHECK-ASM-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-ASM-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
+  %step.add = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+  %0 = getelementptr inbounds %struct.foo, %struct.foo* %B, <8 x i64> %vec.ind, i32 1
+  %1 = getelementptr inbounds %struct.foo, %struct.foo* %B, <8 x i64> %step.add, i32 1
+  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %0, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %2 = getelementptr inbounds i32, i32* %A, i64 %index
+  %3 = bitcast i32* %2 to <8 x i32>*
+  %wide.load = load <8 x i32>, <8 x i32>* %3, align 4
+  %4 = getelementptr inbounds i32, i32* %2, i64 8
+  %5 = bitcast i32* %4 to <8 x i32>*
+  %wide.load10 = load <8 x i32>, <8 x i32>* %5, align 4
+  %6 = add nsw <8 x i32> %wide.load, %wide.masked.gather
+  %7 = add nsw <8 x i32> %wide.load10, %wide.masked.gather9
+  %8 = bitcast i32* %2 to <8 x i32>*
+  store <8 x i32> %6, <8 x i32>* %8, align 4
+  %9 = bitcast i32* %4 to <8 x i32>*
+  store <8 x i32> %7, <8 x i32>* %9, align 4
+  %index.next = add nuw i64 %index, 16
+  %vec.ind.next = add <8 x i64> %vec.ind, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+  %10 = icmp eq i64 %index.next, 1024
+  br i1 %10, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+;void gather_unroll(int * __restrict  A, int * __restrict B) {
+;  for (int i = 0; i < 1024; i+= 4 ) {
+;    A[i] += B[i * 4];
+;    A[i+1] += B[(i+1) * 4];
+;    A[i+2] += B[(i+2) * 4];
+;    A[i+3] += B[(i+3) * 4];
+;  }
+;}
+define void @gather_unroll(i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK-LABEL: @gather_unroll(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR3:%.*]] = phi i64 [ 4, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR5:%.*]] = phi i64 [ 1, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR7:%.*]] = phi i64 [ 8, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR9:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR11:%.*]] = phi i64 [ 12, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR13:%.*]] = phi i64 [ 3, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR15:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR17:%.*]] = phi i64 [ 1, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR19:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR21:%.*]] = phi i64 [ 3, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[VEC_IND_SCALAR]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8*
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP1]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[VEC_IND_SCALAR1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to i8*
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR15]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to i8*
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER52:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP3]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER52]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    call void @llvm.riscv.masked.strided.store.v8i32.p0i8.i64(<8 x i32> [[TMP6]], i8* [[TMP5]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[B]], i64 [[VEC_IND_SCALAR3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER53:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP8]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to i8*
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR17]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to i8*
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER54:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP10]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER54]], [[WIDE_MASKED_GATHER53]]
+; CHECK-NEXT:    call void @llvm.riscv.masked.strided.store.v8i32.p0i8.i64(<8 x i32> [[TMP13]], i8* [[TMP12]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, i32* [[B]], i64 [[VEC_IND_SCALAR7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to i8*
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER55:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP15]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR9]]
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to i8*
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR19]]
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to i8*
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER56:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP17]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER56]], [[WIDE_MASKED_GATHER55]]
+; CHECK-NEXT:    call void @llvm.riscv.masked.strided.store.v8i32.p0i8.i64(<8 x i32> [[TMP20]], i8* [[TMP19]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i32, i32* [[B]], i64 [[VEC_IND_SCALAR11]]
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to i8*
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER57:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP22]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR13]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to i8*
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR21]]
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast i32* [[TMP25]] to i8*
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER58:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP24]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP27:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER58]], [[WIDE_MASKED_GATHER57]]
+; CHECK-NEXT:    call void @llvm.riscv.masked.strided.store.v8i32.p0i8.i64(<8 x i32> [[TMP27]], i8* [[TMP26]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 128
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR4]] = add i64 [[VEC_IND_SCALAR3]], 128
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR6]] = add i64 [[VEC_IND_SCALAR5]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR8]] = add i64 [[VEC_IND_SCALAR7]], 128
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR10]] = add i64 [[VEC_IND_SCALAR9]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR12]] = add i64 [[VEC_IND_SCALAR11]], 128
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR14]] = add i64 [[VEC_IND_SCALAR13]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR16]] = add i64 [[VEC_IND_SCALAR15]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR18]] = add i64 [[VEC_IND_SCALAR17]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR20]] = add i64 [[VEC_IND_SCALAR19]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR22]] = add i64 [[VEC_IND_SCALAR21]], 32
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP28]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+; CHECK-ASM-LABEL: gather_unroll:
+; CHECK-ASM:       # %bb.0: # %entry
+; CHECK-ASM-NEXT:    addi a2, zero, 256
+; CHECK-ASM-NEXT:    addi a3, zero, 64
+; CHECK-ASM-NEXT:    addi a4, zero, 16
+; CHECK-ASM-NEXT:  .LBB9_1: # %vector.body
+; CHECK-ASM-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-ASM-NEXT:    vsetivli zero, 8, e32, m1, ta, mu
+; CHECK-ASM-NEXT:    vlse32.v v25, (a1), a3
+; CHECK-ASM-NEXT:    vlse32.v v26, (a0), a4
+; CHECK-ASM-NEXT:    vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT:    vsse32.v v25, (a0), a4
+; CHECK-ASM-NEXT:    addi a5, a1, 16
+; CHECK-ASM-NEXT:    vlse32.v v25, (a5), a3
+; CHECK-ASM-NEXT:    addi a5, a0, 4
+; CHECK-ASM-NEXT:    vlse32.v v26, (a5), a4
+; CHECK-ASM-NEXT:    vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT:    vsse32.v v25, (a5), a4
+; CHECK-ASM-NEXT:    addi a5, a1, 32
+; CHECK-ASM-NEXT:    vlse32.v v25, (a5), a3
+; CHECK-ASM-NEXT:    addi a5, a0, 8
+; CHECK-ASM-NEXT:    vlse32.v v26, (a5), a4
+; CHECK-ASM-NEXT:    vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT:    vsse32.v v25, (a5), a4
+; CHECK-ASM-NEXT:    addi a5, a1, 48
+; CHECK-ASM-NEXT:    vlse32.v v25, (a5), a3
+; CHECK-ASM-NEXT:    addi a5, a0, 12
+; CHECK-ASM-NEXT:    vlse32.v v26, (a5), a4
+; CHECK-ASM-NEXT:    vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT:    vsse32.v v25, (a5), a4
+; CHECK-ASM-NEXT:    addi a2, a2, -8
+; CHECK-ASM-NEXT:    addi a1, a1, 512
+; CHECK-ASM-NEXT:    addi a0, a0, 128
+; CHECK-ASM-NEXT:    bnez a2, .LBB9_1
+; CHECK-ASM-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-ASM-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <8 x i64> [ <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>, %entry ], [ %vec.ind.next, %vector.body ]
+  %0 = shl nuw nsw <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+  %1 = getelementptr inbounds i32, i32* %B, <8 x i64> %0
+  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %2 = getelementptr inbounds i32, i32* %A, <8 x i64> %vec.ind
+  %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %3 = add nsw <8 x i32> %wide.masked.gather52, %wide.masked.gather
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %3, <8 x i32*> %2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  %4 = or <8 x i64> %vec.ind, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %5 = shl nsw <8 x i64> %4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+  %6 = getelementptr inbounds i32, i32* %B, <8 x i64> %5
+  %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %6, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %7 = getelementptr inbounds i32, i32* %A, <8 x i64> %4
+  %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %7, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %8 = add nsw <8 x i32> %wide.masked.gather54, %wide.masked.gather53
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %8, <8 x i32*> %7, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  %9 = or <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+  %10 = shl nsw <8 x i64> %9, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+  %11 = getelementptr inbounds i32, i32* %B, <8 x i64> %10
+  %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %11, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %12 = getelementptr inbounds i32, i32* %A, <8 x i64> %9
+  %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %12, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %13 = add nsw <8 x i32> %wide.masked.gather56, %wide.masked.gather55
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %13, <8 x i32*> %12, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  %14 = or <8 x i64> %vec.ind, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
+  %15 = shl nsw <8 x i64> %14, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+  %16 = getelementptr inbounds i32, i32* %B, <8 x i64> %15
+  %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %16, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %17 = getelementptr inbounds i32, i32* %A, <8 x i64> %14
+  %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %17, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %18 = add nsw <8 x i32> %wide.masked.gather58, %wide.masked.gather57
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %18, <8 x i32*> %17, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  %index.next = add nuw i64 %index, 8
+  %vec.ind.next = add <8 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %19 = icmp eq i64 %index.next, 256
+  br i1 %19, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+declare <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*>, i32 immarg, <32 x i1>, <32 x i8>)
+declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32 immarg, <8 x i1>, <8 x i32>)
+declare void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8>, <32 x i8*>, i32 immarg, <32 x i1>)
+declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32>, <8 x i32*>, i32 immarg, <8 x i1>)

diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
index e21853ec3460..2690feefb0fd 100644
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -481,9 +481,10 @@ static bool shouldPinPassToLegacyPM(StringRef Pass) {
     return false;
 
   std::vector<StringRef> PassNamePrefix = {
-      "x86-",  "xcore-", "wasm-",    "systemz-", "ppc-",    "nvvm-",   "nvptx-",
-      "mips-", "lanai-", "hexagon-", "bpf-",     "avr-",    "thumb2-", "arm-",
-      "si-",   "gcn-",   "amdgpu-",  "aarch64-", "amdgcn-", "polly-"};
+      "x86-",    "xcore-", "wasm-",  "systemz-", "ppc-",    "nvvm-",
+      "nvptx-",  "mips-",  "lanai-", "hexagon-", "bpf-",    "avr-",
+      "thumb2-", "arm-",   "si-",    "gcn-",     "amdgpu-", "aarch64-",
+      "amdgcn-", "polly-", "riscv-"};
   std::vector<StringRef> PassNameContain = {"ehprepare"};
   std::vector<StringRef> PassNameExact = {
       "safe-stack",           "cost-model",
