[llvm] 294efcd - [RISCV] Add support for fixed vector masked gather/scatter.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 22 10:17:59 PDT 2021


Author: Craig Topper
Date: 2021-03-22T10:17:30-07:00
New Revision: 294efcd6f7e226013a63aeff6054f7e51909a4b5

URL: https://github.com/llvm/llvm-project/commit/294efcd6f7e226013a63aeff6054f7e51909a4b5
DIFF: https://github.com/llvm/llvm-project/commit/294efcd6f7e226013a63aeff6054f7e51909a4b5.diff

LOG: [RISCV] Add support for fixed vector masked gather/scatter.

I've split the gather/scatter custom handler into separate gather and
scatter handlers, to avoid complicating a single handler with even more
differences between the two operations.

The tests are the scalable vector tests with the vscale removed; the
tests that used vector.insert were dropped. We're probably not as
thorough on the splitting cases, since we use 128 for VLEN here whereas
the scalable vector tests use a known minimum size of 64.

Reviewed By: frasercrmck

Differential Revision: https://reviews.llvm.org/D98991
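
For the store side, the companion fixed-vectors-masked-scatter.ll
(listed under Added below) exercises llvm.masked.scatter in the same
way. A minimal sketch of that form (the function name here is
illustrative; the exact tests live in that file), which selects to a
masked vsoxei indexed store:

  declare void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8>, <2 x i8*>, i32, <2 x i1>)

  define void @mscatter_v2i8(<2 x i8> %val, <2 x i8*> %ptrs, <2 x i1> %m) {
    call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %val, <2 x i8*> %ptrs, i32 1, <2 x i1> %m)
    ret void
  }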

Added: 
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.h
    llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 3bde5158c9b1..76a6386a23f2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -585,6 +585,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
 
         setOperationAction(ISD::MLOAD, VT, Custom);
         setOperationAction(ISD::MSTORE, VT, Custom);
+        setOperationAction(ISD::MGATHER, VT, Custom);
+        setOperationAction(ISD::MSCATTER, VT, Custom);
         setOperationAction(ISD::ADD, VT, Custom);
         setOperationAction(ISD::MUL, VT, Custom);
         setOperationAction(ISD::SUB, VT, Custom);
@@ -656,6 +658,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         setOperationAction(ISD::STORE, VT, Custom);
         setOperationAction(ISD::MLOAD, VT, Custom);
         setOperationAction(ISD::MSTORE, VT, Custom);
+        setOperationAction(ISD::MGATHER, VT, Custom);
+        setOperationAction(ISD::MSCATTER, VT, Custom);
         setOperationAction(ISD::FADD, VT, Custom);
         setOperationAction(ISD::FSUB, VT, Custom);
         setOperationAction(ISD::FMUL, VT, Custom);
@@ -1724,8 +1728,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
   case ISD::FCOPYSIGN:
     return lowerFixedLengthVectorFCOPYSIGNToRVV(Op, DAG);
   case ISD::MGATHER:
+    return lowerMGATHER(Op, DAG);
   case ISD::MSCATTER:
-    return lowerMGATHERMSCATTER(Op, DAG);
+    return lowerMSCATTER(Op, DAG);
   }
 }
 
@@ -3487,54 +3492,154 @@ SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op, SelectionDAG &DAG,
 }
 
 // Custom lower MGATHER to a legalized form for RVV. It will then be matched to
-// a RVV indexed load. The RVV indexed load/store instructions only support the
+// a RVV indexed load. The RVV indexed load instructions only support the
 // "unsigned unscaled" addressing mode; indices are implicitly zero-extended or
 // truncated to XLEN and are treated as byte offsets. Any signed or scaled
 // indexing is extended to the XLEN value type and scaled accordingly.
-SDValue RISCVTargetLowering::lowerMGATHERMSCATTER(SDValue Op,
-                                                  SelectionDAG &DAG) const {
-  auto *N = cast<MaskedGatherScatterSDNode>(Op.getNode());
+SDValue RISCVTargetLowering::lowerMGATHER(SDValue Op, SelectionDAG &DAG) const {
+  auto *MGN = cast<MaskedGatherSDNode>(Op.getNode());
   SDLoc DL(Op);
-  SDValue Index = N->getIndex();
-  SDValue Mask = N->getMask();
 
+  SDValue Index = MGN->getIndex();
+  SDValue Mask = MGN->getMask();
+  SDValue PassThru = MGN->getPassThru();
+
+  MVT VT = Op.getSimpleValueType();
+  MVT IndexVT = Index.getSimpleValueType();
   MVT XLenVT = Subtarget.getXLenVT();
-  assert(N->getBasePtr().getSimpleValueType() == XLenVT &&
+
+  assert(VT.getVectorElementCount() == IndexVT.getVectorElementCount() &&
+         "Unexpected VTs!");
+  assert(MGN->getBasePtr().getSimpleValueType() == XLenVT &&
          "Unexpected pointer type");
-  // Targets have to explicitly opt-in for extending vector loads and
-  // truncating vector stores.
-  const auto *MGN = dyn_cast<MaskedGatherSDNode>(N);
-  const auto *MSN = dyn_cast<MaskedScatterSDNode>(N);
-  assert((!MGN || MGN->getExtensionType() == ISD::NON_EXTLOAD) &&
+  // Targets have to explicitly opt-in for extending vector loads.
+  assert(MGN->getExtensionType() == ISD::NON_EXTLOAD &&
          "Unexpected extending MGATHER");
-  assert((!MSN || !MSN->isTruncatingStore()) &&
-         "Unexpected extending MSCATTER");
 
   // If the mask is known to be all ones, optimize to an unmasked intrinsic;
   // the selection of the masked intrinsics doesn't do this for us.
-  unsigned IntID = 0;
+  bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
+
+  SDValue VL;
+  MVT ContainerVT = VT;
+  if (VT.isFixedLengthVector()) {
+    // We need to use the larger of the result and index type to determine the
+    // scalable type to use so we don't increase LMUL for any operand/result.
+    if (VT.bitsGE(IndexVT)) {
+      ContainerVT = getContainerForFixedLengthVector(VT);
+      IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(),
+                                 ContainerVT.getVectorElementCount());
+    } else {
+      IndexVT = getContainerForFixedLengthVector(IndexVT);
+      ContainerVT = MVT::getVectorVT(ContainerVT.getVectorElementType(),
+                                     IndexVT.getVectorElementCount());
+    }
+
+    Index = convertToScalableVector(IndexVT, Index, DAG, Subtarget);
+
+    if (!IsUnmasked) {
+      MVT MaskVT =
+          MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+      Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+      PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget);
+    }
+
+    VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
+  } else
+    VL = DAG.getRegister(RISCV::X0, XLenVT);
+
+  unsigned IntID =
+      IsUnmasked ? Intrinsic::riscv_vloxei : Intrinsic::riscv_vloxei_mask;
+  SmallVector<SDValue, 8> Ops{MGN->getChain(),
+                              DAG.getTargetConstant(IntID, DL, XLenVT)};
+  if (!IsUnmasked)
+    Ops.push_back(PassThru);
+  Ops.push_back(MGN->getBasePtr());
+  Ops.push_back(Index);
+  if (!IsUnmasked)
+    Ops.push_back(Mask);
+  Ops.push_back(VL);
+
+  SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
+  SDValue Result =
+      DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
+                              MGN->getMemoryVT(), MGN->getMemOperand());
+  SDValue Chain = Result.getValue(1);
+
+  if (VT.isFixedLengthVector())
+    Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
+
+  return DAG.getMergeValues({Result, Chain}, DL);
+}
+
+// Custom lower MSCATTER to a legalized form for RVV. It will then be matched to
+// a RVV indexed store. The RVV indexed store instructions only support the
+// "unsigned unscaled" addressing mode; indices are implicitly zero-extended or
+// truncated to XLEN and are treated as byte offsets. Any signed or scaled
+// indexing is extended to the XLEN value type and scaled accordingly.
+SDValue RISCVTargetLowering::lowerMSCATTER(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  auto *MSN = cast<MaskedScatterSDNode>(Op.getNode());
+  SDLoc DL(Op);
+  SDValue Index = MSN->getIndex();
+  SDValue Mask = MSN->getMask();
+  SDValue Val = MSN->getValue();
+
+  MVT VT = Val.getSimpleValueType();
   MVT IndexVT = Index.getSimpleValueType();
-  SDValue VL = getDefaultVLOps(IndexVT, IndexVT, DL, DAG, Subtarget).second;
+  MVT XLenVT = Subtarget.getXLenVT();
+
+  assert(VT.getVectorElementCount() == IndexVT.getVectorElementCount() &&
+         "Unexpected VTs!");
+  assert(MSN->getBasePtr().getSimpleValueType() == XLenVT &&
+         "Unexpected pointer type");
+  // Targets have to explicitly opt-in for extending vector loads and
+  // truncating vector stores.
+  assert(!MSN->isTruncatingStore() && "Unexpected extending MSCATTER");
+
+  // If the mask is known to be all ones, optimize to an unmasked intrinsic;
+  // the selection of the masked intrinsics doesn't do this for us.
   bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
 
-  if (IsUnmasked)
-    IntID = MGN ? Intrinsic::riscv_vloxei : Intrinsic::riscv_vsoxei;
-  else
-    IntID = MGN ? Intrinsic::riscv_vloxei_mask : Intrinsic::riscv_vsoxei_mask;
-  SmallVector<SDValue, 8> Ops{N->getChain(),
+  SDValue VL;
+  if (VT.isFixedLengthVector()) {
+    // We need to use the larger of the value and index type to determine the
+    // scalable type to use so we don't increase LMUL for any operand/result.
+    if (VT.bitsGE(IndexVT)) {
+      VT = getContainerForFixedLengthVector(VT);
+      IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(),
+                                 VT.getVectorElementCount());
+    } else {
+      IndexVT = getContainerForFixedLengthVector(IndexVT);
+      VT = MVT::getVectorVT(VT.getVectorElementType(),
+                            IndexVT.getVectorElementCount());
+    }
+
+    Index = convertToScalableVector(IndexVT, Index, DAG, Subtarget);
+    Val = convertToScalableVector(VT, Val, DAG, Subtarget);
+
+    if (!IsUnmasked) {
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
+      Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+    }
+
+    VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
+  } else
+    VL = DAG.getRegister(RISCV::X0, XLenVT);
+
+  unsigned IntID =
+      IsUnmasked ? Intrinsic::riscv_vsoxei : Intrinsic::riscv_vsoxei_mask;
+  SmallVector<SDValue, 8> Ops{MSN->getChain(),
                               DAG.getTargetConstant(IntID, DL, XLenVT)};
-  if (MSN)
-    Ops.push_back(MSN->getValue());
-  else if (!IsUnmasked)
-    Ops.push_back(MGN->getPassThru());
-  Ops.push_back(N->getBasePtr());
+  Ops.push_back(Val);
+  Ops.push_back(MSN->getBasePtr());
   Ops.push_back(Index);
   if (!IsUnmasked)
     Ops.push_back(Mask);
   Ops.push_back(VL);
-  return DAG.getMemIntrinsicNode(
-      MGN ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, N->getVTList(),
-      Ops, N->getMemoryVT(), N->getMemOperand());
+
+  return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, MSN->getVTList(), Ops,
+                                 MSN->getMemoryVT(), MSN->getMemOperand());
 }
 
 // Returns the opcode of the target-specific SDNode that implements the 32-bit

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 4546ee4d0f89..29e2c29712f5 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -479,7 +479,8 @@ class RISCVTargetLowering : public TargetLowering {
   SDValue lowerMSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorFCOPYSIGNToRVV(SDValue Op,
                                                SelectionDAG &DAG) const;
-  SDValue lowerMGATHERMSCATTER(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerMGATHER(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerMSCATTER(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorLoadToRVV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorStoreToRVV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorSetccToRVV(SDValue Op, SelectionDAG &DAG) const;

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index b0aa57c9e8ef..bb8215b736ca 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -61,15 +61,7 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
     return ST->getXLen();
   }
 
-  bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
-    if (!ST->hasStdExtV())
-      return false;
-
-    // Only support fixed vectors if we know the minimum vector size.
-    if (isa<FixedVectorType>(DataType) && ST->getMinRVVVectorSizeInBits() == 0)
-      return false;
-
-    Type *ScalarTy = DataType->getScalarType();
+  bool isLegalElementTypeForRVV(Type *ScalarTy) {
     if (ScalarTy->isPointerTy())
       return true;
 
@@ -87,12 +79,41 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
     return false;
   }
 
+  bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
+    if (!ST->hasStdExtV())
+      return false;
+
+    // Only support fixed vectors if we know the minimum vector size.
+    if (isa<FixedVectorType>(DataType) && ST->getMinRVVVectorSizeInBits() == 0)
+      return false;
+
+    return isLegalElementTypeForRVV(DataType->getScalarType());
+  }
+
   bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
     return isLegalMaskedLoadStore(DataType, Alignment);
   }
   bool isLegalMaskedStore(Type *DataType, Align Alignment) {
     return isLegalMaskedLoadStore(DataType, Alignment);
   }
+
+  bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment) {
+    if (!ST->hasStdExtV())
+      return false;
+
+    // Only support fixed vectors if we know the minimum vector size.
+    if (isa<FixedVectorType>(DataType) && ST->getMinRVVVectorSizeInBits() == 0)
+      return false;
+
+    return isLegalElementTypeForRVV(DataType->getScalarType());
+  }
+
+  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
+    return isLegalMaskedGatherScatter(DataType, Alignment);
+  }
+  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
+    return isLegalMaskedGatherScatter(DataType, Alignment);
+  }
 };
 
 } // end namespace llvm

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
new file mode 100644
index 000000000000..e7ea8535ff4e
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -0,0 +1,2267 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+experimental-zfh,+experimental-v -target-abi=ilp32d \
+; RUN:     -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+experimental-zfh,+experimental-v -target-abi=lp64d \
+; RUN:     -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64
+
+declare <1 x i8> @llvm.masked.gather.v1i8.v1p0i8(<1 x i8*>, i32, <1 x i1>, <1 x i8>)
+
+define <1 x i8> @mgather_v1i8(<1 x i8*> %ptrs, <1 x i1> %m, <1 x i8> %passthru) {
+; RV32-LABEL: mgather_v1i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 1, e8,mf4,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v1i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e8,mf8,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %v = call <1 x i8> @llvm.masked.gather.v1i8.v1p0i8(<1 x i8*> %ptrs, i32 1, <1 x i1> %m, <1 x i8> %passthru)
+  ret <1 x i8> %v
+}
+
+declare <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*>, i32, <2 x i1>, <2 x i8>)
+
+define <2 x i8> @mgather_v2i8(<2 x i8*> %ptrs, <2 x i1> %m, <2 x i8> %passthru) {
+; RV32-LABEL: mgather_v2i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e8,mf4,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e8,mf8,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru)
+  ret <2 x i8> %v
+}
+
+define <2 x i16> @mgather_v2i8_sextload_v2i16(<2 x i8*> %ptrs, <2 x i1> %m, <2 x i8> %passthru) {
+; RV32-LABEL: mgather_v2i8_sextload_v2i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e8,mf4,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vsetivli a0, 2, e16,m1,ta,mu
+; RV32-NEXT:    vsext.vf2 v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2i8_sextload_v2i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e8,mf8,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vsetivli a0, 2, e16,m1,ta,mu
+; RV64-NEXT:    vsext.vf2 v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru)
+  %ev = sext <2 x i8> %v to <2 x i16>
+  ret <2 x i16> %ev
+}
+
+define <2 x i16> @mgather_v2i8_zextload_v2i16(<2 x i8*> %ptrs, <2 x i1> %m, <2 x i8> %passthru) {
+; RV32-LABEL: mgather_v2i8_zextload_v2i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e8,mf4,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vsetivli a0, 2, e16,m1,ta,mu
+; RV32-NEXT:    vzext.vf2 v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2i8_zextload_v2i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e8,mf8,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vsetivli a0, 2, e16,m1,ta,mu
+; RV64-NEXT:    vzext.vf2 v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru)
+  %ev = zext <2 x i8> %v to <2 x i16>
+  ret <2 x i16> %ev
+}
+
+define <2 x i32> @mgather_v2i8_sextload_v2i32(<2 x i8*> %ptrs, <2 x i1> %m, <2 x i8> %passthru) {
+; RV32-LABEL: mgather_v2i8_sextload_v2i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e8,mf4,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV32-NEXT:    vsext.vf4 v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2i8_sextload_v2i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e8,mf8,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV64-NEXT:    vsext.vf4 v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru)
+  %ev = sext <2 x i8> %v to <2 x i32>
+  ret <2 x i32> %ev
+}
+
+define <2 x i32> @mgather_v2i8_zextload_v2i32(<2 x i8*> %ptrs, <2 x i1> %m, <2 x i8> %passthru) {
+; RV32-LABEL: mgather_v2i8_zextload_v2i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e8,mf4,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV32-NEXT:    vzext.vf4 v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2i8_zextload_v2i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e8,mf8,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV64-NEXT:    vzext.vf4 v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru)
+  %ev = zext <2 x i8> %v to <2 x i32>
+  ret <2 x i32> %ev
+}
+
+define <2 x i64> @mgather_v2i8_sextload_v2i64(<2 x i8*> %ptrs, <2 x i1> %m, <2 x i8> %passthru) {
+; RV32-LABEL: mgather_v2i8_sextload_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e8,mf4,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
+; RV32-NEXT:    vsext.vf8 v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2i8_sextload_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e8,mf8,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
+; RV64-NEXT:    vsext.vf8 v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru)
+  %ev = sext <2 x i8> %v to <2 x i64>
+  ret <2 x i64> %ev
+}
+
+define <2 x i64> @mgather_v2i8_zextload_v2i64(<2 x i8*> %ptrs, <2 x i1> %m, <2 x i8> %passthru) {
+; RV32-LABEL: mgather_v2i8_zextload_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e8,mf4,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
+; RV32-NEXT:    vzext.vf8 v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2i8_zextload_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e8,mf8,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
+; RV64-NEXT:    vzext.vf8 v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru)
+  %ev = zext <2 x i8> %v to <2 x i64>
+  ret <2 x i64> %ev
+}
+
+declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>)
+
+define <4 x i8> @mgather_v4i8(<4 x i8*> %ptrs, <4 x i1> %m, <4 x i8> %passthru) {
+; RV32-LABEL: mgather_v4i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e8,mf4,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v4i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e8,mf4,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v10
+; RV64-NEXT:    ret
+  %v = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> %m, <4 x i8> %passthru)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @mgather_truemask_v4i8(<4 x i8*> %ptrs, <4 x i8> %passthru) {
+; RV32-LABEL: mgather_truemask_v4i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e8,mf4,ta,mu
+; RV32-NEXT:    vloxei32.v v8, (zero), v8
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_truemask_v4i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e8,mf4,ta,mu
+; RV64-NEXT:    vloxei64.v v8, (zero), v8
+; RV64-NEXT:    ret
+  %mhead = insertelement <4 x i1> undef, i1 1, i32 0
+  %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> undef, <4 x i32> zeroinitializer
+  %v = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> %mtrue, <4 x i8> %passthru)
+  ret <4 x i8> %v
+}
+
+define <4 x i8> @mgather_falsemask_v4i8(<4 x i8*> %ptrs, <4 x i8> %passthru) {
+; RV32-LABEL: mgather_falsemask_v4i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_falsemask_v4i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vmv1r.v v8, v10
+; RV64-NEXT:    ret
+  %v = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> zeroinitializer, <4 x i8> %passthru)
+  ret <4 x i8> %v
+}
+
+declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>)
+
+define <8 x i8> @mgather_v8i8(<8 x i8*> %ptrs, <8 x i1> %m, <8 x i8> %passthru) {
+; RV32-LABEL: mgather_v8i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 8, e8,mf2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v8i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 8, e8,mf2,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v12
+; RV64-NEXT:    ret
+  %v = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> %m, <8 x i8> %passthru)
+  ret <8 x i8> %v
+}
+
+define <8 x i8> @mgather_baseidx_v8i8(i8* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i8> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v8
+; RV32-NEXT:    vsetivli a1, 8, e8,mf2,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (a0), v26, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v8
+; RV64-NEXT:    vsetivli a1, 8, e8,mf2,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (a0), v28, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i8> %idxs
+  %v = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> %m, <8 x i8> %passthru)
+  ret <8 x i8> %v
+}
+
+declare <1 x i16> @llvm.masked.gather.v1i16.v1p0i16(<1 x i16*>, i32, <1 x i1>, <1 x i16>)
+
+define <1 x i16> @mgather_v1i16(<1 x i16*> %ptrs, <1 x i1> %m, <1 x i16> %passthru) {
+; RV32-LABEL: mgather_v1i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 1, e16,mf2,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v1i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e16,mf4,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %v = call <1 x i16> @llvm.masked.gather.v1i16.v1p0i16(<1 x i16*> %ptrs, i32 2, <1 x i1> %m, <1 x i16> %passthru)
+  ret <1 x i16> %v
+}
+
+declare <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*>, i32, <2 x i1>, <2 x i16>)
+
+define <2 x i16> @mgather_v2i16(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16> %passthru) {
+; RV32-LABEL: mgather_v2i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e16,mf4,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru)
+  ret <2 x i16> %v
+}
+
+define <2 x i32> @mgather_v2i16_sextload_v2i32(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16> %passthru) {
+; RV32-LABEL: mgather_v2i16_sextload_v2i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV32-NEXT:    vsext.vf2 v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2i16_sextload_v2i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e16,mf4,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV64-NEXT:    vsext.vf2 v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru)
+  %ev = sext <2 x i16> %v to <2 x i32>
+  ret <2 x i32> %ev
+}
+
+define <2 x i32> @mgather_v2i16_zextload_v2i32(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16> %passthru) {
+; RV32-LABEL: mgather_v2i16_zextload_v2i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV32-NEXT:    vzext.vf2 v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2i16_zextload_v2i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e16,mf4,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV64-NEXT:    vzext.vf2 v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru)
+  %ev = zext <2 x i16> %v to <2 x i32>
+  ret <2 x i32> %ev
+}
+
+define <2 x i64> @mgather_v2i16_sextload_v2i64(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16> %passthru) {
+; RV32-LABEL: mgather_v2i16_sextload_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
+; RV32-NEXT:    vsext.vf4 v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2i16_sextload_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e16,mf4,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
+; RV64-NEXT:    vsext.vf4 v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru)
+  %ev = sext <2 x i16> %v to <2 x i64>
+  ret <2 x i64> %ev
+}
+
+define <2 x i64> @mgather_v2i16_zextload_v2i64(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16> %passthru) {
+; RV32-LABEL: mgather_v2i16_zextload_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
+; RV32-NEXT:    vzext.vf4 v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2i16_zextload_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e16,mf4,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
+; RV64-NEXT:    vzext.vf4 v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru)
+  %ev = zext <2 x i16> %v to <2 x i64>
+  ret <2 x i64> %ev
+}
+
+declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>)
+
+define <4 x i16> @mgather_v4i16(<4 x i16*> %ptrs, <4 x i1> %m, <4 x i16> %passthru) {
+; RV32-LABEL: mgather_v4i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e16,mf2,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v4i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e16,mf2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v10
+; RV64-NEXT:    ret
+  %v = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> %m, <4 x i16> %passthru)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @mgather_truemask_v4i16(<4 x i16*> %ptrs, <4 x i16> %passthru) {
+; RV32-LABEL: mgather_truemask_v4i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e16,mf2,ta,mu
+; RV32-NEXT:    vloxei32.v v8, (zero), v8
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_truemask_v4i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e16,mf2,ta,mu
+; RV64-NEXT:    vloxei64.v v8, (zero), v8
+; RV64-NEXT:    ret
+  %mhead = insertelement <4 x i1> undef, i1 1, i32 0
+  %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> undef, <4 x i32> zeroinitializer
+  %v = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> %mtrue, <4 x i16> %passthru)
+  ret <4 x i16> %v
+}
+
+define <4 x i16> @mgather_falsemask_v4i16(<4 x i16*> %ptrs, <4 x i16> %passthru) {
+; RV32-LABEL: mgather_falsemask_v4i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_falsemask_v4i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vmv1r.v v8, v10
+; RV64-NEXT:    ret
+  %v = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> zeroinitializer, <4 x i16> %passthru)
+  ret <4 x i16> %v
+}
+
+declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>)
+
+define <8 x i16> @mgather_v8i16(<8 x i16*> %ptrs, <8 x i1> %m, <8 x i16> %passthru) {
+; RV32-LABEL: mgather_v8i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 8, e16,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v8i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 8, e16,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v12
+; RV64-NEXT:    ret
+  %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %m, <8 x i16> %passthru)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @mgather_baseidx_v8i8_v8i16(i16* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i16> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8i8_v8i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 1
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (a0), v26, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8i8_v8i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 1
+; RV64-NEXT:    vsetivli a1, 8, e16,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (a0), v28, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i8> %idxs
+  %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %m, <8 x i16> %passthru)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @mgather_baseidx_sext_v8i8_v8i16(i16* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i16> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_v8i8_v8i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 1
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (a0), v26, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_sext_v8i8_v8i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 1
+; RV64-NEXT:    vsetivli a1, 8, e16,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (a0), v28, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i8> %idxs to <8 x i16>
+  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %eidxs
+  %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %m, <8 x i16> %passthru)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @mgather_baseidx_zext_v8i8_v8i16(i16* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i16> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_v8i8_v8i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vzext.vf4 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 1
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (a0), v26, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_zext_v8i8_v8i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 1
+; RV64-NEXT:    vsetivli a1, 8, e16,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (a0), v28, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i8> %idxs to <8 x i16>
+  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %eidxs
+  %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %m, <8 x i16> %passthru)
+  ret <8 x i16> %v
+}
+
+define <8 x i16> @mgather_baseidx_v8i16(i16* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i16> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf2 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 1
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (a0), v26, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 1
+; RV64-NEXT:    vsetivli a1, 8, e16,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (a0), v28, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %idxs
+  %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %m, <8 x i16> %passthru)
+  ret <8 x i16> %v
+}
+
+declare <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*>, i32, <1 x i1>, <1 x i32>)
+
+define <1 x i32> @mgather_v1i32(<1 x i32*> %ptrs, <1 x i1> %m, <1 x i32> %passthru) {
+; RV32-LABEL: mgather_v1i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 1, e32,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v1i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e32,mf2,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %v = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> %ptrs, i32 4, <1 x i1> %m, <1 x i32> %passthru)
+  ret <1 x i32> %v
+}
+
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
+
+define <2 x i32> @mgather_v2i32(<2 x i32*> %ptrs, <2 x i1> %m, <2 x i32> %passthru) {
+; RV32-LABEL: mgather_v2i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e32,mf2,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %m, <2 x i32> %passthru)
+  ret <2 x i32> %v
+}
+
+define <2 x i64> @mgather_v2i32_sextload_v2i64(<2 x i32*> %ptrs, <2 x i1> %m, <2 x i32> %passthru) {
+; RV32-LABEL: mgather_v2i32_sextload_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
+; RV32-NEXT:    vsext.vf2 v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2i32_sextload_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e32,mf2,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
+; RV64-NEXT:    vsext.vf2 v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %m, <2 x i32> %passthru)
+  %ev = sext <2 x i32> %v to <2 x i64>
+  ret <2 x i64> %ev
+}
+
+define <2 x i64> @mgather_v2i32_zextload_v2i64(<2 x i32*> %ptrs, <2 x i1> %m, <2 x i32> %passthru) {
+; RV32-LABEL: mgather_v2i32_zextload_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
+; RV32-NEXT:    vzext.vf2 v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2i32_zextload_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e32,mf2,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
+; RV64-NEXT:    vzext.vf2 v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %m, <2 x i32> %passthru)
+  %ev = zext <2 x i32> %v to <2 x i64>
+  ret <2 x i64> %ev
+}
+
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
+
+define <4 x i32> @mgather_v4i32(<4 x i32*> %ptrs, <4 x i1> %m, <4 x i32> %passthru) {
+; RV32-LABEL: mgather_v4i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e32,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v4i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e32,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v10
+; RV64-NEXT:    ret
+  %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %m, <4 x i32> %passthru)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @mgather_truemask_v4i32(<4 x i32*> %ptrs, <4 x i32> %passthru) {
+; RV32-LABEL: mgather_truemask_v4i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e32,m1,ta,mu
+; RV32-NEXT:    vloxei32.v v8, (zero), v8
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_truemask_v4i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e32,m1,ta,mu
+; RV64-NEXT:    vloxei64.v v8, (zero), v8
+; RV64-NEXT:    ret
+  %mhead = insertelement <4 x i1> undef, i1 1, i32 0
+  %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> undef, <4 x i32> zeroinitializer
+  %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mtrue, <4 x i32> %passthru)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @mgather_falsemask_v4i32(<4 x i32*> %ptrs, <4 x i32> %passthru) {
+; RV32-LABEL: mgather_falsemask_v4i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_falsemask_v4i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vmv1r.v v8, v10
+; RV64-NEXT:    ret
+  %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> zeroinitializer, <4 x i32> %passthru)
+  ret <4 x i32> %v
+}
+
+declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>)
+
+define <8 x i32> @mgather_v8i32(<8 x i32*> %ptrs, <8 x i1> %m, <8 x i32> %passthru) {
+; RV32-LABEL: mgather_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 8, e32,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (zero), v8, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 8, e32,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (zero), v8, v0.t
+; RV64-NEXT:    vmv2r.v v8, v12
+; RV64-NEXT:    ret
+  %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @mgather_baseidx_v8i8_v8i32(i32* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i32> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8i8_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (a0), v26, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8i8_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (a0), v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i32, i32* %base, <8 x i8> %idxs
+  %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(i32* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i32> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_v8i8_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (a0), v26, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_sext_v8i8_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (a0), v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i8> %idxs to <8 x i32>
+  %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs
+  %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(i32* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i32> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_v8i8_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vzext.vf4 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (a0), v26, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_zext_v8i8_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (a0), v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i8> %idxs to <8 x i32>
+  %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs
+  %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @mgather_baseidx_v8i16_v8i32(i32* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i32> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8i16_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf2 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (a0), v26, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8i16_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (a0), v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i32, i32* %base, <8 x i16> %idxs
+  %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(i32* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i32> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_v8i16_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf2 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (a0), v26, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_sext_v8i16_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (a0), v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i16> %idxs to <8 x i32>
+  %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs
+  %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(i32* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i32> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_v8i16_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vzext.vf2 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (a0), v26, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_zext_v8i16_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf4 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (a0), v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i16> %idxs to <8 x i32>
+  %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs
+  %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru)
+  ret <8 x i32> %v
+}
+
+define <8 x i32> @mgather_baseidx_v8i32(i32* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x i32> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsll.vi v26, v8, 2
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (a0), v26, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf2 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (a0), v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %idxs
+  %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru)
+  ret <8 x i32> %v
+}
+
+declare <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*>, i32, <1 x i1>, <1 x i64>)
+
+define <1 x i64> @mgather_v1i64(<1 x i64*> %ptrs, <1 x i1> %m, <1 x i64> %passthru) {
+; RV32-LABEL: mgather_v1i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 1, e64,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v1i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e64,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %v = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> %ptrs, i32 8, <1 x i1> %m, <1 x i64> %passthru)
+  ret <1 x i64> %v
+}
+
+declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
+
+define <2 x i64> @mgather_v2i64(<2 x i64*> %ptrs, <2 x i1> %m, <2 x i64> %passthru) {
+; RV32-LABEL: mgather_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e64,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e64,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %ptrs, i32 8, <2 x i1> %m, <2 x i64> %passthru)
+  ret <2 x i64> %v
+}
+
+declare <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>)
+
+define <4 x i64> @mgather_v4i64(<4 x i64*> %ptrs, <4 x i1> %m, <4 x i64> %passthru) {
+; RV32-LABEL: mgather_v4i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (zero), v8, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v4i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %ptrs, i32 8, <4 x i1> %m, <4 x i64> %passthru)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @mgather_truemask_v4i64(<4 x i64*> %ptrs, <4 x i64> %passthru) {
+; RV32-LABEL: mgather_truemask_v4i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
+; RV32-NEXT:    vloxei32.v v8, (zero), v8
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_truemask_v4i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
+; RV64-NEXT:    vloxei64.v v8, (zero), v8
+; RV64-NEXT:    ret
+  %mhead = insertelement <4 x i1> undef, i1 1, i32 0
+  %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> undef, <4 x i32> zeroinitializer
+  %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %ptrs, i32 8, <4 x i1> %mtrue, <4 x i64> %passthru)
+  ret <4 x i64> %v
+}
+
+define <4 x i64> @mgather_falsemask_v4i64(<4 x i64*> %ptrs, <4 x i64> %passthru) {
+; RV32-LABEL: mgather_falsemask_v4i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_falsemask_v4i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %ptrs, i32 8, <4 x i1> zeroinitializer, <4 x i64> %passthru)
+  ret <4 x i64> %v
+}
+
+declare <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*>, i32, <8 x i1>, <8 x i64>)
+
+define <8 x i64> @mgather_v8i64(<8 x i64*> %ptrs, <8 x i1> %m, <8 x i64> %passthru) {
+; RV32-LABEL: mgather_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei32.v v12, (zero), v8, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (zero), v8, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @mgather_baseidx_v8i8_v8i64(i64* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8i8_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 3
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei32.v v12, (a0), v26, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8i8_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i8> %idxs
+  %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(i64* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_v8i8_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsext.vf8 v28, v8
+; RV32-NEXT:    lui a1, %hi(.LCPI49_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI49_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_sext_v8i8_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i8> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs
+  %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(i64* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_v8i8_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vzext.vf8 v28, v8
+; RV32-NEXT:    lui a1, %hi(.LCPI50_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI50_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_zext_v8i8_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i8> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs
+  %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @mgather_baseidx_v8i16_v8i64(i64* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8i16_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf2 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 3
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei32.v v12, (a0), v26, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8i16_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i16> %idxs
+  %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(i64* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_v8i16_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsext.vf4 v28, v8
+; RV32-NEXT:    lui a1, %hi(.LCPI52_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI52_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_sext_v8i16_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i16> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs
+  %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(i64* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_v8i16_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vzext.vf4 v28, v8
+; RV32-NEXT:    lui a1, %hi(.LCPI53_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI53_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_zext_v8i16_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf4 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i16> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs
+  %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @mgather_baseidx_v8i32_v8i64(i64* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8i32_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsll.vi v26, v8, 3
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei32.v v12, (a0), v26, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8i32_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf2 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i32> %idxs
+  %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(i64* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_v8i32_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsext.vf2 v28, v8
+; RV32-NEXT:    lui a1, %hi(.LCPI55_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI55_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_sext_v8i32_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf2 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i32> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs
+  %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(i64* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_v8i32_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vzext.vf2 v28, v8
+; RV32-NEXT:    lui a1, %hi(.LCPI56_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI56_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_zext_v8i32_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf2 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i32> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs
+  %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru)
+  ret <8 x i64> %v
+}
+
+define <8 x i64> @mgather_baseidx_v8i64(i64* %base, <8 x i64> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lui a1, %hi(.LCPI57_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI57_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v28, (a1)
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v8, v28
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsll.vi v28, v8, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %idxs
+  %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru)
+  ret <8 x i64> %v
+}
+
+declare <1 x half> @llvm.masked.gather.v1f16.v1p0f16(<1 x half*>, i32, <1 x i1>, <1 x half>)
+
+define <1 x half> @mgather_v1f16(<1 x half*> %ptrs, <1 x i1> %m, <1 x half> %passthru) {
+; RV32-LABEL: mgather_v1f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 1, e16,mf2,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v1f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e16,mf4,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %v = call <1 x half> @llvm.masked.gather.v1f16.v1p0f16(<1 x half*> %ptrs, i32 2, <1 x i1> %m, <1 x half> %passthru)
+  ret <1 x half> %v
+}
+
+declare <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*>, i32, <2 x i1>, <2 x half>)
+
+define <2 x half> @mgather_v2f16(<2 x half*> %ptrs, <2 x i1> %m, <2 x half> %passthru) {
+; RV32-LABEL: mgather_v2f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e16,mf4,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*> %ptrs, i32 2, <2 x i1> %m, <2 x half> %passthru)
+  ret <2 x half> %v
+}
+
+declare <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*>, i32, <4 x i1>, <4 x half>)
+
+define <4 x half> @mgather_v4f16(<4 x half*> %ptrs, <4 x i1> %m, <4 x half> %passthru) {
+; RV32-LABEL: mgather_v4f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e16,mf2,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v4f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e16,mf2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v10
+; RV64-NEXT:    ret
+  %v = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> %ptrs, i32 2, <4 x i1> %m, <4 x half> %passthru)
+  ret <4 x half> %v
+}
+
+define <4 x half> @mgather_truemask_v4f16(<4 x half*> %ptrs, <4 x half> %passthru) {
+; RV32-LABEL: mgather_truemask_v4f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e16,mf2,ta,mu
+; RV32-NEXT:    vloxei32.v v8, (zero), v8
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_truemask_v4f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e16,mf2,ta,mu
+; RV64-NEXT:    vloxei64.v v8, (zero), v8
+; RV64-NEXT:    ret
+  %mhead = insertelement <4 x i1> undef, i1 1, i32 0
+  %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> undef, <4 x i32> zeroinitializer
+  %v = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> %ptrs, i32 2, <4 x i1> %mtrue, <4 x half> %passthru)
+  ret <4 x half> %v
+}
+
+define <4 x half> @mgather_falsemask_v4f16(<4 x half*> %ptrs, <4 x half> %passthru) {
+; RV32-LABEL: mgather_falsemask_v4f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_falsemask_v4f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vmv1r.v v8, v10
+; RV64-NEXT:    ret
+  %v = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> %ptrs, i32 2, <4 x i1> zeroinitializer, <4 x half> %passthru)
+  ret <4 x half> %v
+}
+
+declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>)
+
+define <8 x half> @mgather_v8f16(<8 x half*> %ptrs, <8 x i1> %m, <8 x half> %passthru) {
+; RV32-LABEL: mgather_v8f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 8, e16,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v8f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 8, e16,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v12
+; RV64-NEXT:    ret
+  %v = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> %m, <8 x half> %passthru)
+  ret <8 x half> %v
+}
+
+define <8 x half> @mgather_baseidx_v8i8_v8f16(half* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x half> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8i8_v8f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 1
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (a0), v26, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8i8_v8f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 1
+; RV64-NEXT:    vsetivli a1, 8, e16,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (a0), v28, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds half, half* %base, <8 x i8> %idxs
+  %v = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> %m, <8 x half> %passthru)
+  ret <8 x half> %v
+}
+
+define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(half* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x half> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_v8i8_v8f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 1
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (a0), v26, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_sext_v8i8_v8f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 1
+; RV64-NEXT:    vsetivli a1, 8, e16,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (a0), v28, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i8> %idxs to <8 x i16>
+  %ptrs = getelementptr inbounds half, half* %base, <8 x i16> %eidxs
+  %v = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> %m, <8 x half> %passthru)
+  ret <8 x half> %v
+}
+
+define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(half* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x half> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_v8i8_v8f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vzext.vf4 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 1
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (a0), v26, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_zext_v8i8_v8f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 1
+; RV64-NEXT:    vsetivli a1, 8, e16,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (a0), v28, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i8> %idxs to <8 x i16>
+  %ptrs = getelementptr inbounds half, half* %base, <8 x i16> %eidxs
+  %v = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> %m, <8 x half> %passthru)
+  ret <8 x half> %v
+}
+
+define <8 x half> @mgather_baseidx_v8f16(half* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x half> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf2 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 1
+; RV32-NEXT:    vsetivli a1, 8, e16,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (a0), v26, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 1
+; RV64-NEXT:    vsetivli a1, 8, e16,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (a0), v28, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds half, half* %base, <8 x i16> %idxs
+  %v = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> %m, <8 x half> %passthru)
+  ret <8 x half> %v
+}
+
+declare <1 x float> @llvm.masked.gather.v1f32.v1p0f32(<1 x float*>, i32, <1 x i1>, <1 x float>)
+
+define <1 x float> @mgather_v1f32(<1 x float*> %ptrs, <1 x i1> %m, <1 x float> %passthru) {
+; RV32-LABEL: mgather_v1f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 1, e32,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v1f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e32,mf2,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %v = call <1 x float> @llvm.masked.gather.v1f32.v1p0f32(<1 x float*> %ptrs, i32 4, <1 x i1> %m, <1 x float> %passthru)
+  ret <1 x float> %v
+}
+
+declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
+
+define <2 x float> @mgather_v2f32(<2 x float*> %ptrs, <2 x i1> %m, <2 x float> %passthru) {
+; RV32-LABEL: mgather_v2f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e32,mf2,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %ptrs, i32 4, <2 x i1> %m, <2 x float> %passthru)
+  ret <2 x float> %v
+}
+
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
+
+define <4 x float> @mgather_v4f32(<4 x float*> %ptrs, <4 x i1> %m, <4 x float> %passthru) {
+; RV32-LABEL: mgather_v4f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e32,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v4f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e32,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v10
+; RV64-NEXT:    ret
+  %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> %m, <4 x float> %passthru)
+  ret <4 x float> %v
+}
+
+define <4 x float> @mgather_truemask_v4f32(<4 x float*> %ptrs, <4 x float> %passthru) {
+; RV32-LABEL: mgather_truemask_v4f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e32,m1,ta,mu
+; RV32-NEXT:    vloxei32.v v8, (zero), v8
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_truemask_v4f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e32,m1,ta,mu
+; RV64-NEXT:    vloxei64.v v8, (zero), v8
+; RV64-NEXT:    ret
+  %mhead = insertelement <4 x i1> undef, i1 1, i32 0
+  %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> undef, <4 x i32> zeroinitializer
+  %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> %mtrue, <4 x float> %passthru)
+  ret <4 x float> %v
+}
+
+define <4 x float> @mgather_falsemask_v4f32(<4 x float*> %ptrs, <4 x float> %passthru) {
+; RV32-LABEL: mgather_falsemask_v4f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_falsemask_v4f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vmv1r.v v8, v10
+; RV64-NEXT:    ret
+  %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> zeroinitializer, <4 x float> %passthru)
+  ret <4 x float> %v
+}
+
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)
+
+define <8 x float> @mgather_v8f32(<8 x float*> %ptrs, <8 x i1> %m, <8 x float> %passthru) {
+; RV32-LABEL: mgather_v8f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 8, e32,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (zero), v8, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v8f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 8, e32,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (zero), v8, v0.t
+; RV64-NEXT:    vmv2r.v v8, v12
+; RV64-NEXT:    ret
+  %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru)
+  ret <8 x float> %v
+}
+
+define <8 x float> @mgather_baseidx_v8i8_v8f32(float* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x float> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8i8_v8f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (a0), v26, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8i8_v8f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (a0), v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds float, float* %base, <8 x i8> %idxs
+  %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru)
+  ret <8 x float> %v
+}
+
+define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(float* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x float> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_v8i8_v8f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (a0), v26, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_sext_v8i8_v8f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (a0), v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i8> %idxs to <8 x i32>
+  %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs
+  %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru)
+  ret <8 x float> %v
+}
+
+define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(float* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x float> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_v8i8_v8f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vzext.vf4 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (a0), v26, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_zext_v8i8_v8f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (a0), v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i8> %idxs to <8 x i32>
+  %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs
+  %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru)
+  ret <8 x float> %v
+}
+
+define <8 x float> @mgather_baseidx_v8i16_v8f32(float* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x float> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8i16_v8f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf2 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (a0), v26, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8i16_v8f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (a0), v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds float, float* %base, <8 x i16> %idxs
+  %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru)
+  ret <8 x float> %v
+}
+
+define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(float* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x float> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_v8i16_v8f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf2 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (a0), v26, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_sext_v8i16_v8f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (a0), v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i16> %idxs to <8 x i32>
+  %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs
+  %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru)
+  ret <8 x float> %v
+}
+
+define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(float* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x float> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_v8i16_v8f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vzext.vf2 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (a0), v26, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_zext_v8i16_v8f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf4 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (a0), v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i16> %idxs to <8 x i32>
+  %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs
+  %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru)
+  ret <8 x float> %v
+}
+
+define <8 x float> @mgather_baseidx_v8f32(float* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x float> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsll.vi v26, v8, 2
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (a0), v26, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf2 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (a0), v28, v0.t
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %idxs
+  %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru)
+  ret <8 x float> %v
+}
+
+declare <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*>, i32, <1 x i1>, <1 x double>)
+
+define <1 x double> @mgather_v1f64(<1 x double*> %ptrs, <1 x i1> %m, <1 x double> %passthru) {
+; RV32-LABEL: mgather_v1f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 1, e64,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v1f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e64,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %v = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> %ptrs, i32 8, <1 x i1> %m, <1 x double> %passthru)
+  ret <1 x double> %v
+}
+
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
+
+define <2 x double> @mgather_v2f64(<2 x double*> %ptrs, <2 x i1> %m, <2 x double> %passthru) {
+; RV32-LABEL: mgather_v2f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e64,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v2f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e64,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %v = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 8, <2 x i1> %m, <2 x double> %passthru)
+  ret <2 x double> %v
+}
+
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
+
+define <4 x double> @mgather_v4f64(<4 x double*> %ptrs, <4 x i1> %m, <4 x double> %passthru) {
+; RV32-LABEL: mgather_v4f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (zero), v8, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v4f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
+; RV64-NEXT:    vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %v = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %ptrs, i32 8, <4 x i1> %m, <4 x double> %passthru)
+  ret <4 x double> %v
+}
+
+define <4 x double> @mgather_truemask_v4f64(<4 x double*> %ptrs, <4 x double> %passthru) {
+; RV32-LABEL: mgather_truemask_v4f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
+; RV32-NEXT:    vloxei32.v v8, (zero), v8
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_truemask_v4f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
+; RV64-NEXT:    vloxei64.v v8, (zero), v8
+; RV64-NEXT:    ret
+  %mhead = insertelement <4 x i1> undef, i1 1, i32 0
+  %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> undef, <4 x i32> zeroinitializer
+  %v = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %ptrs, i32 8, <4 x i1> %mtrue, <4 x double> %passthru)
+  ret <4 x double> %v
+}
+
+define <4 x double> @mgather_falsemask_v4f64(<4 x double*> %ptrs, <4 x double> %passthru) {
+; RV32-LABEL: mgather_falsemask_v4f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_falsemask_v4f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vmv2r.v v8, v10
+; RV64-NEXT:    ret
+  %v = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %ptrs, i32 8, <4 x i1> zeroinitializer, <4 x double> %passthru)
+  ret <4 x double> %v
+}
+
+declare <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*>, i32, <8 x i1>, <8 x double>)
+
+define <8 x double> @mgather_v8f64(<8 x double*> %ptrs, <8 x i1> %m, <8 x double> %passthru) {
+; RV32-LABEL: mgather_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei32.v v12, (zero), v8, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (zero), v8, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
+  ret <8 x double> %v
+}
+
+define <8 x double> @mgather_baseidx_v8i8_v8f64(double* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8i8_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 3
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei32.v v12, (a0), v26, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8i8_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i8> %idxs
+  %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
+  ret <8 x double> %v
+}
+
+define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(double* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_v8i8_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsext.vf8 v28, v8
+; RV32-NEXT:    lui a1, %hi(.LCPI88_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI88_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_sext_v8i8_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i8> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs
+  %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
+  ret <8 x double> %v
+}
+
+define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(double* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_v8i8_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vzext.vf8 v28, v8
+; RV32-NEXT:    lui a1, %hi(.LCPI89_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI89_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_zext_v8i8_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf8 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i8> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs
+  %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
+  ret <8 x double> %v
+}
+
+define <8 x double> @mgather_baseidx_v8i16_v8f64(double* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8i16_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf2 v26, v8
+; RV32-NEXT:    vsll.vi v26, v26, 3
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei32.v v12, (a0), v26, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8i16_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i16> %idxs
+  %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
+  ret <8 x double> %v
+}
+
+define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(double* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_v8i16_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsext.vf4 v28, v8
+; RV32-NEXT:    lui a1, %hi(.LCPI91_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI91_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_sext_v8i16_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i16> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs
+  %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
+  ret <8 x double> %v
+}
+
+define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(double* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_v8i16_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vzext.vf4 v28, v8
+; RV32-NEXT:    lui a1, %hi(.LCPI92_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI92_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_zext_v8i16_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf4 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i16> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs
+  %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
+  ret <8 x double> %v
+}
+
+define <8 x double> @mgather_baseidx_v8i32_v8f64(double* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8i32_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsll.vi v26, v8, 3
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei32.v v12, (a0), v26, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8i32_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf2 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i32> %idxs
+  %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
+  ret <8 x double> %v
+}
+
+define <8 x double> @mgather_baseidx_sext_v8i32_v8f64(double* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_v8i32_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsext.vf2 v28, v8
+; RV32-NEXT:    lui a1, %hi(.LCPI94_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI94_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_sext_v8i32_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf2 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i32> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs
+  %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
+  ret <8 x double> %v
+}
+
+define <8 x double> @mgather_baseidx_zext_v8i32_v8f64(double* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_v8i32_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vzext.vf2 v28, v8
+; RV32-NEXT:    lui a1, %hi(.LCPI95_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI95_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v16
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_zext_v8i32_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf2 v28, v8
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i32> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs
+  %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
+  ret <8 x double> %v
+}
+
+define <8 x double> @mgather_baseidx_v8f64(double* %base, <8 x i64> %idxs, <8 x i1> %m, <8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lui a1, %hi(.LCPI96_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI96_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v28, (a1)
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v8, v28
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV32-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV32-NEXT:    vmv4r.v v8, v12
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsll.vi v28, v8, 3
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,tu,mu
+; RV64-NEXT:    vloxei64.v v12, (a0), v28, v0.t
+; RV64-NEXT:    vmv4r.v v8, v12
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %idxs
+  %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
+  ret <8 x double> %v
+}
+
+declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>)
+
+define <16 x i8> @mgather_baseidx_v16i8(i8* %base, <16 x i8> %idxs, <16 x i1> %m, <16 x i8> %passthru) {
+; RV32-LABEL: mgather_baseidx_v16i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vsext.vf4 v28, v8
+; RV32-NEXT:    vsetivli a1, 16, e8,m1,tu,mu
+; RV32-NEXT:    vloxei32.v v9, (a0), v28, v0.t
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v16i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 16, e64,m8,ta,mu
+; RV64-NEXT:    vsext.vf8 v16, v8
+; RV64-NEXT:    vsetivli a1, 16, e8,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v9, (a0), v16, v0.t
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %idxs
+  %v = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 2, <16 x i1> %m, <16 x i8> %passthru)
+  ret <16 x i8> %v
+}
+
+declare <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*>, i32, <32 x i1>, <32 x i8>)
+
+define <32 x i8> @mgather_baseidx_v32i8(i8* %base, <32 x i8> %idxs, <32 x i1> %m, <32 x i8> %passthru) {
+; RV32-LABEL: mgather_baseidx_v32i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi a1, zero, 32
+; RV32-NEXT:    vsetvli a2, a1, e32,m8,ta,mu
+; RV32-NEXT:    vsext.vf4 v16, v8
+; RV32-NEXT:    vsetvli a1, a1, e8,m2,tu,mu
+; RV32-NEXT:    vloxei32.v v10, (a0), v16, v0.t
+; RV32-NEXT:    vmv2r.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mgather_baseidx_v32i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vmv1r.v v25, v0
+; RV64-NEXT:    vsetivli a1, 16, e8,m2,ta,mu
+; RV64-NEXT:    vslidedown.vi v26, v8, 16
+; RV64-NEXT:    vsetivli a1, 16, e64,m8,ta,mu
+; RV64-NEXT:    vsext.vf8 v16, v26
+; RV64-NEXT:    vsetivli a1, 16, e8,m2,ta,mu
+; RV64-NEXT:    vslidedown.vi v26, v10, 16
+; RV64-NEXT:    vsetivli a1, 2, e8,m1,ta,mu
+; RV64-NEXT:    vslidedown.vi v0, v0, 2
+; RV64-NEXT:    vsetivli a1, 16, e8,m1,tu,mu
+; RV64-NEXT:    vloxei64.v v26, (a0), v16, v0.t
+; RV64-NEXT:    vsetivli a1, 16, e64,m8,ta,mu
+; RV64-NEXT:    vsext.vf8 v16, v8
+; RV64-NEXT:    vsetivli a1, 16, e8,m1,tu,mu
+; RV64-NEXT:    vmv1r.v v0, v25
+; RV64-NEXT:    vloxei64.v v10, (a0), v16, v0.t
+; RV64-NEXT:    addi a0, zero, 32
+; RV64-NEXT:    vsetvli a1, a0, e8,m2,ta,mu
+; RV64-NEXT:    vmv.v.i v8, 0
+; RV64-NEXT:    vsetivli a1, 16, e8,m2,tu,mu
+; RV64-NEXT:    vslideup.vi v8, v10, 0
+; RV64-NEXT:    vsetvli a0, a0, e8,m2,tu,mu
+; RV64-NEXT:    vslideup.vi v8, v26, 16
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i8, i8* %base, <32 x i8> %idxs
+  %v = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %ptrs, i32 2, <32 x i1> %m, <32 x i8> %passthru)
+  ret <32 x i8> %v
+}

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
new file mode 100644
index 000000000000..7138a07efa76
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -0,0 +1,1987 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+experimental-zfh,+experimental-v -target-abi=ilp32d \
+; RUN:     -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+experimental-zfh,+experimental-v -target-abi=lp64d \
+; RUN:     -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64
+
+declare void @llvm.masked.scatter.v1i8.v1p0i8(<1 x i8>, <1 x i8*>, i32, <1 x i1>)
+
+define void @mscatter_v1i8(<1 x i8> %val, <1 x i8*> %ptrs, <1 x i1> %m) {
+; RV32-LABEL: mscatter_v1i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e8,mf4,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v1i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e8,mf8,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v1i8.v1p0i8(<1 x i8> %val, <1 x i8*> %ptrs, i32 1, <1 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8>, <2 x i8*>, i32, <2 x i1>)
+
+define void @mscatter_v2i8(<2 x i8> %val, <2 x i8*> %ptrs, <2 x i1> %m) {
+; RV32-LABEL: mscatter_v2i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e8,mf4,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v2i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e8,mf8,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %val, <2 x i8*> %ptrs, i32 1, <2 x i1> %m)
+  ret void
+}
+
+define void @mscatter_v2i16_truncstore_v2i8(<2 x i16> %val, <2 x i8*> %ptrs, <2 x i1> %m) {
+; RV32-LABEL: mscatter_v2i16_truncstore_v2i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e8,mf2,ta,mu
+; RV32-NEXT:    vnsrl.wi v25, v8, 0
+; RV32-NEXT:    vsetivli a0, 2, e8,mf4,ta,mu
+; RV32-NEXT:    vsoxei32.v v25, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v2i16_truncstore_v2i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e8,mf2,ta,mu
+; RV64-NEXT:    vnsrl.wi v25, v8, 0
+; RV64-NEXT:    vsetivli a0, 1, e8,mf8,ta,mu
+; RV64-NEXT:    vsoxei64.v v25, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  %tval = trunc <2 x i16> %val to <2 x i8>
+  call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %tval, <2 x i8*> %ptrs, i32 1, <2 x i1> %m)
+  ret void
+}
+
+define void @mscatter_v2i32_truncstore_v2i8(<2 x i32> %val, <2 x i8*> %ptrs, <2 x i1> %m) {
+; RV32-LABEL: mscatter_v2i32_truncstore_v2i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV32-NEXT:    vnsrl.wi v25, v8, 0
+; RV32-NEXT:    vsetivli a0, 2, e8,mf4,ta,mu
+; RV32-NEXT:    vnsrl.wi v26, v25, 0
+; RV32-NEXT:    vsoxei32.v v26, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v2i32_truncstore_v2i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV64-NEXT:    vnsrl.wi v25, v8, 0
+; RV64-NEXT:    vsetivli a0, 2, e8,mf4,ta,mu
+; RV64-NEXT:    vnsrl.wi v26, v25, 0
+; RV64-NEXT:    vsetivli a0, 1, e8,mf8,ta,mu
+; RV64-NEXT:    vsoxei64.v v26, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  %tval = trunc <2 x i32> %val to <2 x i8>
+  call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %tval, <2 x i8*> %ptrs, i32 1, <2 x i1> %m)
+  ret void
+}
+
+define void @mscatter_v2i64_truncstore_v2i8(<2 x i64> %val, <2 x i8*> %ptrs, <2 x i1> %m) {
+; RV32-LABEL: mscatter_v2i64_truncstore_v2i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; RV32-NEXT:    vnsrl.wi v25, v8, 0
+; RV32-NEXT:    vsetivli a0, 2, e16,mf4,ta,mu
+; RV32-NEXT:    vnsrl.wi v26, v25, 0
+; RV32-NEXT:    vsetivli a0, 2, e8,mf8,ta,mu
+; RV32-NEXT:    vnsrl.wi v25, v26, 0
+; RV32-NEXT:    vsetivli a0, 2, e8,mf4,ta,mu
+; RV32-NEXT:    vsoxei32.v v25, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v2i64_truncstore_v2i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; RV64-NEXT:    vnsrl.wi v25, v8, 0
+; RV64-NEXT:    vsetivli a0, 2, e16,mf4,ta,mu
+; RV64-NEXT:    vnsrl.wi v26, v25, 0
+; RV64-NEXT:    vsetivli a0, 2, e8,mf8,ta,mu
+; RV64-NEXT:    vnsrl.wi v25, v26, 0
+; RV64-NEXT:    vsetivli a0, 1, e8,mf8,ta,mu
+; RV64-NEXT:    vsoxei64.v v25, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  %tval = trunc <2 x i64> %val to <2 x i8>
+  call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %tval, <2 x i8*> %ptrs, i32 1, <2 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>)
+
+define void @mscatter_v4i8(<4 x i8> %val, <4 x i8*> %ptrs, <4 x i1> %m) {
+; RV32-LABEL: mscatter_v4i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e8,mf4,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v4i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e8,mf4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v10, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %val, <4 x i8*> %ptrs, i32 1, <4 x i1> %m)
+  ret void
+}
+
+define void @mscatter_truemask_v4i8(<4 x i8> %val, <4 x i8*> %ptrs) {
+; RV32-LABEL: mscatter_truemask_v4i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e8,mf4,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_truemask_v4i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e8,mf4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v10
+; RV64-NEXT:    ret
+  %mhead = insertelement <4 x i1> undef, i1 1, i32 0
+  %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> undef, <4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %val, <4 x i8*> %ptrs, i32 1, <4 x i1> %mtrue)
+  ret void
+}
+
+define void @mscatter_falsemask_v4i8(<4 x i8> %val, <4 x i8*> %ptrs) {
+; RV32-LABEL: mscatter_falsemask_v4i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_falsemask_v4i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %val, <4 x i8*> %ptrs, i32 1, <4 x i1> zeroinitializer)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
+
+define void @mscatter_v8i8(<8 x i8> %val, <8 x i8*> %ptrs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_v8i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e8,mf2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v10, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v8i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e8,mf2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v12, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %val, <8 x i8*> %ptrs, i32 1, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8i8(<8 x i8> %val, i8* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v9
+; RV32-NEXT:    vsetivli a1, 4, e8,mf2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v9
+; RV64-NEXT:    vsetivli a1, 4, e8,mf2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i8> %idxs
+  call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %val, <8 x i8*> %ptrs, i32 1, <8 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v1i16.v1p0i16(<1 x i16>, <1 x i16*>, i32, <1 x i1>)
+
+define void @mscatter_v1i16(<1 x i16> %val, <1 x i16*> %ptrs, <1 x i1> %m) {
+; RV32-LABEL: mscatter_v1i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v1i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e16,mf4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v1i16.v1p0i16(<1 x i16> %val, <1 x i16*> %ptrs, i32 2, <1 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16>, <2 x i16*>, i32, <2 x i1>)
+
+define void @mscatter_v2i16(<2 x i16> %val, <2 x i16*> %ptrs, <2 x i1> %m) {
+; RV32-LABEL: mscatter_v2i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v2i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e16,mf4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> %val, <2 x i16*> %ptrs, i32 2, <2 x i1> %m)
+  ret void
+}
+
+define void @mscatter_v2i32_truncstore_v2i16(<2 x i32> %val, <2 x i16*> %ptrs, <2 x i1> %m) {
+; RV32-LABEL: mscatter_v2i32_truncstore_v2i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV32-NEXT:    vnsrl.wi v25, v8, 0
+; RV32-NEXT:    vsoxei32.v v25, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v2i32_truncstore_v2i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV64-NEXT:    vnsrl.wi v25, v8, 0
+; RV64-NEXT:    vsetivli a0, 1, e16,mf4,ta,mu
+; RV64-NEXT:    vsoxei64.v v25, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  %tval = trunc <2 x i32> %val to <2 x i16>
+  call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> %tval, <2 x i16*> %ptrs, i32 2, <2 x i1> %m)
+  ret void
+}
+
+define void @mscatter_v2i64_truncstore_v2i16(<2 x i64> %val, <2 x i16*> %ptrs, <2 x i1> %m) {
+; RV32-LABEL: mscatter_v2i64_truncstore_v2i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; RV32-NEXT:    vnsrl.wi v25, v8, 0
+; RV32-NEXT:    vsetivli a0, 2, e16,mf4,ta,mu
+; RV32-NEXT:    vnsrl.wi v26, v25, 0
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV32-NEXT:    vsoxei32.v v26, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v2i64_truncstore_v2i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; RV64-NEXT:    vnsrl.wi v25, v8, 0
+; RV64-NEXT:    vsetivli a0, 2, e16,mf4,ta,mu
+; RV64-NEXT:    vnsrl.wi v26, v25, 0
+; RV64-NEXT:    vsetivli a0, 1, e16,mf4,ta,mu
+; RV64-NEXT:    vsoxei64.v v26, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  %tval = trunc <2 x i64> %val to <2 x i16>
+  call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> %tval, <2 x i16*> %ptrs, i32 2, <2 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>)
+
+define void @mscatter_v4i16(<4 x i16> %val, <4 x i16*> %ptrs, <4 x i1> %m) {
+; RV32-LABEL: mscatter_v4i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v4i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v10, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %val, <4 x i16*> %ptrs, i32 2, <4 x i1> %m)
+  ret void
+}
+
+define void @mscatter_truemask_v4i16(<4 x i16> %val, <4 x i16*> %ptrs) {
+; RV32-LABEL: mscatter_truemask_v4i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_truemask_v4i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v10
+; RV64-NEXT:    ret
+  %mhead = insertelement <4 x i1> undef, i1 1, i32 0
+  %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> undef, <4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %val, <4 x i16*> %ptrs, i32 2, <4 x i1> %mtrue)
+  ret void
+}
+
+define void @mscatter_falsemask_v4i16(<4 x i16> %val, <4 x i16*> %ptrs) {
+; RV32-LABEL: mscatter_falsemask_v4i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_falsemask_v4i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %val, <4 x i16*> %ptrs, i32 2, <4 x i1> zeroinitializer)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>)
+
+define void @mscatter_v8i16(<8 x i16> %val, <8 x i16*> %ptrs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_v8i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v10, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v8i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v12, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %val, <8 x i16*> %ptrs, i32 2, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8i8_v8i16(<8 x i16> %val, i16* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8i8_v8i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v9
+; RV32-NEXT:    vsll.vi v26, v26, 1
+; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8i8_v8i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v9
+; RV64-NEXT:    vsll.vi v28, v28, 1
+; RV64-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i8> %idxs
+  call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %val, <8 x i16*> %ptrs, i32 2, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_sext_v8i8_v8i16(<8 x i16> %val, i16* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_sext_v8i8_v8i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v9
+; RV32-NEXT:    vsll.vi v26, v26, 1
+; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_sext_v8i8_v8i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v9
+; RV64-NEXT:    vsll.vi v28, v28, 1
+; RV64-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i8> %idxs to <8 x i16>
+  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %eidxs
+  call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %val, <8 x i16*> %ptrs, i32 2, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_zext_v8i8_v8i16(<8 x i16> %val, i16* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vzext.vf4 v26, v9
+; RV32-NEXT:    vsll.vi v26, v26, 1
+; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_zext_v8i8_v8i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf8 v28, v9
+; RV64-NEXT:    vsll.vi v28, v28, 1
+; RV64-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i8> %idxs to <8 x i16>
+  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %eidxs
+  call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %val, <8 x i16*> %ptrs, i32 2, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8i16(<8 x i16> %val, i16* %base, <8 x i16> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf2 v26, v9
+; RV32-NEXT:    vsll.vi v26, v26, 1
+; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v9
+; RV64-NEXT:    vsll.vi v28, v28, 1
+; RV64-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %idxs
+  call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %val, <8 x i16*> %ptrs, i32 2, <8 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32>, <1 x i32*>, i32, <1 x i1>)
+
+define void @mscatter_v1i32(<1 x i32> %val, <1 x i32*> %ptrs, <1 x i1> %m) {
+; RV32-LABEL: mscatter_v1i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v1i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e32,mf2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32> %val, <1 x i32*> %ptrs, i32 4, <1 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32>, <2 x i32*>, i32, <2 x i1>)
+
+define void @mscatter_v2i32(<2 x i32> %val, <2 x i32*> %ptrs, <2 x i1> %m) {
+; RV32-LABEL: mscatter_v2i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v2i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e32,mf2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %val, <2 x i32*> %ptrs, i32 4, <2 x i1> %m)
+  ret void
+}
+
+define void @mscatter_v2i64_truncstore_v2i32(<2 x i64> %val, <2 x i32*> %ptrs, <2 x i1> %m) {
+; RV32-LABEL: mscatter_v2i64_truncstore_v2i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; RV32-NEXT:    vnsrl.wi v25, v8, 0
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v25, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v2i64_truncstore_v2i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; RV64-NEXT:    vnsrl.wi v25, v8, 0
+; RV64-NEXT:    vsetivli a0, 1, e32,mf2,ta,mu
+; RV64-NEXT:    vsoxei64.v v25, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  %tval = trunc <2 x i64> %val to <2 x i32>
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %tval, <2 x i32*> %ptrs, i32 4, <2 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
+
+define void @mscatter_v4i32(<4 x i32> %val, <4 x i32*> %ptrs, <4 x i1> %m) {
+; RV32-LABEL: mscatter_v4i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v4i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v10, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, i32 4, <4 x i1> %m)
+  ret void
+}
+
+define void @mscatter_truemask_v4i32(<4 x i32> %val, <4 x i32*> %ptrs) {
+; RV32-LABEL: mscatter_truemask_v4i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_truemask_v4i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v10
+; RV64-NEXT:    ret
+  %mhead = insertelement <4 x i1> undef, i1 1, i32 0
+  %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> undef, <4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, i32 4, <4 x i1> %mtrue)
+  ret void
+}
+
+define void @mscatter_falsemask_v4i32(<4 x i32> %val, <4 x i32*> %ptrs) {
+; RV32-LABEL: mscatter_falsemask_v4i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_falsemask_v4i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, i32 4, <4 x i1> zeroinitializer)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32>, <8 x i32*>, i32, <8 x i1>)
+
+define void @mscatter_v8i32(<8 x i32> %val, <8 x i32*> %ptrs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e32,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v10, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e32,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v12, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8i8_v8i32(<8 x i32> %val, i32* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8i8_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v10
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8i8_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v10
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i32, i32* %base, <8 x i8> %idxs
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_sext_v8i8_v8i32(<8 x i32> %val, i32* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_sext_v8i8_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v10
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_sext_v8i8_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v10
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i8> %idxs to <8 x i32>
+  %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, i32* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vzext.vf4 v26, v10
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_zext_v8i8_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf8 v28, v10
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i8> %idxs to <8 x i32>
+  %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8i16_v8i32(<8 x i32> %val, i32* %base, <8 x i16> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8i16_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf2 v26, v10
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8i16_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v10
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i32, i32* %base, <8 x i16> %idxs
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, i32* %base, <8 x i16> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_sext_v8i16_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf2 v26, v10
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_sext_v8i16_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v10
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i16> %idxs to <8 x i32>
+  %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, i32* %base, <8 x i16> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_zext_v8i16_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vzext.vf2 v26, v10
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_zext_v8i16_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf4 v28, v10
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i16> %idxs to <8 x i32>
+  %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8i32(<8 x i32> %val, i32* %base, <8 x i32> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsll.vi v26, v10, 2
+; RV32-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf2 v28, v10
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %idxs
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64>, <1 x i64*>, i32, <1 x i1>)
+
+define void @mscatter_v1i64(<1 x i64> %val, <1 x i64*> %ptrs, <1 x i1> %m) {
+; RV32-LABEL: mscatter_v1i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 1, e64,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v1i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e64,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> %val, <1 x i64*> %ptrs, i32 8, <1 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64>, <2 x i64*>, i32, <2 x i1>)
+
+define void @mscatter_v2i64(<2 x i64> %val, <2 x i64*> %ptrs, <2 x i1> %m) {
+; RV32-LABEL: mscatter_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 1, e64,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e64,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> %val, <2 x i64*> %ptrs, i32 8, <2 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64>, <4 x i64*>, i32, <4 x i1>)
+
+define void @mscatter_v4i64(<4 x i64> %val, <4 x i64*> %ptrs, <4 x i1> %m) {
+; RV32-LABEL: mscatter_v4i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e64,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v10, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v4i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e64,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v10, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> %val, <4 x i64*> %ptrs, i32 8, <4 x i1> %m)
+  ret void
+}
+
+define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x i64*> %ptrs) {
+; RV32-LABEL: mscatter_truemask_v4i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e64,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_truemask_v4i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e64,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v10
+; RV64-NEXT:    ret
+  %mhead = insertelement <4 x i1> undef, i1 1, i32 0
+  %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> undef, <4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> %val, <4 x i64*> %ptrs, i32 8, <4 x i1> %mtrue)
+  ret void
+}
+
+define void @mscatter_falsemask_v4i64(<4 x i64> %val, <4 x i64*> %ptrs) {
+; RV32-LABEL: mscatter_falsemask_v4i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_falsemask_v4i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> %val, <4 x i64*> %ptrs, i32 8, <4 x i1> zeroinitializer)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64>, <8 x i64*>, i32, <8 x i1>)
+
+define void @mscatter_v8i64(<8 x i64> %val, <8 x i64*> %ptrs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v12, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v12, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, i64* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8i8_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v12
+; RV32-NEXT:    vsll.vi v26, v26, 3
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8i8_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i8> %idxs
+  call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, i64* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_sext_v8i8_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsext.vf8 v28, v12
+; RV32-NEXT:    lui a1, %hi(.LCPI43_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI43_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v12
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_sext_v8i8_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i8> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs
+  call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, i64* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vzext.vf8 v28, v12
+; RV32-NEXT:    lui a1, %hi(.LCPI44_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI44_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v12
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_zext_v8i8_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf8 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i8> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs
+  call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, i64* %base, <8 x i16> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8i16_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf2 v26, v12
+; RV32-NEXT:    vsll.vi v26, v26, 3
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8i16_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i16> %idxs
+  call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, i64* %base, <8 x i16> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_sext_v8i16_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsext.vf4 v28, v12
+; RV32-NEXT:    lui a1, %hi(.LCPI46_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI46_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v12
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_sext_v8i16_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i16> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs
+  call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, i64* %base, <8 x i16> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_zext_v8i16_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vzext.vf4 v28, v12
+; RV32-NEXT:    lui a1, %hi(.LCPI47_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI47_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v12
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_zext_v8i16_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf4 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i16> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs
+  call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, i64* %base, <8 x i32> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8i32_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsll.vi v26, v12, 3
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8i32_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf2 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i32> %idxs
+  call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, i64* %base, <8 x i32> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_sext_v8i32_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsext.vf2 v28, v12
+; RV32-NEXT:    lui a1, %hi(.LCPI49_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI49_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v12
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_sext_v8i32_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf2 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i32> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs
+  call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, i64* %base, <8 x i32> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_zext_v8i32_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vzext.vf2 v28, v12
+; RV32-NEXT:    lui a1, %hi(.LCPI50_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI50_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v12
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_zext_v8i32_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf2 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i32> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs
+  call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8i64(<8 x i64> %val, i64* %base, <8 x i64> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lui a1, %hi(.LCPI51_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI51_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v28, (a1)
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v12, v28
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsll.vi v28, v12, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %idxs
+  call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v1f16.v1p0f16(<1 x half>, <1 x half*>, i32, <1 x i1>)
+
+define void @mscatter_v1f16(<1 x half> %val, <1 x half*> %ptrs, <1 x i1> %m) {
+; RV32-LABEL: mscatter_v1f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v1f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e16,mf4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v1f16.v1p0f16(<1 x half> %val, <1 x half*> %ptrs, i32 2, <1 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v2f16.v2p0f16(<2 x half>, <2 x half*>, i32, <2 x i1>)
+
+define void @mscatter_v2f16(<2 x half> %val, <2 x half*> %ptrs, <2 x i1> %m) {
+; RV32-LABEL: mscatter_v2f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v2f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e16,mf4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v2f16.v2p0f16(<2 x half> %val, <2 x half*> %ptrs, i32 2, <2 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>)
+
+define void @mscatter_v4f16(<4 x half> %val, <4 x half*> %ptrs, <4 x i1> %m) {
+; RV32-LABEL: mscatter_v4f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v4f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v10, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> %val, <4 x half*> %ptrs, i32 2, <4 x i1> %m)
+  ret void
+}
+
+define void @mscatter_truemask_v4f16(<4 x half> %val, <4 x half*> %ptrs) {
+; RV32-LABEL: mscatter_truemask_v4f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_truemask_v4f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e16,mf2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v10
+; RV64-NEXT:    ret
+  %mhead = insertelement <4 x i1> undef, i1 1, i32 0
+  %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> undef, <4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> %val, <4 x half*> %ptrs, i32 2, <4 x i1> %mtrue)
+  ret void
+}
+
+define void @mscatter_falsemask_v4f16(<4 x half> %val, <4 x half*> %ptrs) {
+; RV32-LABEL: mscatter_falsemask_v4f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_falsemask_v4f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> %val, <4 x half*> %ptrs, i32 2, <4 x i1> zeroinitializer)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>)
+
+define void @mscatter_v8f16(<8 x half> %val, <8 x half*> %ptrs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_v8f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v10, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v8f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e16,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v12, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %val, <8 x half*> %ptrs, i32 2, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, half* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8i8_v8f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v9
+; RV32-NEXT:    vsll.vi v26, v26, 1
+; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8i8_v8f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v9
+; RV64-NEXT:    vsll.vi v28, v28, 1
+; RV64-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds half, half* %base, <8 x i8> %idxs
+  call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %val, <8 x half*> %ptrs, i32 2, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, half* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_sext_v8i8_v8f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v9
+; RV32-NEXT:    vsll.vi v26, v26, 1
+; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_sext_v8i8_v8f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v9
+; RV64-NEXT:    vsll.vi v28, v28, 1
+; RV64-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i8> %idxs to <8 x i16>
+  %ptrs = getelementptr inbounds half, half* %base, <8 x i16> %eidxs
+  call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %val, <8 x half*> %ptrs, i32 2, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, half* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vzext.vf4 v26, v9
+; RV32-NEXT:    vsll.vi v26, v26, 1
+; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_zext_v8i8_v8f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf8 v28, v9
+; RV64-NEXT:    vsll.vi v28, v28, 1
+; RV64-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i8> %idxs to <8 x i16>
+  %ptrs = getelementptr inbounds half, half* %base, <8 x i16> %eidxs
+  call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %val, <8 x half*> %ptrs, i32 2, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8f16(<8 x half> %val, half* %base, <8 x i16> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf2 v26, v9
+; RV32-NEXT:    vsll.vi v26, v26, 1
+; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v9
+; RV64-NEXT:    vsll.vi v28, v28, 1
+; RV64-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds half, half* %base, <8 x i16> %idxs
+  call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %val, <8 x half*> %ptrs, i32 2, <8 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v1f32.v1p0f32(<1 x float>, <1 x float*>, i32, <1 x i1>)
+
+define void @mscatter_v1f32(<1 x float> %val, <1 x float*> %ptrs, <1 x i1> %m) {
+; RV32-LABEL: mscatter_v1f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v1f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e32,mf2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v1f32.v1p0f32(<1 x float> %val, <1 x float*> %ptrs, i32 4, <1 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float>, <2 x float*>, i32, <2 x i1>)
+
+define void @mscatter_v2f32(<2 x float> %val, <2 x float*> %ptrs, <2 x i1> %m) {
+; RV32-LABEL: mscatter_v2f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v2f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e32,mf2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %val, <2 x float*> %ptrs, i32 4, <2 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
+
+define void @mscatter_v4f32(<4 x float> %val, <4 x float*> %ptrs, <4 x i1> %m) {
+; RV32-LABEL: mscatter_v4f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v4f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v10, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %val, <4 x float*> %ptrs, i32 4, <4 x i1> %m)
+  ret void
+}
+
+define void @mscatter_truemask_v4f32(<4 x float> %val, <4 x float*> %ptrs) {
+; RV32-LABEL: mscatter_truemask_v4f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_truemask_v4f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e32,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v10
+; RV64-NEXT:    ret
+  %mhead = insertelement <4 x i1> undef, i1 1, i32 0
+  %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> undef, <4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %val, <4 x float*> %ptrs, i32 4, <4 x i1> %mtrue)
+  ret void
+}
+
+define void @mscatter_falsemask_v4f32(<4 x float> %val, <4 x float*> %ptrs) {
+; RV32-LABEL: mscatter_falsemask_v4f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_falsemask_v4f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %val, <4 x float*> %ptrs, i32 4, <4 x i1> zeroinitializer)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float>, <8 x float*>, i32, <8 x i1>)
+
+define void @mscatter_v8f32(<8 x float> %val, <8 x float*> %ptrs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_v8f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e32,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v10, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v8f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e32,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v12, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8i8_v8f32(<8 x float> %val, float* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8i8_v8f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v10
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8i8_v8f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v10
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds float, float* %base, <8 x i8> %idxs
+  call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_sext_v8i8_v8f32(<8 x float> %val, float* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_sext_v8i8_v8f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v10
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_sext_v8i8_v8f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v10
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i8> %idxs to <8 x i32>
+  %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs
+  call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, float* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vzext.vf4 v26, v10
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_zext_v8i8_v8f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf8 v28, v10
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i8> %idxs to <8 x i32>
+  %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs
+  call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8i16_v8f32(<8 x float> %val, float* %base, <8 x i16> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8i16_v8f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf2 v26, v10
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8i16_v8f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v10
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds float, float* %base, <8 x i16> %idxs
+  call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, float* %base, <8 x i16> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_sext_v8i16_v8f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf2 v26, v10
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_sext_v8i16_v8f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v10
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i16> %idxs to <8 x i32>
+  %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs
+  call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, float* %base, <8 x i16> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_zext_v8i16_v8f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vzext.vf2 v26, v10
+; RV32-NEXT:    vsll.vi v26, v26, 2
+; RV32-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_zext_v8i16_v8f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf4 v28, v10
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i16> %idxs to <8 x i32>
+  %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs
+  call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8f32(<8 x float> %val, float* %base, <8 x i32> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsll.vi v26, v10, 2
+; RV32-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf2 v28, v10
+; RV64-NEXT:    vsll.vi v28, v28, 2
+; RV64-NEXT:    vsetivli a1, 4, e32,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %idxs
+  call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double>, <1 x double*>, i32, <1 x i1>)
+
+define void @mscatter_v1f64(<1 x double> %val, <1 x double*> %ptrs, <1 x i1> %m) {
+; RV32-LABEL: mscatter_v1f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 1, e64,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v1f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e64,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> %val, <1 x double*> %ptrs, i32 8, <1 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>)
+
+define void @mscatter_v2f64(<2 x double> %val, <2 x double*> %ptrs, <2 x i1> %m) {
+; RV32-LABEL: mscatter_v2f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 1, e64,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v2f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 1, e64,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %val, <2 x double*> %ptrs, i32 8, <2 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double>, <4 x double*>, i32, <4 x i1>)
+
+define void @mscatter_v4f64(<4 x double> %val, <4 x double*> %ptrs, <4 x i1> %m) {
+; RV32-LABEL: mscatter_v4f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e64,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v10, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v4f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e64,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v10, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %val, <4 x double*> %ptrs, i32 8, <4 x i1> %m)
+  ret void
+}
+
+define void @mscatter_truemask_v4f64(<4 x double> %val, <4 x double*> %ptrs) {
+; RV32-LABEL: mscatter_truemask_v4f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 2, e64,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_truemask_v4f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 2, e64,m2,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v10
+; RV64-NEXT:    ret
+  %mhead = insertelement <4 x i1> undef, i1 1, i32 0
+  %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> undef, <4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %val, <4 x double*> %ptrs, i32 8, <4 x i1> %mtrue)
+  ret void
+}
+
+define void @mscatter_falsemask_v4f64(<4 x double> %val, <4 x double*> %ptrs) {
+; RV32-LABEL: mscatter_falsemask_v4f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_falsemask_v4f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %val, <4 x double*> %ptrs, i32 8, <4 x i1> zeroinitializer)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double>, <8 x double*>, i32, <8 x i1>)
+
+define void @mscatter_v8f64(<8 x double> %val, <8 x double*> %ptrs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a0, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (zero), v12, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a0, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (zero), v12, v0.t
+; RV64-NEXT:    ret
+  call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8i8_v8f64(<8 x double> %val, double* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8i8_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf4 v26, v12
+; RV32-NEXT:    vsll.vi v26, v26, 3
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8i8_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i8> %idxs
+  call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, double* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_sext_v8i8_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsext.vf8 v28, v12
+; RV32-NEXT:    lui a1, %hi(.LCPI82_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI82_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v12
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_sext_v8i8_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf8 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i8> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs
+  call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, double* %base, <8 x i8> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vzext.vf8 v28, v12
+; RV32-NEXT:    lui a1, %hi(.LCPI83_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI83_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v12
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_zext_v8i8_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf8 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i8> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs
+  call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, double* %base, <8 x i16> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8i16_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsext.vf2 v26, v12
+; RV32-NEXT:    vsll.vi v26, v26, 3
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8i16_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i16> %idxs
+  call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, double* %base, <8 x i16> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_sext_v8i16_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsext.vf4 v28, v12
+; RV32-NEXT:    lui a1, %hi(.LCPI85_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI85_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v12
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_sext_v8i16_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf4 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i16> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs
+  call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, double* %base, <8 x i16> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_zext_v8i16_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vzext.vf4 v28, v12
+; RV32-NEXT:    lui a1, %hi(.LCPI86_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI86_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v12
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_zext_v8i16_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf4 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i16> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs
+  call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8i32_v8f64(<8 x double> %val, double* %base, <8 x i32> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8i32_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; RV32-NEXT:    vsll.vi v26, v12, 3
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v26, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8i32_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf2 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i32> %idxs
+  call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, double* %base, <8 x i32> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_sext_v8i32_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsext.vf2 v28, v12
+; RV32-NEXT:    lui a1, %hi(.LCPI88_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI88_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v12
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_sext_v8i32_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsext.vf2 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = sext <8 x i32> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs
+  call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, double* %base, <8 x i32> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_zext_v8i32_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vzext.vf2 v28, v12
+; RV32-NEXT:    lui a1, %hi(.LCPI89_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI89_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v12, (a1)
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v28, v12
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_zext_v8i32_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vzext.vf2 v28, v12
+; RV64-NEXT:    vsll.vi v28, v28, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %eidxs = zext <8 x i32> %idxs to <8 x i64>
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs
+  call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+define void @mscatter_baseidx_v8f64(<8 x double> %val, double* %base, <8 x i64> %idxs, <8 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v8f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lui a1, %hi(.LCPI90_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI90_0)
+; RV32-NEXT:    vsetivli a2, 16, e32,m4,ta,mu
+; RV32-NEXT:    vle32.v v28, (a1)
+; RV32-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV32-NEXT:    vsll.vv v28, v12, v28
+; RV32-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV32-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v8f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT:    vsll.vi v28, v12, 3
+; RV64-NEXT:    vsetivli a1, 4, e64,m4,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v28, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %idxs
+  call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>)
+
+define void @mscatter_baseidx_v16i8(<16 x i8> %val, i8* %base, <16 x i8> %idxs, <16 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v16i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli a1, 16, e32,m4,ta,mu
+; RV32-NEXT:    vsext.vf4 v28, v9
+; RV32-NEXT:    vsetivli a1, 8, e8,m1,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v28, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v16i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 16, e64,m8,ta,mu
+; RV64-NEXT:    vsext.vf8 v16, v9
+; RV64-NEXT:    vsetivli a1, 8, e8,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %idxs
+  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %val, <16 x i8*> %ptrs, i32 1, <16 x i1> %m)
+  ret void
+}
+
+declare void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8>, <32 x i8*>, i32, <32 x i1>)
+
+define void @mscatter_baseidx_v32i8(<32 x i8> %val, i8* %base, <32 x i8> %idxs, <32 x i1> %m) {
+; RV32-LABEL: mscatter_baseidx_v32i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi a1, zero, 32
+; RV32-NEXT:    vsetvli a1, a1, e32,m8,ta,mu
+; RV32-NEXT:    vsext.vf4 v16, v10
+; RV32-NEXT:    vsetivli a1, 16, e8,m2,ta,mu
+; RV32-NEXT:    vsoxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: mscatter_baseidx_v32i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 16, e64,m8,ta,mu
+; RV64-NEXT:    vsext.vf8 v16, v10
+; RV64-NEXT:    vsetivli a1, 8, e8,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT:    vsetivli a1, 16, e8,m2,ta,mu
+; RV64-NEXT:    vslidedown.vi v26, v10, 16
+; RV64-NEXT:    vsetivli a1, 16, e64,m8,ta,mu
+; RV64-NEXT:    vsext.vf8 v16, v26
+; RV64-NEXT:    vsetivli a1, 16, e8,m2,ta,mu
+; RV64-NEXT:    vslidedown.vi v26, v8, 16
+; RV64-NEXT:    vsetivli a1, 2, e8,m1,ta,mu
+; RV64-NEXT:    vslidedown.vi v0, v0, 2
+; RV64-NEXT:    vsetivli a1, 8, e8,m1,ta,mu
+; RV64-NEXT:    vsoxei64.v v26, (a0), v16, v0.t
+; RV64-NEXT:    ret
+  %ptrs = getelementptr inbounds i8, i8* %base, <32 x i8> %idxs
+  call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> %val, <32 x i8*> %ptrs, i32 1, <32 x i1> %m)
+  ret void
+}