[llvm] 0331399 - [RISCV] Support scalable-vector masked gather operations
Fraser Cormack via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 18 02:33:02 PDT 2021
Author: Fraser Cormack
Date: 2021-03-18T09:26:18Z
New Revision: 0331399dc9346f3c5acdf784ddb96567efc9d538
URL: https://github.com/llvm/llvm-project/commit/0331399dc9346f3c5acdf784ddb96567efc9d538
DIFF: https://github.com/llvm/llvm-project/commit/0331399dc9346f3c5acdf784ddb96567efc9d538.diff
LOG: [RISCV] Support scalable-vector masked gather operations
This patch supports the masked gather intrinsics in RVV.
The RVV indexed load/store instructions only support the "unsigned unscaled"
addressing mode; indices are implicitly zero-extended or truncated to XLEN and
are treated as byte offsets. The ISA supports the intrinsics directly, but not
most of the forms of the MGATHER SDNode that LLVM combines them into. Any
signed or scaled indexing is extended to the XLEN value type and scaled
accordingly. This is done during DAG combining as widening the index types to
XLEN may produce illegal vectors that require splitting, e.g.
nxv16i8->nxv16i64.
Support for scalable-vector CONCAT_VECTORS was added to avoid spilling via the
stack when lowering split legalized index operands.
Reviewed By: craig.topper
Differential Revision: https://reviews.llvm.org/D96263
Added:
llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
Modified:
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index b54e2ce73fd1..ee686102c147 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -474,6 +474,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+ setOperationAction(ISD::MGATHER, VT, Custom);
+
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
@@ -513,6 +516,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
setOperationAction(ISD::FCOPYSIGN, VT, Legal);
+ setOperationAction(ISD::MGATHER, VT, Custom);
+
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
@@ -686,8 +692,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.hasStdExtZbp()) {
setTargetDAGCombine(ISD::OR);
}
- if (Subtarget.hasStdExtV())
+ if (Subtarget.hasStdExtV()) {
setTargetDAGCombine(ISD::FCOPYSIGN);
+ setTargetDAGCombine(ISD::MGATHER);
+ }
}
EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL,
@@ -1629,9 +1637,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
// better than going through the stack, as the default expansion does.
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
- assert(VT.isFixedLengthVector() && "Unexpected CONCAT_VECTORS lowering");
unsigned NumOpElts =
- Op.getOperand(0).getSimpleValueType().getVectorNumElements();
+ Op.getOperand(0).getSimpleValueType().getVectorMinNumElements();
SDValue Vec = DAG.getUNDEF(VT);
for (const auto &OpIdx : enumerate(Op->ops()))
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Vec, OpIdx.value(),
@@ -1711,6 +1718,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerFixedLengthVectorSelectToRVV(Op, DAG);
case ISD::FCOPYSIGN:
return lowerFixedLengthVectorFCOPYSIGNToRVV(Op, DAG);
+ case ISD::MGATHER:
+ return lowerMGATHER(Op, DAG);
}
}
@@ -3453,6 +3462,46 @@ SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op, SelectionDAG &DAG,
return convertFromScalableVector(VT, ScalableRes, DAG, Subtarget);
}
+// Custom lower MGATHER to the riscv_vloxei (or riscv_vloxei_mask) intrinsic,
+// which is then matched to an RVV indexed load. Those instructions only
+// support "unsigned unscaled" addressing: indices are implicitly zero-extended
+// or truncated to XLEN and used as byte offsets. Signed or scaled indices are
+// expected to have been rewritten already by the MGATHER DAG combine.
+SDValue RISCVTargetLowering::lowerMGATHER(SDValue Op, SelectionDAG &DAG) const {
+ MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue Index = N->getIndex();
+ SDValue Mask = N->getMask();
+ SDValue PassThru = N->getPassThru();
+
+ MVT XLenVT = Subtarget.getXLenVT();
+ assert(N->getBasePtr().getSimpleValueType() == XLenVT &&
+ "Unexpected pointer type");
+ // Targets have to explicitly opt-in for extending vector loads.
+ assert(N->getExtensionType() == ISD::NON_EXTLOAD &&
+ "Unexpected extending MGATHER");
+
+ SDValue VL = getDefaultVLOps(VT, VT, DL, DAG, Subtarget).second;
+ // If the mask is known to be all ones, optimize to an unmasked intrinsic;
+ // the selection of the masked intrinsics doesn't do this for us.
+ if (ISD::isConstantSplatVectorAllOnes(Mask.getNode())) {
+ SDValue IntID = DAG.getTargetConstant(Intrinsic::riscv_vloxei, DL, XLenVT);
+ SDValue Ops[] = {N->getChain(), IntID, N->getBasePtr(), Index, VL};
+ return DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL,
+ DAG.getVTList(VT, MVT::Other), Ops,
+ N->getMemoryVT(), N->getMemOperand());
+ }
+
+ SDValue IntID =
+ DAG.getTargetConstant(Intrinsic::riscv_vloxei_mask, DL, XLenVT);
+ SDValue Ops[] = {N->getChain(), IntID, PassThru, N->getBasePtr(),
+ Index, Mask, VL};
+ return DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL,
+ DAG.getVTList(VT, MVT::Other), Ops,
+ N->getMemoryVT(), N->getMemOperand());
+}
+
// Returns the opcode of the target-specific SDNode that implements the 32-bit
// form of the given Opcode.
static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) {
@@ -4470,6 +4519,49 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N->getOperand(0),
DAG.getNode(ISD::FNEG, DL, VT, NewFPExtRound));
}
+ case ISD::MGATHER: {
+ if (!DCI.isBeforeLegalize())
+ break;
+ MaskedGatherSDNode *MGN = cast<MaskedGatherSDNode>(N);
+ SDValue Index = MGN->getIndex();
+ EVT IndexVT = Index.getValueType();
+ MVT XLenVT = Subtarget.getXLenVT();
+ // RISCV indexed loads only support the "unsigned unscaled" addressing
+ // mode, so anything else must be manually legalized.
+ bool NeedsIdxLegalization =
+ MGN->isIndexScaled() ||
+ (MGN->isIndexSigned() && IndexVT.getVectorElementType().bitsLT(XLenVT));
+ if (!NeedsIdxLegalization)
+ break;
+
+ SDLoc DL(N);
+
+ // Any index legalization should first promote to XLenVT, so we don't lose
+ // bits when scaling. This may create an illegal index type so we let
+ // LLVM's legalization take care of the splitting.
+ if (IndexVT.getVectorElementType().bitsLT(XLenVT)) {
+ IndexVT = IndexVT.changeVectorElementType(XLenVT);
+ Index = DAG.getNode(MGN->isIndexSigned() ? ISD::SIGN_EXTEND
+ : ISD::ZERO_EXTEND,
+ DL, IndexVT, Index);
+ }
+
+ unsigned Scale = N->getConstantOperandVal(5);
+ if (MGN->isIndexScaled() && Scale != 1) {
+ // Manually scale the indices by the element size.
+ // TODO: Sanitize the scale operand here?
+ assert(isPowerOf2_32(Scale) && "Expecting power-of-two types");
+ SDValue SplatScale = DAG.getConstant(Log2_32(Scale), DL, IndexVT);
+ Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index, SplatScale);
+ }
+
+ ISD::MemIndexType NewIndexTy = ISD::UNSIGNED_UNSCALED;
+ return DAG.getMaskedGather(
+ N->getVTList(), MGN->getMemoryVT(), DL,
+ {MGN->getChain(), MGN->getPassThru(), MGN->getMask(), MGN->getBasePtr(),
+ Index, MGN->getScale()},
+ MGN->getMemOperand(), NewIndexTy, MGN->getExtensionType());
+ }
}
return SDValue();
@@ -6890,6 +6982,10 @@ Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
return Result;
}
+bool RISCVTargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
+ return false;
+}
+
bool RISCVTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const {
VT = VT.getScalarType();
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index d454df95b630..1aea84dd258a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -432,6 +432,8 @@ class RISCVTargetLowering : public TargetLowering {
static MVT getContainerForFixedLengthVector(SelectionDAG &DAG, MVT VT,
const RISCVSubtarget &Subtarget);
+ bool shouldRemoveExtendFromGSIndex(EVT VT) const override;
+
private:
void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -475,6 +477,7 @@ class RISCVTargetLowering : public TargetLowering {
SDValue lowerABS(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFixedLengthVectorFCOPYSIGNToRVV(SDValue Op,
SelectionDAG &DAG) const;
+ SDValue lowerMGATHER(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFixedLengthVectorLoadToRVV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFixedLengthVectorStoreToRVV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFixedLengthVectorMaskedLoadToRVV(SDValue Op,
diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
new file mode 100644
index 000000000000..c5f9ea8aa3e3
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
@@ -0,0 +1,2194 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+experimental-zfh,+experimental-v -target-abi=ilp32d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+experimental-zfh,+experimental-v -target-abi=lp64d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64
+
+declare <vscale x 1 x i8> @llvm.masked.gather.nxv1i8.nxv1p0i8(<vscale x 1 x i8*>, i32, <vscale x 1 x i1>, <vscale x 1 x i8>)
+
+define <vscale x 1 x i8> @mgather_nxv1i8(<vscale x 1 x i8*> %ptrs, <vscale x 1 x i1> %m, <vscale x 1 x i8> %passthru) {
+; RV32-LABEL: mgather_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8,mf8,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vmv1r.v v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8,mf8,tu,mu
+; RV64-NEXT: vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT: vmv1r.v v8, v9
+; RV64-NEXT: ret
+ %v = call <vscale x 1 x i8> @llvm.masked.gather.nxv1i8.nxv1p0i8(<vscale x 1 x i8*> %ptrs, i32 1, <vscale x 1 x i1> %m, <vscale x 1 x i8> %passthru)
+ ret <vscale x 1 x i8> %v
+}
+
+declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+
+define <vscale x 2 x i8> @mgather_nxv2i8(<vscale x 2 x i8*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x i8> %passthru) {
+; RV32-LABEL: mgather_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8,mf4,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vmv1r.v v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8,mf4,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vmv1r.v v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %m, <vscale x 2 x i8> %passthru)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i16> @mgather_nxv2i8_sextload_nxv2i16(<vscale x 2 x i8*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x i8> %passthru) {
+; RV32-LABEL: mgather_nxv2i8_sextload_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8,mf4,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vsetvli a0, zero, e16,mf2,ta,mu
+; RV32-NEXT: vsext.vf2 v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2i8_sextload_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8,mf4,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vsetvli a0, zero, e16,mf2,ta,mu
+; RV64-NEXT: vsext.vf2 v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %m, <vscale x 2 x i8> %passthru)
+ %ev = sext <vscale x 2 x i8> %v to <vscale x 2 x i16>
+ ret <vscale x 2 x i16> %ev
+}
+
+define <vscale x 2 x i16> @mgather_nxv2i8_zextload_nxv2i16(<vscale x 2 x i8*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x i8> %passthru) {
+; RV32-LABEL: mgather_nxv2i8_zextload_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8,mf4,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vsetvli a0, zero, e16,mf2,ta,mu
+; RV32-NEXT: vzext.vf2 v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2i8_zextload_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8,mf4,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vsetvli a0, zero, e16,mf2,ta,mu
+; RV64-NEXT: vzext.vf2 v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %m, <vscale x 2 x i8> %passthru)
+ %ev = zext <vscale x 2 x i8> %v to <vscale x 2 x i16>
+ ret <vscale x 2 x i16> %ev
+}
+
+define <vscale x 2 x i32> @mgather_nxv2i8_sextload_nxv2i32(<vscale x 2 x i8*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x i8> %passthru) {
+; RV32-LABEL: mgather_nxv2i8_sextload_nxv2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8,mf4,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu
+; RV32-NEXT: vsext.vf4 v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2i8_sextload_nxv2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8,mf4,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu
+; RV64-NEXT: vsext.vf4 v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %m, <vscale x 2 x i8> %passthru)
+ %ev = sext <vscale x 2 x i8> %v to <vscale x 2 x i32>
+ ret <vscale x 2 x i32> %ev
+}
+
+define <vscale x 2 x i32> @mgather_nxv2i8_zextload_nxv2i32(<vscale x 2 x i8*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x i8> %passthru) {
+; RV32-LABEL: mgather_nxv2i8_zextload_nxv2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8,mf4,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu
+; RV32-NEXT: vzext.vf4 v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2i8_zextload_nxv2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8,mf4,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu
+; RV64-NEXT: vzext.vf4 v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %m, <vscale x 2 x i8> %passthru)
+ %ev = zext <vscale x 2 x i8> %v to <vscale x 2 x i32>
+ ret <vscale x 2 x i32> %ev
+}
+
+define <vscale x 2 x i64> @mgather_nxv2i8_sextload_nxv2i64(<vscale x 2 x i8*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x i8> %passthru) {
+; RV32-LABEL: mgather_nxv2i8_sextload_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8,mf4,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu
+; RV32-NEXT: vsext.vf8 v26, v9
+; RV32-NEXT: vmv2r.v v8, v26
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2i8_sextload_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8,mf4,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu
+; RV64-NEXT: vsext.vf8 v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %m, <vscale x 2 x i8> %passthru)
+ %ev = sext <vscale x 2 x i8> %v to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ev
+}
+
+define <vscale x 2 x i64> @mgather_nxv2i8_zextload_nxv2i64(<vscale x 2 x i8*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x i8> %passthru) {
+; RV32-LABEL: mgather_nxv2i8_zextload_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8,mf4,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu
+; RV32-NEXT: vzext.vf8 v26, v9
+; RV32-NEXT: vmv2r.v v8, v26
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2i8_zextload_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8,mf4,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu
+; RV64-NEXT: vzext.vf8 v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %m, <vscale x 2 x i8> %passthru)
+ %ev = zext <vscale x 2 x i8> %v to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ev
+}
+
+declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
+
+define <vscale x 4 x i8> @mgather_nxv4i8(<vscale x 4 x i8*> %ptrs, <vscale x 4 x i1> %m, <vscale x 4 x i8> %passthru) {
+; RV32-LABEL: mgather_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8,mf2,tu,mu
+; RV32-NEXT: vloxei32.v v10, (zero), v8, v0.t
+; RV32-NEXT: vmv1r.v v8, v10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8,mf2,tu,mu
+; RV64-NEXT: vloxei64.v v12, (zero), v8, v0.t
+; RV64-NEXT: vmv1r.v v8, v12
+; RV64-NEXT: ret
+ %v = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %m, <vscale x 4 x i8> %passthru)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @mgather_truemask_nxv4i8(<vscale x 4 x i8*> %ptrs, <vscale x 4 x i8> %passthru) {
+; RV32-LABEL: mgather_truemask_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8,mf2,ta,mu
+; RV32-NEXT: vloxei32.v v8, (zero), v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_truemask_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8,mf2,ta,mu
+; RV64-NEXT: vloxei64.v v8, (zero), v8
+; RV64-NEXT: ret
+ %mhead = insertelement <vscale x 4 x i1> undef, i1 1, i32 0
+ %mtrue = shufflevector <vscale x 4 x i1> %mhead, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mtrue, <vscale x 4 x i8> %passthru)
+ ret <vscale x 4 x i8> %v
+}
+
+declare <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0i8(<vscale x 8 x i8*>, i32, <vscale x 8 x i1>, <vscale x 8 x i8>)
+
+define <vscale x 8 x i8> @mgather_nxv8i8(<vscale x 8 x i8*> %ptrs, <vscale x 8 x i1> %m, <vscale x 8 x i8> %passthru) {
+; RV32-LABEL: mgather_nxv8i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8,m1,tu,mu
+; RV32-NEXT: vloxei32.v v12, (zero), v8, v0.t
+; RV32-NEXT: vmv1r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv8i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8,m1,tu,mu
+; RV64-NEXT: vloxei64.v v16, (zero), v8, v0.t
+; RV64-NEXT: vmv1r.v v8, v16
+; RV64-NEXT: ret
+ %v = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0i8(<vscale x 8 x i8*> %ptrs, i32 1, <vscale x 8 x i1> %m, <vscale x 8 x i8> %passthru)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @mgather_baseidx_nxv8i8(i8* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i8> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf4 v28, v8
+; RV32-NEXT: vsetvli a1, zero, e8,m1,tu,mu
+; RV32-NEXT: vloxei32.v v9, (a0), v28, v0.t
+; RV32-NEXT: vmv1r.v v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v16, v8
+; RV64-NEXT: vsetvli a1, zero, e8,m1,tu,mu
+; RV64-NEXT: vloxei64.v v9, (a0), v16, v0.t
+; RV64-NEXT: vmv1r.v v8, v9
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds i8, i8* %base, <vscale x 8 x i8> %idxs
+ %v = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0i8(<vscale x 8 x i8*> %ptrs, i32 1, <vscale x 8 x i1> %m, <vscale x 8 x i8> %passthru)
+ ret <vscale x 8 x i8> %v
+}
+
+declare <vscale x 1 x i16> @llvm.masked.gather.nxv1i16.nxv1p0i16(<vscale x 1 x i16*>, i32, <vscale x 1 x i1>, <vscale x 1 x i16>)
+
+define <vscale x 1 x i16> @mgather_nxv1i16(<vscale x 1 x i16*> %ptrs, <vscale x 1 x i1> %m, <vscale x 1 x i16> %passthru) {
+; RV32-LABEL: mgather_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16,mf4,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vmv1r.v v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16,mf4,tu,mu
+; RV64-NEXT: vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT: vmv1r.v v8, v9
+; RV64-NEXT: ret
+ %v = call <vscale x 1 x i16> @llvm.masked.gather.nxv1i16.nxv1p0i16(<vscale x 1 x i16*> %ptrs, i32 2, <vscale x 1 x i1> %m, <vscale x 1 x i16> %passthru)
+ ret <vscale x 1 x i16> %v
+}
+
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+
+define <vscale x 2 x i16> @mgather_nxv2i16(<vscale x 2 x i16*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x i16> %passthru) {
+; RV32-LABEL: mgather_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16,mf2,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vmv1r.v v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16,mf2,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vmv1r.v v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %m, <vscale x 2 x i16> %passthru)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i32> @mgather_nxv2i16_sextload_nxv2i32(<vscale x 2 x i16*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x i16> %passthru) {
+; RV32-LABEL: mgather_nxv2i16_sextload_nxv2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16,mf2,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu
+; RV32-NEXT: vsext.vf2 v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2i16_sextload_nxv2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16,mf2,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu
+; RV64-NEXT: vsext.vf2 v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %m, <vscale x 2 x i16> %passthru)
+ %ev = sext <vscale x 2 x i16> %v to <vscale x 2 x i32>
+ ret <vscale x 2 x i32> %ev
+}
+
+define <vscale x 2 x i32> @mgather_nxv2i16_zextload_nxv2i32(<vscale x 2 x i16*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x i16> %passthru) {
+; RV32-LABEL: mgather_nxv2i16_zextload_nxv2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16,mf2,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu
+; RV32-NEXT: vzext.vf2 v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2i16_zextload_nxv2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16,mf2,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu
+; RV64-NEXT: vzext.vf2 v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %m, <vscale x 2 x i16> %passthru)
+ %ev = zext <vscale x 2 x i16> %v to <vscale x 2 x i32>
+ ret <vscale x 2 x i32> %ev
+}
+
+define <vscale x 2 x i64> @mgather_nxv2i16_sextload_nxv2i64(<vscale x 2 x i16*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x i16> %passthru) {
+; RV32-LABEL: mgather_nxv2i16_sextload_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16,mf2,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu
+; RV32-NEXT: vsext.vf4 v26, v9
+; RV32-NEXT: vmv2r.v v8, v26
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2i16_sextload_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16,mf2,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu
+; RV64-NEXT: vsext.vf4 v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %m, <vscale x 2 x i16> %passthru)
+ %ev = sext <vscale x 2 x i16> %v to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ev
+}
+
+define <vscale x 2 x i64> @mgather_nxv2i16_zextload_nxv2i64(<vscale x 2 x i16*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x i16> %passthru) {
+; RV32-LABEL: mgather_nxv2i16_zextload_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16,mf2,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu
+; RV32-NEXT: vzext.vf4 v26, v9
+; RV32-NEXT: vmv2r.v v8, v26
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2i16_zextload_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16,mf2,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu
+; RV64-NEXT: vzext.vf4 v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %m, <vscale x 2 x i16> %passthru)
+ %ev = zext <vscale x 2 x i16> %v to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ev
+}
+
+declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+
+define <vscale x 4 x i16> @mgather_nxv4i16(<vscale x 4 x i16*> %ptrs, <vscale x 4 x i1> %m, <vscale x 4 x i16> %passthru) {
+; RV32-LABEL: mgather_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16,m1,tu,mu
+; RV32-NEXT: vloxei32.v v10, (zero), v8, v0.t
+; RV32-NEXT: vmv1r.v v8, v10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16,m1,tu,mu
+; RV64-NEXT: vloxei64.v v12, (zero), v8, v0.t
+; RV64-NEXT: vmv1r.v v8, v12
+; RV64-NEXT: ret
+ %v = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %m, <vscale x 4 x i16> %passthru)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @mgather_truemask_nxv4i16(<vscale x 4 x i16*> %ptrs, <vscale x 4 x i16> %passthru) {
+; RV32-LABEL: mgather_truemask_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16,m1,ta,mu
+; RV32-NEXT: vloxei32.v v8, (zero), v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_truemask_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16,m1,ta,mu
+; RV64-NEXT: vloxei64.v v8, (zero), v8
+; RV64-NEXT: ret
+ %mhead = insertelement <vscale x 4 x i1> undef, i1 1, i32 0
+ %mtrue = shufflevector <vscale x 4 x i1> %mhead, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mtrue, <vscale x 4 x i16> %passthru)
+ ret <vscale x 4 x i16> %v
+}
+
+declare <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*>, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
+
+define <vscale x 8 x i16> @mgather_nxv8i16(<vscale x 8 x i16*> %ptrs, <vscale x 8 x i1> %m, <vscale x 8 x i16> %passthru) {
+; RV32-LABEL: mgather_nxv8i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16,m2,tu,mu
+; RV32-NEXT: vloxei32.v v12, (zero), v8, v0.t
+; RV32-NEXT: vmv2r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv8i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16,m2,tu,mu
+; RV64-NEXT: vloxei64.v v16, (zero), v8, v0.t
+; RV64-NEXT: vmv2r.v v8, v16
+; RV64-NEXT: ret
+ %v = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> %ptrs, i32 2, <vscale x 8 x i1> %m, <vscale x 8 x i16> %passthru)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @mgather_baseidx_nxv8i8_nxv8i16(i16* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i16> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8i8_nxv8i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf4 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 1
+; RV32-NEXT: vsetvli a1, zero, e16,m2,tu,mu
+; RV32-NEXT: vloxei32.v v10, (a0), v28, v0.t
+; RV32-NEXT: vmv2r.v v8, v10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8i8_nxv8i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 1
+; RV64-NEXT: vsetvli a1, zero, e16,m2,tu,mu
+; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t
+; RV64-NEXT: vmv2r.v v8, v10
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds i16, i16* %base, <vscale x 8 x i8> %idxs
+ %v = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> %ptrs, i32 2, <vscale x 8 x i1> %m, <vscale x 8 x i16> %passthru)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @mgather_baseidx_sext_nxv8i8_nxv8i16(i16* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i16> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_nxv8i8_nxv8i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf4 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 1
+; RV32-NEXT: vsetvli a1, zero, e16,m2,tu,mu
+; RV32-NEXT: vloxei32.v v10, (a0), v28, v0.t
+; RV32-NEXT: vmv2r.v v8, v10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_sext_nxv8i8_nxv8i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 1
+; RV64-NEXT: vsetvli a1, zero, e16,m2,tu,mu
+; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t
+; RV64-NEXT: vmv2r.v v8, v10
+; RV64-NEXT: ret
+ %eidxs = sext <vscale x 8 x i8> %idxs to <vscale x 8 x i16>
+ %ptrs = getelementptr inbounds i16, i16* %base, <vscale x 8 x i16> %eidxs
+ %v = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> %ptrs, i32 2, <vscale x 8 x i1> %m, <vscale x 8 x i16> %passthru)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @mgather_baseidx_zext_nxv8i8_nxv8i16(i16* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i16> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vzext.vf4 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 1
+; RV32-NEXT: vsetvli a1, zero, e16,m2,tu,mu
+; RV32-NEXT: vloxei32.v v10, (a0), v28, v0.t
+; RV32-NEXT: vmv2r.v v8, v10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vzext.vf8 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 1
+; RV64-NEXT: vsetvli a1, zero, e16,m2,tu,mu
+; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t
+; RV64-NEXT: vmv2r.v v8, v10
+; RV64-NEXT: ret
+ %eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i16>
+ %ptrs = getelementptr inbounds i16, i16* %base, <vscale x 8 x i16> %eidxs
+ %v = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> %ptrs, i32 2, <vscale x 8 x i1> %m, <vscale x 8 x i16> %passthru)
+ ret <vscale x 8 x i16> %v
+}
+
+; Masked gather of <vscale x 8 x i16> from i16* %base using i16 indices passed
+; straight to the getelementptr. The expected code sign-extends the indices
+; (vsext.vf2 on RV32, vsext.vf4 on RV64), shifts them left by 1, and issues a
+; masked indexed load whose result is then copied into v8.
+define <vscale x 8 x i16> @mgather_baseidx_nxv8i16(i16* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i16> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf2 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 1
+; RV32-NEXT: vsetvli a1, zero, e16,m2,tu,mu
+; RV32-NEXT: vloxei32.v v10, (a0), v28, v0.t
+; RV32-NEXT: vmv2r.v v8, v10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf4 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 1
+; RV64-NEXT: vsetvli a1, zero, e16,m2,tu,mu
+; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t
+; RV64-NEXT: vmv2r.v v8, v10
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds i16, i16* %base, <vscale x 8 x i16> %idxs
+ %v = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0i16(<vscale x 8 x i16*> %ptrs, i32 2, <vscale x 8 x i1> %m, <vscale x 8 x i16> %passthru)
+ ret <vscale x 8 x i16> %v
+}
+
+declare <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0i32(<vscale x 1 x i32*>, i32, <vscale x 1 x i1>, <vscale x 1 x i32>)
+
+; Masked gather of <vscale x 1 x i32> from a vector of pointers. The expected
+; code uses the pointer vector in v8 as the index operand with a zero base
+; (vloxei32 on RV32, vloxei64 on RV64) at e32,mf2, then copies the result to v8.
+define <vscale x 1 x i32> @mgather_nxv1i32(<vscale x 1 x i32*> %ptrs, <vscale x 1 x i1> %m, <vscale x 1 x i32> %passthru) {
+; RV32-LABEL: mgather_nxv1i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e32,mf2,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vmv1r.v v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv1i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e32,mf2,tu,mu
+; RV64-NEXT: vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT: vmv1r.v v8, v9
+; RV64-NEXT: ret
+ %v = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0i32(<vscale x 1 x i32*> %ptrs, i32 4, <vscale x 1 x i1> %m, <vscale x 1 x i32> %passthru)
+ ret <vscale x 1 x i32> %v
+}
+
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+
+; Masked gather of <vscale x 2 x i32> from a vector of pointers. The expected
+; code is a single masked indexed load at e32,m1 with a zero base (vloxei32 on
+; RV32, vloxei64 on RV64), followed by a register copy of the result into v8.
+define <vscale x 2 x i32> @mgather_nxv2i32(<vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x i32> %passthru) {
+; RV32-LABEL: mgather_nxv2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e32,m1,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vmv1r.v v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e32,m1,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vmv1r.v v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %m, <vscale x 2 x i32> %passthru)
+ ret <vscale x 2 x i32> %v
+}
+
+; Masked gather of <vscale x 2 x i32> whose result is sign-extended to i64 in
+; IR. The expected code keeps the two steps separate: a masked indexed load at
+; e32,m1 followed by vsext.vf2 at e64,m2.
+define <vscale x 2 x i64> @mgather_nxv2i32_sextload_nxv2i64(<vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x i32> %passthru) {
+; RV32-LABEL: mgather_nxv2i32_sextload_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e32,m1,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu
+; RV32-NEXT: vsext.vf2 v26, v9
+; RV32-NEXT: vmv2r.v v8, v26
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2i32_sextload_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e32,m1,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu
+; RV64-NEXT: vsext.vf2 v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %m, <vscale x 2 x i32> %passthru)
+ %ev = sext <vscale x 2 x i32> %v to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ev
+}
+
+; Masked gather of <vscale x 2 x i32> whose result is zero-extended to i64 in
+; IR. The expected code keeps the two steps separate: a masked indexed load at
+; e32,m1 followed by vzext.vf2 at e64,m2.
+define <vscale x 2 x i64> @mgather_nxv2i32_zextload_nxv2i64(<vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x i32> %passthru) {
+; RV32-LABEL: mgather_nxv2i32_zextload_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e32,m1,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu
+; RV32-NEXT: vzext.vf2 v26, v9
+; RV32-NEXT: vmv2r.v v8, v26
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2i32_zextload_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e32,m1,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu
+; RV64-NEXT: vzext.vf2 v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %m, <vscale x 2 x i32> %passthru)
+ %ev = zext <vscale x 2 x i32> %v to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ev
+}
+
+declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+
+; Masked gather of <vscale x 4 x i32> from a vector of pointers. The expected
+; code is a single masked indexed load at e32,m2 with a zero base (vloxei32 on
+; RV32, vloxei64 on RV64), followed by a register-group copy into v8.
+define <vscale x 4 x i32> @mgather_nxv4i32(<vscale x 4 x i32*> %ptrs, <vscale x 4 x i1> %m, <vscale x 4 x i32> %passthru) {
+; RV32-LABEL: mgather_nxv4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e32,m2,tu,mu
+; RV32-NEXT: vloxei32.v v10, (zero), v8, v0.t
+; RV32-NEXT: vmv2r.v v8, v10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e32,m2,tu,mu
+; RV64-NEXT: vloxei64.v v12, (zero), v8, v0.t
+; RV64-NEXT: vmv2r.v v8, v12
+; RV64-NEXT: ret
+ %v = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %m, <vscale x 4 x i32> %passthru)
+ ret <vscale x 4 x i32> %v
+}
+
+; Gather of <vscale x 4 x i32> with an all-true mask, built in IR as a splat of
+; i1 1 via insertelement + shufflevector. The expected code is an unmasked
+; indexed load (no v0.t operand) that writes its result directly to v8.
+define <vscale x 4 x i32> @mgather_truemask_nxv4i32(<vscale x 4 x i32*> %ptrs, <vscale x 4 x i32> %passthru) {
+; RV32-LABEL: mgather_truemask_nxv4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e32,m2,ta,mu
+; RV32-NEXT: vloxei32.v v8, (zero), v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_truemask_nxv4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e32,m2,ta,mu
+; RV64-NEXT: vloxei64.v v8, (zero), v8
+; RV64-NEXT: ret
+ %mhead = insertelement <vscale x 4 x i1> undef, i1 1, i32 0
+ %mtrue = shufflevector <vscale x 4 x i1> %mhead, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mtrue, <vscale x 4 x i32> %passthru)
+ ret <vscale x 4 x i32> %v
+}
+
+declare <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*>, i32, <vscale x 8 x i1>, <vscale x 8 x i32>)
+
+; Masked gather of <vscale x 8 x i32> from a vector of pointers. The expected
+; code is a single masked indexed load at e32,m4 with a zero base (vloxei32 on
+; RV32, vloxei64 on RV64), followed by a register-group copy into v8.
+define <vscale x 8 x i32> @mgather_nxv8i32(<vscale x 8 x i32*> %ptrs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
+; RV32-LABEL: mgather_nxv8i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e32,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (zero), v8, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv8i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e32,m4,tu,mu
+; RV64-NEXT: vloxei64.v v16, (zero), v8, v0.t
+; RV64-NEXT: vmv4r.v v8, v16
+; RV64-NEXT: ret
+ %v = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> %ptrs, i32 4, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru)
+ ret <vscale x 8 x i32> %v
+}
+
+; Masked gather of <vscale x 8 x i32> from i32* %base using i8 indices passed
+; straight to the getelementptr. The expected code sign-extends the indices
+; (vsext.vf4 on RV32, vsext.vf8 on RV64), shifts them left by 2, and issues a
+; masked indexed load whose result is then copied into v8.
+define <vscale x 8 x i32> @mgather_baseidx_nxv8i8_nxv8i32(i32* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8i8_nxv8i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf4 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 2
+; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8i8_nxv8i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 2
+; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds i32, i32* %base, <vscale x 8 x i8> %idxs
+ %v = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> %ptrs, i32 4, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru)
+ ret <vscale x 8 x i32> %v
+}
+
+; Masked gather of <vscale x 8 x i32> from i32* %base using i8 indices that the
+; IR sign-extends to i32 before the getelementptr. The expected code widens the
+; indices with vsext (vf4 on RV32, vf8 on RV64), shifts them left by 2, and
+; issues a masked indexed load whose result is then copied into v8.
+define <vscale x 8 x i32> @mgather_baseidx_sext_nxv8i8_nxv8i32(i32* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_nxv8i8_nxv8i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf4 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 2
+; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_sext_nxv8i8_nxv8i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 2
+; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %eidxs = sext <vscale x 8 x i8> %idxs to <vscale x 8 x i32>
+ %ptrs = getelementptr inbounds i32, i32* %base, <vscale x 8 x i32> %eidxs
+ %v = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> %ptrs, i32 4, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru)
+ ret <vscale x 8 x i32> %v
+}
+
+; Masked gather of <vscale x 8 x i32> from i32* %base using i8 indices that the
+; IR zero-extends to i32 before the getelementptr. The expected code widens the
+; indices with vzext (vf4 on RV32, vf8 on RV64), shifts them left by 2, and
+; issues a masked indexed load whose result is then copied into v8.
+define <vscale x 8 x i32> @mgather_baseidx_zext_nxv8i8_nxv8i32(i32* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vzext.vf4 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 2
+; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vzext.vf8 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 2
+; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i32>
+ %ptrs = getelementptr inbounds i32, i32* %base, <vscale x 8 x i32> %eidxs
+ %v = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> %ptrs, i32 4, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru)
+ ret <vscale x 8 x i32> %v
+}
+
+; Masked gather of <vscale x 8 x i32> from i32* %base using i16 indices passed
+; straight to the getelementptr. The expected code sign-extends the indices
+; (vsext.vf2 on RV32, vsext.vf4 on RV64), shifts them left by 2, and issues a
+; masked indexed load whose result is then copied into v8.
+define <vscale x 8 x i32> @mgather_baseidx_nxv8i16_nxv8i32(i32* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf2 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 2
+; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8i16_nxv8i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf4 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 2
+; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds i32, i32* %base, <vscale x 8 x i16> %idxs
+ %v = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> %ptrs, i32 4, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru)
+ ret <vscale x 8 x i32> %v
+}
+
+; Masked gather of <vscale x 8 x i32> from i32* %base using i16 indices that
+; the IR sign-extends to i32 before the getelementptr. The expected code widens
+; the indices with vsext (vf2 on RV32, vf4 on RV64), shifts them left by 2, and
+; issues a masked indexed load whose result is then copied into v8.
+define <vscale x 8 x i32> @mgather_baseidx_sext_nxv8i16_nxv8i32(i32* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf2 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 2
+; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_sext_nxv8i16_nxv8i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf4 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 2
+; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %eidxs = sext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
+ %ptrs = getelementptr inbounds i32, i32* %base, <vscale x 8 x i32> %eidxs
+ %v = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> %ptrs, i32 4, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru)
+ ret <vscale x 8 x i32> %v
+}
+
+; Masked gather of <vscale x 8 x i32> from i32* %base using i16 indices that
+; the IR zero-extends to i32 before the getelementptr. The expected code widens
+; the indices with vzext (vf2 on RV32, vf4 on RV64), shifts them left by 2, and
+; issues a masked indexed load whose result is then copied into v8.
+define <vscale x 8 x i32> @mgather_baseidx_zext_nxv8i16_nxv8i32(i32* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_nxv8i16_nxv8i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vzext.vf2 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 2
+; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_zext_nxv8i16_nxv8i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vzext.vf4 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 2
+; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
+ %ptrs = getelementptr inbounds i32, i32* %base, <vscale x 8 x i32> %eidxs
+ %v = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> %ptrs, i32 4, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru)
+ ret <vscale x 8 x i32> %v
+}
+
+; Masked gather of <vscale x 8 x i32> from i32* %base using i32 indices. On
+; RV32 the expected code shifts the indices left by 2 with no extension; on
+; RV64 it first sign-extends them with vsext.vf2. Both then issue a masked
+; indexed load whose result is copied into v8.
+define <vscale x 8 x i32> @mgather_baseidx_nxv8i32(i32* %base, <vscale x 8 x i32> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsll.vi v28, v8, 2
+; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf2 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 2
+; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds i32, i32* %base, <vscale x 8 x i32> %idxs
+ %v = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> %ptrs, i32 4, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru)
+ ret <vscale x 8 x i32> %v
+}
+
+declare <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*>, i32, <vscale x 1 x i1>, <vscale x 1 x i64>)
+
+; Masked gather of <vscale x 1 x i64> from a vector of pointers. The expected
+; code is a single masked indexed load at e64,m1 with a zero base (vloxei32 on
+; RV32, vloxei64 on RV64), followed by a register copy of the result into v8.
+define <vscale x 1 x i64> @mgather_nxv1i64(<vscale x 1 x i64*> %ptrs, <vscale x 1 x i1> %m, <vscale x 1 x i64> %passthru) {
+; RV32-LABEL: mgather_nxv1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e64,m1,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vmv1r.v v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64,m1,tu,mu
+; RV64-NEXT: vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT: vmv1r.v v8, v9
+; RV64-NEXT: ret
+ %v = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0i64(<vscale x 1 x i64*> %ptrs, i32 8, <vscale x 1 x i1> %m, <vscale x 1 x i64> %passthru)
+ ret <vscale x 1 x i64> %v
+}
+
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+
+; Masked gather of <vscale x 2 x i64> from a vector of pointers. The expected
+; code is a single masked indexed load at e64,m2 with a zero base (vloxei32 on
+; RV32, vloxei64 on RV64), followed by a register-group copy into v8.
+define <vscale x 2 x i64> @mgather_nxv2i64(<vscale x 2 x i64*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x i64> %passthru) {
+; RV32-LABEL: mgather_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e64,m2,tu,mu
+; RV32-NEXT: vloxei32.v v10, (zero), v8, v0.t
+; RV32-NEXT: vmv2r.v v8, v10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64,m2,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vmv2r.v v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %m, <vscale x 2 x i64> %passthru)
+ ret <vscale x 2 x i64> %v
+}
+
+declare <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0i64(<vscale x 4 x i64*>, i32, <vscale x 4 x i1>, <vscale x 4 x i64>)
+
+; Masked gather of <vscale x 4 x i64> from a vector of pointers. The expected
+; code is a single masked indexed load at e64,m4 with a zero base (vloxei32 on
+; RV32, vloxei64 on RV64), followed by a register-group copy into v8.
+define <vscale x 4 x i64> @mgather_nxv4i64(<vscale x 4 x i64*> %ptrs, <vscale x 4 x i1> %m, <vscale x 4 x i64> %passthru) {
+; RV32-LABEL: mgather_nxv4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e64,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (zero), v8, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64,m4,tu,mu
+; RV64-NEXT: vloxei64.v v12, (zero), v8, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %v = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0i64(<vscale x 4 x i64*> %ptrs, i32 8, <vscale x 4 x i1> %m, <vscale x 4 x i64> %passthru)
+ ret <vscale x 4 x i64> %v
+}
+
+; Gather of <vscale x 4 x i64> with an all-true mask, built in IR as a splat of
+; i1 1 via insertelement + shufflevector. The expected code is an unmasked
+; indexed load (no v0.t operand) that writes its result directly to v8.
+define <vscale x 4 x i64> @mgather_truemask_nxv4i64(<vscale x 4 x i64*> %ptrs, <vscale x 4 x i64> %passthru) {
+; RV32-LABEL: mgather_truemask_nxv4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e64,m4,ta,mu
+; RV32-NEXT: vloxei32.v v8, (zero), v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_truemask_nxv4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64,m4,ta,mu
+; RV64-NEXT: vloxei64.v v8, (zero), v8
+; RV64-NEXT: ret
+ %mhead = insertelement <vscale x 4 x i1> undef, i1 1, i32 0
+ %mtrue = shufflevector <vscale x 4 x i1> %mhead, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0i64(<vscale x 4 x i64*> %ptrs, i32 8, <vscale x 4 x i1> %mtrue, <vscale x 4 x i64> %passthru)
+ ret <vscale x 4 x i64> %v
+}
+
+declare <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0i64(<vscale x 8 x i64*>, i32, <vscale x 8 x i1>, <vscale x 8 x i64>)
+
+; Masked gather of <vscale x 8 x i64> from a vector of pointers. The expected
+; code is a single masked indexed load at e64,m8 with a zero base (vloxei32 on
+; RV32, vloxei64 on RV64), followed by a register-group copy into v8.
+define <vscale x 8 x i64> @mgather_nxv8i64(<vscale x 8 x i64*> %ptrs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
+; RV32-LABEL: mgather_nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei32.v v16, (zero), v8, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (zero), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %v = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0i64(<vscale x 8 x i64*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru)
+ ret <vscale x 8 x i64> %v
+}
+
+; Masked gather of <vscale x 8 x i64> from i64* %base using i8 indices passed
+; straight to the getelementptr. The expected code sign-extends the indices
+; (vsext.vf4 to e32 on RV32, vsext.vf8 to e64 on RV64), shifts them left by 3,
+; and issues a masked indexed load whose result is then copied into v8.
+define <vscale x 8 x i64> @mgather_baseidx_nxv8i8_nxv8i64(i64* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8i8_nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf4 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei32.v v16, (a0), v28, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8i8_nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds i64, i64* %base, <vscale x 8 x i8> %idxs
+ %v = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0i64(<vscale x 8 x i64*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru)
+ ret <vscale x 8 x i64> %v
+}
+
+; Masked gather of <vscale x 8 x i64> from i64* %base using i8 indices that the
+; IR sign-extends to i64 before the getelementptr. The expected code is the
+; same on both targets, including RV32, where the indices stay 64 bits wide:
+; vsext.vf8 to e64, shift left by 3, then a masked vloxei64 copied into v8.
+define <vscale x 8 x i64> @mgather_baseidx_sext_nxv8i8_nxv8i64(i64* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_nxv8i8_nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsext.vf8 v24, v8
+; RV32-NEXT: vsll.vi v8, v24, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_sext_nxv8i8_nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %eidxs = sext <vscale x 8 x i8> %idxs to <vscale x 8 x i64>
+ %ptrs = getelementptr inbounds i64, i64* %base, <vscale x 8 x i64> %eidxs
+ %v = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0i64(<vscale x 8 x i64*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru)
+ ret <vscale x 8 x i64> %v
+}
+
+; Masked gather of <vscale x 8 x i64> from i64* %base using i8 indices that the
+; IR zero-extends to i64 before the getelementptr. The expected code is the
+; same on both targets, including RV32, where the indices stay 64 bits wide:
+; vzext.vf8 to e64, shift left by 3, then a masked vloxei64 copied into v8.
+define <vscale x 8 x i64> @mgather_baseidx_zext_nxv8i8_nxv8i64(i64* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vzext.vf8 v24, v8
+; RV32-NEXT: vsll.vi v8, v24, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vzext.vf8 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i64>
+ %ptrs = getelementptr inbounds i64, i64* %base, <vscale x 8 x i64> %eidxs
+ %v = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0i64(<vscale x 8 x i64*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru)
+ ret <vscale x 8 x i64> %v
+}
+
+; Masked gather of <vscale x 8 x i64> from i64* %base using i16 indices passed
+; straight to the getelementptr. The expected code sign-extends the indices
+; (vsext.vf2 to e32 on RV32, vsext.vf4 to e64 on RV64), shifts them left by 3,
+; and issues a masked indexed load whose result is then copied into v8.
+define <vscale x 8 x i64> @mgather_baseidx_nxv8i16_nxv8i64(i64* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf2 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei32.v v16, (a0), v28, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8i16_nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf4 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds i64, i64* %base, <vscale x 8 x i16> %idxs
+ %v = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0i64(<vscale x 8 x i64*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru)
+ ret <vscale x 8 x i64> %v
+}
+
+; Masked gather of <vscale x 8 x i64> from i64* %base using i16 indices that
+; the IR sign-extends to i64 before the getelementptr. The expected code is the
+; same on both targets, including RV32, where the indices stay 64 bits wide:
+; vsext.vf4 to e64, shift left by 3, then a masked vloxei64 copied into v8.
+define <vscale x 8 x i64> @mgather_baseidx_sext_nxv8i16_nxv8i64(i64* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsext.vf4 v24, v8
+; RV32-NEXT: vsll.vi v8, v24, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_sext_nxv8i16_nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf4 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %eidxs = sext <vscale x 8 x i16> %idxs to <vscale x 8 x i64>
+ %ptrs = getelementptr inbounds i64, i64* %base, <vscale x 8 x i64> %eidxs
+ %v = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0i64(<vscale x 8 x i64*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru)
+ ret <vscale x 8 x i64> %v
+}
+
+; Masked gather of <vscale x 8 x i64> from i64* %base using i16 indices that
+; the IR zero-extends to i64 before the getelementptr. The expected code is the
+; same on both targets, including RV32, where the indices stay 64 bits wide:
+; vzext.vf4 to e64, shift left by 3, then a masked vloxei64 copied into v8.
+define <vscale x 8 x i64> @mgather_baseidx_zext_nxv8i16_nxv8i64(i64* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_nxv8i16_nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vzext.vf4 v24, v8
+; RV32-NEXT: vsll.vi v8, v24, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_zext_nxv8i16_nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vzext.vf4 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i64>
+ %ptrs = getelementptr inbounds i64, i64* %base, <vscale x 8 x i64> %eidxs
+ %v = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0i64(<vscale x 8 x i64*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru)
+ ret <vscale x 8 x i64> %v
+}
+
+; Masked gather of <vscale x 8 x i64> from i64* %base using i32 indices passed
+; straight to the getelementptr. On RV32 the expected code shifts the indices
+; left by 3 with no extension and uses vloxei32; on RV64 it sign-extends them
+; with vsext.vf2 first and uses vloxei64. The result is then copied into v8.
+define <vscale x 8 x i64> @mgather_baseidx_nxv8i32_nxv8i64(i64* %base, <vscale x 8 x i32> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8i32_nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsll.vi v28, v8, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei32.v v16, (a0), v28, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8i32_nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf2 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds i64, i64* %base, <vscale x 8 x i32> %idxs
+ %v = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0i64(<vscale x 8 x i64*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru)
+ ret <vscale x 8 x i64> %v
+}
+
+; Masked gather of <vscale x 8 x i64> from i64* %base using i32 indices that
+; the IR sign-extends to i64 before the getelementptr. The expected code is the
+; same on both targets, including RV32, where the indices stay 64 bits wide:
+; vsext.vf2 to e64, shift left by 3, then a masked vloxei64 copied into v8.
+define <vscale x 8 x i64> @mgather_baseidx_sext_nxv8i32_nxv8i64(i64* %base, <vscale x 8 x i32> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_nxv8i32_nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsext.vf2 v24, v8
+; RV32-NEXT: vsll.vi v8, v24, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_sext_nxv8i32_nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf2 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %eidxs = sext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
+ %ptrs = getelementptr inbounds i64, i64* %base, <vscale x 8 x i64> %eidxs
+ %v = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0i64(<vscale x 8 x i64*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru)
+ ret <vscale x 8 x i64> %v
+}
+
+; Masked gather of <vscale x 8 x i64> from i64* %base using i32 indices that
+; the IR zero-extends to i64 before the getelementptr. The expected code is the
+; same on both targets, including RV32, where the indices stay 64 bits wide:
+; vzext.vf2 to e64, shift left by 3, then a masked vloxei64 copied into v8.
+define <vscale x 8 x i64> @mgather_baseidx_zext_nxv8i32_nxv8i64(i64* %base, <vscale x 8 x i32> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_nxv8i32_nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vzext.vf2 v24, v8
+; RV32-NEXT: vsll.vi v8, v24, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_zext_nxv8i32_nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vzext.vf2 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %eidxs = zext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
+ %ptrs = getelementptr inbounds i64, i64* %base, <vscale x 8 x i64> %eidxs
+ %v = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0i64(<vscale x 8 x i64*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru)
+ ret <vscale x 8 x i64> %v
+}
+
+; Masked gather of <vscale x 8 x i64> from i64* %base using i64 indices. No
+; extension is needed: on both targets the expected code shifts the indices
+; left by 3 in place and issues a masked vloxei64, whose result is then copied
+; into v8.
+define <vscale x 8 x i64> @mgather_baseidx_nxv8i64(i64* %base, <vscale x 8 x i64> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsll.vi v8, v8, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsll.vi v8, v8, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds i64, i64* %base, <vscale x 8 x i64> %idxs
+ %v = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0i64(<vscale x 8 x i64*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru)
+ ret <vscale x 8 x i64> %v
+}
+
+; Intrinsic name suffixes spell out the overloaded types: result type first,
+; then the pointer-vector type (gather) or the subvector type (insert).
+declare <vscale x 16 x i64> @llvm.masked.gather.nxv16i64.nxv16p0i64(<vscale x 16 x i64*>, i32, <vscale x 16 x i1>, <vscale x 16 x i64>)
+
+declare <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv16i64.nxv8i64(<vscale x 16 x i64>, <vscale x 8 x i64>, i64 %idx)
+declare <vscale x 16 x i64*> @llvm.experimental.vector.insert.nxv16p0i64.nxv8p0i64(<vscale x 16 x i64*>, <vscale x 8 x i64*>, i64 %idx)
+
+; Masked gather of <vscale x 16 x i64>. The pointer and passthru operands are
+; assembled from two nxv8 halves with vector.insert, and the result is stored
+; to %out. The expected code performs two e64,m8 masked indexed loads; the mask
+; for the second one is produced by a vslidedown of v0 by vlenb >> 3, and the
+; second result is stored at %out plus that amount shifted left by 6. On RV64
+; one m8 register group is additionally spilled to and reloaded from the stack.
+define void @mgather_nxv16i64(<vscale x 8 x i64*> %ptrs0, <vscale x 8 x i64*> %ptrs1, <vscale x 16 x i1> %m, <vscale x 8 x i64> %passthru0, <vscale x 8 x i64> %passthru1, <vscale x 16 x i64>* %out) {
+; RV32-LABEL: mgather_nxv16i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vl8re64.v v24, (a0)
+; RV32-NEXT: vsetvli a0, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei32.v v16, (zero), v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: srli a0, a0, 3
+; RV32-NEXT: vsetvli a2, zero, e8,mf4,ta,mu
+; RV32-NEXT: vslidedown.vx v0, v0, a0
+; RV32-NEXT: vsetvli a2, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei32.v v24, (zero), v12, v0.t
+; RV32-NEXT: slli a0, a0, 6
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: vs8r.v v24, (a0)
+; RV32-NEXT: vs8r.v v16, (a1)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv16i64:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 3
+; RV64-NEXT: sub sp, sp, a3
+; RV64-NEXT: vl8re64.v v24, (a0)
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vmv8r.v v16, v8
+; RV64-NEXT: vl8re64.v v8, (a1)
+; RV64-NEXT: vsetvli a0, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v24, (zero), v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: srli a0, a0, 3
+; RV64-NEXT: vsetvli a1, zero, e8,mf4,ta,mu
+; RV64-NEXT: vslidedown.vx v0, v0, a0
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vloxei64.v v8, (zero), v16, v0.t
+; RV64-NEXT: slli a0, a0, 6
+; RV64-NEXT: add a0, a2, a0
+; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: vs8r.v v24, (a2)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+ %p0 = call <vscale x 16 x i64*> @llvm.experimental.vector.insert.nxv16p0i64.nxv8p0i64(<vscale x 16 x i64*> undef, <vscale x 8 x i64*> %ptrs0, i64 0)
+ %p1 = call <vscale x 16 x i64*> @llvm.experimental.vector.insert.nxv16p0i64.nxv8p0i64(<vscale x 16 x i64*> %p0, <vscale x 8 x i64*> %ptrs1, i64 8)
+
+ %pt0 = call <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv16i64.nxv8i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %passthru0, i64 0)
+ %pt1 = call <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv16i64.nxv8i64(<vscale x 16 x i64> %pt0, <vscale x 8 x i64> %passthru1, i64 8)
+
+ %v = call <vscale x 16 x i64> @llvm.masked.gather.nxv16i64.nxv16p0i64(<vscale x 16 x i64*> %p1, i32 8, <vscale x 16 x i1> %m, <vscale x 16 x i64> %pt1)
+ store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
+ ret void
+}
+
+
+declare <vscale x 1 x half> @llvm.masked.gather.nxv1f16.nxv1p0f16(<vscale x 1 x half*>, i32, <vscale x 1 x i1>, <vscale x 1 x half>)
+
+; Masked gather of <vscale x 1 x half> through a vector of pointers, with mask
+; %m and pass-through %passthru. The checks expect one masked indexed load with
+; a zero base (vloxei32.v on RV32, vloxei64.v on RV64) that uses v8 as the
+; index operand, followed by a whole-register move of the result into v8.
+define <vscale x 1 x half> @mgather_nxv1f16(<vscale x 1 x half*> %ptrs, <vscale x 1 x i1> %m, <vscale x 1 x half> %passthru) {
+; RV32-LABEL: mgather_nxv1f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16,mf4,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vmv1r.v v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv1f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16,mf4,tu,mu
+; RV64-NEXT: vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT: vmv1r.v v8, v9
+; RV64-NEXT: ret
+ %v = call <vscale x 1 x half> @llvm.masked.gather.nxv1f16.nxv1p0f16(<vscale x 1 x half*> %ptrs, i32 2, <vscale x 1 x i1> %m, <vscale x 1 x half> %passthru)
+ ret <vscale x 1 x half> %v
+}
+
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16.nxv2p0f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+
+; Masked gather of <vscale x 2 x half> through a vector of pointers, with mask
+; %m and pass-through %passthru. The checks expect one masked indexed load with
+; a zero base (vloxei32.v on RV32, vloxei64.v on RV64) at e16,mf2 that uses v8
+; as the index operand, followed by a vmv1r.v of the result into v8.
+define <vscale x 2 x half> @mgather_nxv2f16(<vscale x 2 x half*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x half> %passthru) {
+; RV32-LABEL: mgather_nxv2f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16,mf2,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vmv1r.v v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16,mf2,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vmv1r.v v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16.nxv2p0f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %m, <vscale x 2 x half> %passthru)
+ ret <vscale x 2 x half> %v
+}
+
+declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16.nxv4p0f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+
+; Masked gather of <vscale x 4 x half> through a vector of pointers, with mask
+; %m and pass-through %passthru. The checks expect one masked indexed load with
+; a zero base (vloxei32.v on RV32, vloxei64.v on RV64) at e16,m1 that uses v8
+; as the index operand, followed by a vmv1r.v of the result into v8.
+define <vscale x 4 x half> @mgather_nxv4f16(<vscale x 4 x half*> %ptrs, <vscale x 4 x i1> %m, <vscale x 4 x half> %passthru) {
+; RV32-LABEL: mgather_nxv4f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16,m1,tu,mu
+; RV32-NEXT: vloxei32.v v10, (zero), v8, v0.t
+; RV32-NEXT: vmv1r.v v8, v10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv4f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16,m1,tu,mu
+; RV64-NEXT: vloxei64.v v12, (zero), v8, v0.t
+; RV64-NEXT: vmv1r.v v8, v12
+; RV64-NEXT: ret
+ %v = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16.nxv4p0f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %m, <vscale x 4 x half> %passthru)
+ ret <vscale x 4 x half> %v
+}
+
+; Gather of <vscale x 4 x half> whose mask is a splat of i1 1, built with
+; insertelement + shufflevector. The checks expect an unmasked indexed load
+; (no v0.t operand) under a tail-agnostic vsetvli, written directly into v8
+; with no trailing register move.
+define <vscale x 4 x half> @mgather_truemask_nxv4f16(<vscale x 4 x half*> %ptrs, <vscale x 4 x half> %passthru) {
+; RV32-LABEL: mgather_truemask_nxv4f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16,m1,ta,mu
+; RV32-NEXT: vloxei32.v v8, (zero), v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_truemask_nxv4f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16,m1,ta,mu
+; RV64-NEXT: vloxei64.v v8, (zero), v8
+; RV64-NEXT: ret
+ %mhead = insertelement <vscale x 4 x i1> undef, i1 1, i32 0
+ %mtrue = shufflevector <vscale x 4 x i1> %mhead, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16.nxv4p0f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mtrue, <vscale x 4 x half> %passthru)
+ ret <vscale x 4 x half> %v
+}
+
+declare <vscale x 8 x half> @llvm.masked.gather.nxv8f16.nxv8p0f16(<vscale x 8 x half*>, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
+
+; Masked gather of <vscale x 8 x half> through a vector of pointers, with mask
+; %m and pass-through %passthru. The checks expect one masked indexed load with
+; a zero base (vloxei32.v on RV32, vloxei64.v on RV64) at e16,m2 that uses v8
+; as the index operand, followed by a vmv2r.v of the result into v8.
+define <vscale x 8 x half> @mgather_nxv8f16(<vscale x 8 x half*> %ptrs, <vscale x 8 x i1> %m, <vscale x 8 x half> %passthru) {
+; RV32-LABEL: mgather_nxv8f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e16,m2,tu,mu
+; RV32-NEXT: vloxei32.v v12, (zero), v8, v0.t
+; RV32-NEXT: vmv2r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv8f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e16,m2,tu,mu
+; RV64-NEXT: vloxei64.v v16, (zero), v8, v0.t
+; RV64-NEXT: vmv2r.v v8, v16
+; RV64-NEXT: ret
+ %v = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16.nxv8p0f16(<vscale x 8 x half*> %ptrs, i32 2, <vscale x 8 x i1> %m, <vscale x 8 x half> %passthru)
+ ret <vscale x 8 x half> %v
+}
+
+; Masked gather of <vscale x 8 x half> addressed by a GEP of scalar %base with
+; i8 indices. The checks expect the indices to be sign-extended to the index
+; width of the load (vsext.vf4 to e32 on RV32, vsext.vf8 to e64 on RV64),
+; shifted left by 1 (the 2-byte half element size), and then used by a masked
+; vloxei with a0 as the base.
+define <vscale x 8 x half> @mgather_baseidx_nxv8i8_nxv8f16(half* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x half> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8i8_nxv8f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf4 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 1
+; RV32-NEXT: vsetvli a1, zero, e16,m2,tu,mu
+; RV32-NEXT: vloxei32.v v10, (a0), v28, v0.t
+; RV32-NEXT: vmv2r.v v8, v10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8i8_nxv8f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 1
+; RV64-NEXT: vsetvli a1, zero, e16,m2,tu,mu
+; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t
+; RV64-NEXT: vmv2r.v v8, v10
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds half, half* %base, <vscale x 8 x i8> %idxs
+ %v = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16.nxv8p0f16(<vscale x 8 x half*> %ptrs, i32 2, <vscale x 8 x i1> %m, <vscale x 8 x half> %passthru)
+ ret <vscale x 8 x half> %v
+}
+
+; Masked gather of <vscale x 8 x half> addressed by a GEP of scalar %base whose
+; i8 indices are sign-extended to i16 in the IR first. The checks expect a
+; single vsext straight from the i8 vector (vf4 to e32 on RV32, vf8 to e64 on
+; RV64), a left shift by 1, and a masked vloxei with a0 as the base.
+define <vscale x 8 x half> @mgather_baseidx_sext_nxv8i8_nxv8f16(half* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x half> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_nxv8i8_nxv8f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf4 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 1
+; RV32-NEXT: vsetvli a1, zero, e16,m2,tu,mu
+; RV32-NEXT: vloxei32.v v10, (a0), v28, v0.t
+; RV32-NEXT: vmv2r.v v8, v10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_sext_nxv8i8_nxv8f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 1
+; RV64-NEXT: vsetvli a1, zero, e16,m2,tu,mu
+; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t
+; RV64-NEXT: vmv2r.v v8, v10
+; RV64-NEXT: ret
+ %eidxs = sext <vscale x 8 x i8> %idxs to <vscale x 8 x i16>
+ %ptrs = getelementptr inbounds half, half* %base, <vscale x 8 x i16> %eidxs
+ %v = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16.nxv8p0f16(<vscale x 8 x half*> %ptrs, i32 2, <vscale x 8 x i1> %m, <vscale x 8 x half> %passthru)
+ ret <vscale x 8 x half> %v
+}
+
+; Masked gather of <vscale x 8 x half> addressed by a GEP of scalar %base whose
+; i8 indices are zero-extended to i16 in the IR first. The checks expect a
+; single vzext straight from the i8 vector (vf4 to e32 on RV32, vf8 to e64 on
+; RV64), a left shift by 1, and a masked vloxei with a0 as the base.
+define <vscale x 8 x half> @mgather_baseidx_zext_nxv8i8_nxv8f16(half* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x half> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vzext.vf4 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 1
+; RV32-NEXT: vsetvli a1, zero, e16,m2,tu,mu
+; RV32-NEXT: vloxei32.v v10, (a0), v28, v0.t
+; RV32-NEXT: vmv2r.v v8, v10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vzext.vf8 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 1
+; RV64-NEXT: vsetvli a1, zero, e16,m2,tu,mu
+; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t
+; RV64-NEXT: vmv2r.v v8, v10
+; RV64-NEXT: ret
+ %eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i16>
+ %ptrs = getelementptr inbounds half, half* %base, <vscale x 8 x i16> %eidxs
+ %v = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16.nxv8p0f16(<vscale x 8 x half*> %ptrs, i32 2, <vscale x 8 x i1> %m, <vscale x 8 x half> %passthru)
+ ret <vscale x 8 x half> %v
+}
+
+; Masked gather of <vscale x 8 x half> addressed by a GEP of scalar %base with
+; i16 indices (same width as the element type). The checks expect the indices
+; to be sign-extended (vsext.vf2 to e32 on RV32, vsext.vf4 to e64 on RV64),
+; shifted left by 1, and then used by a masked vloxei with a0 as the base.
+define <vscale x 8 x half> @mgather_baseidx_nxv8f16(half* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x half> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf2 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 1
+; RV32-NEXT: vsetvli a1, zero, e16,m2,tu,mu
+; RV32-NEXT: vloxei32.v v10, (a0), v28, v0.t
+; RV32-NEXT: vmv2r.v v8, v10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf4 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 1
+; RV64-NEXT: vsetvli a1, zero, e16,m2,tu,mu
+; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t
+; RV64-NEXT: vmv2r.v v8, v10
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds half, half* %base, <vscale x 8 x i16> %idxs
+ %v = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16.nxv8p0f16(<vscale x 8 x half*> %ptrs, i32 2, <vscale x 8 x i1> %m, <vscale x 8 x half> %passthru)
+ ret <vscale x 8 x half> %v
+}
+
+declare <vscale x 1 x float> @llvm.masked.gather.nxv1f32.nxv1p0f32(<vscale x 1 x float*>, i32, <vscale x 1 x i1>, <vscale x 1 x float>)
+
+; Masked gather of <vscale x 1 x float> through a vector of pointers, with mask
+; %m and pass-through %passthru. The checks expect one masked indexed load with
+; a zero base (vloxei32.v on RV32, vloxei64.v on RV64) at e32,mf2 that uses v8
+; as the index operand, followed by a vmv1r.v of the result into v8.
+define <vscale x 1 x float> @mgather_nxv1f32(<vscale x 1 x float*> %ptrs, <vscale x 1 x i1> %m, <vscale x 1 x float> %passthru) {
+; RV32-LABEL: mgather_nxv1f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e32,mf2,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vmv1r.v v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv1f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e32,mf2,tu,mu
+; RV64-NEXT: vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT: vmv1r.v v8, v9
+; RV64-NEXT: ret
+ %v = call <vscale x 1 x float> @llvm.masked.gather.nxv1f32.nxv1p0f32(<vscale x 1 x float*> %ptrs, i32 4, <vscale x 1 x i1> %m, <vscale x 1 x float> %passthru)
+ ret <vscale x 1 x float> %v
+}
+
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+
+; Masked gather of <vscale x 2 x float> through a vector of pointers, with mask
+; %m and pass-through %passthru. The checks expect one masked indexed load with
+; a zero base (vloxei32.v on RV32, vloxei64.v on RV64) at e32,m1 that uses v8
+; as the index operand, followed by a vmv1r.v of the result into v8.
+define <vscale x 2 x float> @mgather_nxv2f32(<vscale x 2 x float*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x float> %passthru) {
+; RV32-LABEL: mgather_nxv2f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e32,m1,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vmv1r.v v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e32,m1,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vmv1r.v v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %m, <vscale x 2 x float> %passthru)
+ ret <vscale x 2 x float> %v
+}
+
+declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
+
+; Masked gather of <vscale x 4 x float> through a vector of pointers, with mask
+; %m and pass-through %passthru. The checks expect one masked indexed load with
+; a zero base (vloxei32.v on RV32, vloxei64.v on RV64) at e32,m2 that uses v8
+; as the index operand, followed by a vmv2r.v of the result into v8.
+define <vscale x 4 x float> @mgather_nxv4f32(<vscale x 4 x float*> %ptrs, <vscale x 4 x i1> %m, <vscale x 4 x float> %passthru) {
+; RV32-LABEL: mgather_nxv4f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e32,m2,tu,mu
+; RV32-NEXT: vloxei32.v v10, (zero), v8, v0.t
+; RV32-NEXT: vmv2r.v v8, v10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv4f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e32,m2,tu,mu
+; RV64-NEXT: vloxei64.v v12, (zero), v8, v0.t
+; RV64-NEXT: vmv2r.v v8, v12
+; RV64-NEXT: ret
+ %v = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %m, <vscale x 4 x float> %passthru)
+ ret <vscale x 4 x float> %v
+}
+
+; Gather of <vscale x 4 x float> whose mask is a splat of i1 1, built with
+; insertelement + shufflevector. The checks expect an unmasked indexed load
+; (no v0.t operand) under a tail-agnostic vsetvli, written directly into v8
+; with no trailing register move.
+define <vscale x 4 x float> @mgather_truemask_nxv4f32(<vscale x 4 x float*> %ptrs, <vscale x 4 x float> %passthru) {
+; RV32-LABEL: mgather_truemask_nxv4f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e32,m2,ta,mu
+; RV32-NEXT: vloxei32.v v8, (zero), v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_truemask_nxv4f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e32,m2,ta,mu
+; RV64-NEXT: vloxei64.v v8, (zero), v8
+; RV64-NEXT: ret
+ %mhead = insertelement <vscale x 4 x i1> undef, i1 1, i32 0
+ %mtrue = shufflevector <vscale x 4 x i1> %mhead, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mtrue, <vscale x 4 x float> %passthru)
+ ret <vscale x 4 x float> %v
+}
+
+declare <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*>, i32, <vscale x 8 x i1>, <vscale x 8 x float>)
+
+; Masked gather of <vscale x 8 x float> through a vector of pointers, with mask
+; %m and pass-through %passthru. The checks expect one masked indexed load with
+; a zero base (vloxei32.v on RV32, vloxei64.v on RV64) at e32,m4 that uses v8
+; as the index operand, followed by a vmv4r.v of the result into v8.
+define <vscale x 8 x float> @mgather_nxv8f32(<vscale x 8 x float*> %ptrs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
+; RV32-LABEL: mgather_nxv8f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e32,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (zero), v8, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv8f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e32,m4,tu,mu
+; RV64-NEXT: vloxei64.v v16, (zero), v8, v0.t
+; RV64-NEXT: vmv4r.v v8, v16
+; RV64-NEXT: ret
+ %v = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> %ptrs, i32 4, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru)
+ ret <vscale x 8 x float> %v
+}
+
+; Masked gather of <vscale x 8 x float> addressed by a GEP of scalar %base with
+; i8 indices. The checks expect the indices to be sign-extended to the index
+; width of the load (vsext.vf4 to e32 on RV32, vsext.vf8 to e64 on RV64),
+; shifted left by 2 (the 4-byte float element size), and then used by a masked
+; vloxei with a0 as the base.
+define <vscale x 8 x float> @mgather_baseidx_nxv8i8_nxv8f32(float* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8i8_nxv8f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf4 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 2
+; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8i8_nxv8f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 2
+; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds float, float* %base, <vscale x 8 x i8> %idxs
+ %v = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> %ptrs, i32 4, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru)
+ ret <vscale x 8 x float> %v
+}
+
+; Masked gather of <vscale x 8 x float> addressed by a GEP of scalar %base whose
+; i8 indices are sign-extended to i32 in the IR first. The checks expect a
+; single vsext straight from the i8 vector (vf4 to e32 on RV32, vf8 to e64 on
+; RV64), a left shift by 2, and a masked vloxei with a0 as the base.
+define <vscale x 8 x float> @mgather_baseidx_sext_nxv8i8_nxv8f32(float* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_nxv8i8_nxv8f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf4 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 2
+; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_sext_nxv8i8_nxv8f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 2
+; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %eidxs = sext <vscale x 8 x i8> %idxs to <vscale x 8 x i32>
+ %ptrs = getelementptr inbounds float, float* %base, <vscale x 8 x i32> %eidxs
+ %v = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> %ptrs, i32 4, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru)
+ ret <vscale x 8 x float> %v
+}
+
+; Masked gather of <vscale x 8 x float> addressed by a GEP of scalar %base whose
+; i8 indices are zero-extended to i32 in the IR first. The checks expect a
+; single vzext straight from the i8 vector (vf4 to e32 on RV32, vf8 to e64 on
+; RV64), a left shift by 2, and a masked vloxei with a0 as the base.
+define <vscale x 8 x float> @mgather_baseidx_zext_nxv8i8_nxv8f32(float* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vzext.vf4 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 2
+; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vzext.vf8 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 2
+; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i32>
+ %ptrs = getelementptr inbounds float, float* %base, <vscale x 8 x i32> %eidxs
+ %v = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> %ptrs, i32 4, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru)
+ ret <vscale x 8 x float> %v
+}
+
+; Masked gather of <vscale x 8 x float> addressed by a GEP of scalar %base with
+; i16 indices. The checks expect the indices to be sign-extended to the index
+; width of the load (vsext.vf2 to e32 on RV32, vsext.vf4 to e64 on RV64),
+; shifted left by 2, and then used by a masked vloxei with a0 as the base.
+define <vscale x 8 x float> @mgather_baseidx_nxv8i16_nxv8f32(float* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf2 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 2
+; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8i16_nxv8f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf4 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 2
+; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds float, float* %base, <vscale x 8 x i16> %idxs
+ %v = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> %ptrs, i32 4, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru)
+ ret <vscale x 8 x float> %v
+}
+
+; Masked gather of <vscale x 8 x float> addressed by a GEP of scalar %base whose
+; i16 indices are sign-extended to i32 in the IR first. The checks expect a
+; single vsext straight from the i16 vector (vf2 to e32 on RV32, vf4 to e64 on
+; RV64), a left shift by 2, and a masked vloxei with a0 as the base.
+define <vscale x 8 x float> @mgather_baseidx_sext_nxv8i16_nxv8f32(float* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf2 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 2
+; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_sext_nxv8i16_nxv8f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf4 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 2
+; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %eidxs = sext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
+ %ptrs = getelementptr inbounds float, float* %base, <vscale x 8 x i32> %eidxs
+ %v = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> %ptrs, i32 4, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru)
+ ret <vscale x 8 x float> %v
+}
+
+; Masked gather of <vscale x 8 x float> addressed by a GEP of scalar %base whose
+; i16 indices are zero-extended to i32 in the IR first. The checks expect a
+; single vzext straight from the i16 vector (vf2 to e32 on RV32, vf4 to e64 on
+; RV64), a left shift by 2, and a masked vloxei with a0 as the base.
+define <vscale x 8 x float> @mgather_baseidx_zext_nxv8i16_nxv8f32(float* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_nxv8i16_nxv8f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vzext.vf2 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 2
+; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_zext_nxv8i16_nxv8f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vzext.vf4 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 2
+; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
+ %ptrs = getelementptr inbounds float, float* %base, <vscale x 8 x i32> %eidxs
+ %v = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> %ptrs, i32 4, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru)
+ ret <vscale x 8 x float> %v
+}
+
+; Masked gather of <vscale x 8 x float> addressed by a GEP of scalar %base with
+; i32 indices (same width as the element type). On RV32 the checks expect no
+; extension, only a left shift by 2 at e32 before a masked vloxei32.v; on RV64
+; they expect vsext.vf2 to e64, the same shift, and a masked vloxei64.v. Both
+; use a0 as the base.
+define <vscale x 8 x float> @mgather_baseidx_nxv8f32(float* %base, <vscale x 8 x i32> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsll.vi v28, v8, 2
+; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf2 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 2
+; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu
+; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds float, float* %base, <vscale x 8 x i32> %idxs
+ %v = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0f32(<vscale x 8 x float*> %ptrs, i32 4, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru)
+ ret <vscale x 8 x float> %v
+}
+
+declare <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0f64(<vscale x 1 x double*>, i32, <vscale x 1 x i1>, <vscale x 1 x double>)
+
+; Masked gather of <vscale x 1 x double> through a vector of pointers, with mask
+; %m and pass-through %passthru. The checks expect one masked indexed load with
+; a zero base (vloxei32.v on RV32, vloxei64.v on RV64) at e64,m1 that uses v8
+; as the index operand, followed by a vmv1r.v of the result into v8.
+define <vscale x 1 x double> @mgather_nxv1f64(<vscale x 1 x double*> %ptrs, <vscale x 1 x i1> %m, <vscale x 1 x double> %passthru) {
+; RV32-LABEL: mgather_nxv1f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e64,m1,tu,mu
+; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t
+; RV32-NEXT: vmv1r.v v8, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv1f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64,m1,tu,mu
+; RV64-NEXT: vloxei64.v v9, (zero), v8, v0.t
+; RV64-NEXT: vmv1r.v v8, v9
+; RV64-NEXT: ret
+ %v = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0f64(<vscale x 1 x double*> %ptrs, i32 8, <vscale x 1 x i1> %m, <vscale x 1 x double> %passthru)
+ ret <vscale x 1 x double> %v
+}
+
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+
+; Masked gather of <vscale x 2 x double> through a vector of pointers, with mask
+; %m and pass-through %passthru. The checks expect one masked indexed load with
+; a zero base (vloxei32.v on RV32, vloxei64.v on RV64) at e64,m2 that uses v8
+; as the index operand, followed by a vmv2r.v of the result into v8.
+define <vscale x 2 x double> @mgather_nxv2f64(<vscale x 2 x double*> %ptrs, <vscale x 2 x i1> %m, <vscale x 2 x double> %passthru) {
+; RV32-LABEL: mgather_nxv2f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e64,m2,tu,mu
+; RV32-NEXT: vloxei32.v v10, (zero), v8, v0.t
+; RV32-NEXT: vmv2r.v v8, v10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv2f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64,m2,tu,mu
+; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t
+; RV64-NEXT: vmv2r.v v8, v10
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %m, <vscale x 2 x double> %passthru)
+ ret <vscale x 2 x double> %v
+}
+
+declare <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*>, i32, <vscale x 4 x i1>, <vscale x 4 x double>)
+
+; Masked gather of <vscale x 4 x double> through a vector of pointers, with mask
+; %m and pass-through %passthru. The checks expect one masked indexed load with
+; a zero base (vloxei32.v on RV32, vloxei64.v on RV64) at e64,m4 that uses v8
+; as the index operand, followed by a vmv4r.v of the result into v8.
+define <vscale x 4 x double> @mgather_nxv4f64(<vscale x 4 x double*> %ptrs, <vscale x 4 x i1> %m, <vscale x 4 x double> %passthru) {
+; RV32-LABEL: mgather_nxv4f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e64,m4,tu,mu
+; RV32-NEXT: vloxei32.v v12, (zero), v8, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv4f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64,m4,tu,mu
+; RV64-NEXT: vloxei64.v v12, (zero), v8, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %v = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> %ptrs, i32 8, <vscale x 4 x i1> %m, <vscale x 4 x double> %passthru)
+ ret <vscale x 4 x double> %v
+}
+
+; Gather of <vscale x 4 x double> whose mask is a splat of i1 1, built with
+; insertelement + shufflevector. The checks expect an unmasked indexed load
+; (no v0.t operand) under a tail-agnostic vsetvli, written directly into v8
+; with no trailing register move.
+define <vscale x 4 x double> @mgather_truemask_nxv4f64(<vscale x 4 x double*> %ptrs, <vscale x 4 x double> %passthru) {
+; RV32-LABEL: mgather_truemask_nxv4f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e64,m4,ta,mu
+; RV32-NEXT: vloxei32.v v8, (zero), v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_truemask_nxv4f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64,m4,ta,mu
+; RV64-NEXT: vloxei64.v v8, (zero), v8
+; RV64-NEXT: ret
+ %mhead = insertelement <vscale x 4 x i1> undef, i1 1, i32 0
+ %mtrue = shufflevector <vscale x 4 x i1> %mhead, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> %ptrs, i32 8, <vscale x 4 x i1> %mtrue, <vscale x 4 x double> %passthru)
+ ret <vscale x 4 x double> %v
+}
+
+declare <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0f64(<vscale x 8 x double*>, i32, <vscale x 8 x i1>, <vscale x 8 x double>)
+
+; Masked gather of <vscale x 8 x double> through a vector of pointers, with mask
+; %m and pass-through %passthru. The checks expect one masked indexed load with
+; a zero base (vloxei32.v on RV32, vloxei64.v on RV64) at e64,m8 that uses v8
+; as the index operand, followed by a vmv8r.v of the result into v8.
+define <vscale x 8 x double> @mgather_nxv8f64(<vscale x 8 x double*> %ptrs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
+; RV32-LABEL: mgather_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei32.v v16, (zero), v8, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (zero), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %v = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0f64(<vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru)
+ ret <vscale x 8 x double> %v
+}
+
+; Masked gather of <vscale x 8 x double> addressed by a GEP of scalar %base with
+; i8 indices. The checks expect the indices to be sign-extended to the index
+; width of the load (vsext.vf4 to e32 on RV32, vsext.vf8 to e64 on RV64),
+; shifted left by 3 (the 8-byte double element size), and then used by a
+; masked vloxei with a0 as the base.
+define <vscale x 8 x double> @mgather_baseidx_nxv8i8_nxv8f64(double* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8i8_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf4 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei32.v v16, (a0), v28, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8i8_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i8> %idxs
+ %v = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0f64(<vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru)
+ ret <vscale x 8 x double> %v
+}
+
+; Masked gather of <vscale x 8 x double> addressed by a GEP of scalar %base
+; whose i8 indices are sign-extended to i64 in the IR first. The checks for
+; both RV32 and RV64 expect vsext.vf8 to e64, a left shift by 3, and a masked
+; vloxei64.v with a0 as the base (i.e. 64-bit indices are kept on RV32 too).
+define <vscale x 8 x double> @mgather_baseidx_sext_nxv8i8_nxv8f64(double* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_nxv8i8_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsext.vf8 v24, v8
+; RV32-NEXT: vsll.vi v8, v24, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_sext_nxv8i8_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %eidxs = sext <vscale x 8 x i8> %idxs to <vscale x 8 x i64>
+ %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i64> %eidxs
+ %v = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0f64(<vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru)
+ ret <vscale x 8 x double> %v
+}
+
+; Masked gather of <vscale x 8 x double> addressed by a GEP of scalar %base
+; whose i8 indices are zero-extended to i64 in the IR first. The checks for
+; both RV32 and RV64 expect vzext.vf8 to e64, a left shift by 3, and a masked
+; vloxei64.v with a0 as the base (i.e. 64-bit indices are kept on RV32 too).
+define <vscale x 8 x double> @mgather_baseidx_zext_nxv8i8_nxv8f64(double* %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vzext.vf8 v24, v8
+; RV32-NEXT: vsll.vi v8, v24, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vzext.vf8 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i64>
+ %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i64> %eidxs
+ %v = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0f64(<vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru)
+ ret <vscale x 8 x double> %v
+}
+
+; Masked gather of <vscale x 8 x double> addressed by a GEP of scalar %base with
+; i16 indices. The checks expect the indices to be sign-extended to the index
+; width of the load (vsext.vf2 to e32 on RV32, vsext.vf4 to e64 on RV64),
+; shifted left by 3, and then used by a masked vloxei with a0 as the base.
+define <vscale x 8 x double> @mgather_baseidx_nxv8i16_nxv8f64(double* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsext.vf2 v28, v8
+; RV32-NEXT: vsll.vi v28, v28, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei32.v v16, (a0), v28, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8i16_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf4 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i16> %idxs
+ %v = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0f64(<vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru)
+ ret <vscale x 8 x double> %v
+}
+
+; Masked gather of <vscale x 8 x double> addressed by a GEP of scalar %base
+; whose i16 indices are sign-extended to i64 in the IR first. The checks for
+; both RV32 and RV64 expect vsext.vf4 to e64, a left shift by 3, and a masked
+; vloxei64.v with a0 as the base (i.e. 64-bit indices are kept on RV32 too).
+define <vscale x 8 x double> @mgather_baseidx_sext_nxv8i16_nxv8f64(double* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsext.vf4 v24, v8
+; RV32-NEXT: vsll.vi v8, v24, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_sext_nxv8i16_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf4 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %eidxs = sext <vscale x 8 x i16> %idxs to <vscale x 8 x i64>
+ %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i64> %eidxs
+ %v = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0f64(<vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru)
+ ret <vscale x 8 x double> %v
+}
+
+; Masked gather of <vscale x 8 x double> addressed by a GEP of scalar %base
+; whose i16 indices are zero-extended to i64 in the IR first. The checks for
+; both RV32 and RV64 expect vzext.vf4 to e64, a left shift by 3, and a masked
+; vloxei64.v with a0 as the base (i.e. 64-bit indices are kept on RV32 too).
+define <vscale x 8 x double> @mgather_baseidx_zext_nxv8i16_nxv8f64(double* %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_nxv8i16_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vzext.vf4 v24, v8
+; RV32-NEXT: vsll.vi v8, v24, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_zext_nxv8i16_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vzext.vf4 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i64>
+ %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i64> %eidxs
+ %v = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0f64(<vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru)
+ ret <vscale x 8 x double> %v
+}
+
+; Masked gather of <vscale x 8 x double> addressed by a GEP of scalar %base with
+; i32 indices. On RV32 the checks expect no extension, only a left shift by 3
+; at e32 before a masked vloxei32.v; on RV64 they expect vsext.vf2 to e64, the
+; same shift, and a masked vloxei64.v. Both use a0 as the base.
+define <vscale x 8 x double> @mgather_baseidx_nxv8i32_nxv8f64(double* %base, <vscale x 8 x i32> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8i32_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu
+; RV32-NEXT: vsll.vi v28, v8, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei32.v v16, (a0), v28, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8i32_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf2 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i32> %idxs
+ %v = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0f64(<vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru)
+ ret <vscale x 8 x double> %v
+}
+
+; Masked gather of doubles addressed by i32 indices that the IR sign-extends to
+; i64 before the GEP. Both targets are expected to widen with vsext.vf2, shift
+; left by 3, and load with vloxei64.v; note that the expected 32-bit output
+; keeps the 64-bit index vector here rather than using vloxei32.v.
+define <vscale x 8 x double> @mgather_baseidx_sext_nxv8i32_nxv8f64(double* %base, <vscale x 8 x i32> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_sext_nxv8i32_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsext.vf2 v24, v8
+; RV32-NEXT: vsll.vi v8, v24, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_sext_nxv8i32_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf2 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %eidxs = sext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
+ %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i64> %eidxs
+ %v = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0f64(<vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru)
+ ret <vscale x 8 x double> %v
+}
+
+; Masked gather of doubles addressed by i32 indices that the IR zero-extends to
+; i64 before the GEP. Both targets are expected to widen with vzext.vf2, shift
+; left by 3, and load with a single masked vloxei64.v.
+define <vscale x 8 x double> @mgather_baseidx_zext_nxv8i32_nxv8f64(double* %base, <vscale x 8 x i32> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_zext_nxv8i32_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vzext.vf2 v24, v8
+; RV32-NEXT: vsll.vi v8, v24, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_zext_nxv8i32_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vzext.vf2 v24, v8
+; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %eidxs = zext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
+ %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i64> %eidxs
+ %v = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0f64(<vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru)
+ ret <vscale x 8 x double> %v
+}
+
+; Masked gather of doubles addressed by indices that are already i64. No
+; extension is expected on either target: the indices are only shifted left by
+; 3 in place and then used by a single masked vloxei64.v.
+define <vscale x 8 x double> @mgather_baseidx_nxv8f64(double* %base, <vscale x 8 x i64> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv8f64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV32-NEXT: vsll.vi v8, v8, 3
+; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV32-NEXT: vmv8r.v v8, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv8f64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsll.vi v8, v8, 3
+; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu
+; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: vmv8r.v v8, v16
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds double, double* %base, <vscale x 8 x i64> %idxs
+ %v = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0f64(<vscale x 8 x double*> %ptrs, i32 8, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru)
+ ret <vscale x 8 x double> %v
+}
+
+declare <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0i8(<vscale x 16 x i8*>, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
+
+; Masked gather of i8 elements addressed by i8 indices; no scaling shift is
+; expected. The expected 32-bit output sign-extends all indices to e32
+; (vsext.vf4) and needs only one vloxei32.v. The expected 64-bit output extends
+; to e64 (vsext.vf8) one index register at a time (v8, then v9), performs two
+; vloxei64.v loads into v10 and v11, and slides the mask in v0 down by
+; vlenb >> 3 before the second load.
+; NOTE(review): the alignment operand passed to the intrinsic is 2 although the
+; element type is i8 - confirm this is intentional.
+define <vscale x 16 x i8> @mgather_baseidx_nxv16i8(i8* %base, <vscale x 16 x i8> %idxs, <vscale x 16 x i1> %m, <vscale x 16 x i8> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv16i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m8,ta,mu
+; RV32-NEXT: vsext.vf4 v16, v8
+; RV32-NEXT: vsetvli a1, zero, e8,m2,tu,mu
+; RV32-NEXT: vloxei32.v v10, (a0), v16, v0.t
+; RV32-NEXT: vmv2r.v v8, v10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv16i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v16, v8
+; RV64-NEXT: vsetvli a1, zero, e8,m1,tu,mu
+; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: srli a1, a1, 3
+; RV64-NEXT: vsetvli a2, zero, e8,mf4,ta,mu
+; RV64-NEXT: vslidedown.vx v0, v0, a1
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v16, v9
+; RV64-NEXT: vsetvli a1, zero, e8,m1,tu,mu
+; RV64-NEXT: vloxei64.v v11, (a0), v16, v0.t
+; RV64-NEXT: vmv2r.v v8, v10
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds i8, i8* %base, <vscale x 16 x i8> %idxs
+ %v = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0i8(<vscale x 16 x i8*> %ptrs, i32 2, <vscale x 16 x i1> %m, <vscale x 16 x i8> %passthru)
+ ret <vscale x 16 x i8> %v
+}
+
+declare <vscale x 32 x i8> @llvm.masked.gather.nxv32i8.nxv32p0i8(<vscale x 32 x i8*>, i32, <vscale x 32 x i1>, <vscale x 32 x i8>)
+
+; Masked gather of i8 elements addressed by i8 indices; no scaling shift is
+; expected. The expected 32-bit output splits the operation in two: indices
+; from v8 and v10 are sign-extended to e32 (vsext.vf4) and loaded with two
+; vloxei32.v into v12 and v14, with the mask in v0 slid down by vlenb >> 2 in
+; between. The expected 64-bit output splits it in four: indices from v8, v9,
+; v11 and v10 (in that order) are sign-extended to e64 (vsext.vf8) and loaded
+; with four vloxei64.v into v12, v13, v15 and v14, using mask pieces produced
+; by vslidedown.vx from v0 and from a copy of it kept in v26.
+; NOTE(review): the alignment operand passed to the intrinsic is 2 although the
+; element type is i8 - confirm this is intentional.
+define <vscale x 32 x i8> @mgather_baseidx_nxv32i8(i8* %base, <vscale x 32 x i8> %idxs, <vscale x 32 x i1> %m, <vscale x 32 x i8> %passthru) {
+; RV32-LABEL: mgather_baseidx_nxv32i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32,m8,ta,mu
+; RV32-NEXT: vsext.vf4 v16, v8
+; RV32-NEXT: vsetvli a1, zero, e8,m2,tu,mu
+; RV32-NEXT: vloxei32.v v12, (a0), v16, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: vsetvli a2, zero, e8,mf2,ta,mu
+; RV32-NEXT: vslidedown.vx v0, v0, a1
+; RV32-NEXT: vsetvli a1, zero, e32,m8,ta,mu
+; RV32-NEXT: vsext.vf4 v16, v10
+; RV32-NEXT: vsetvli a1, zero, e8,m2,tu,mu
+; RV32-NEXT: vloxei32.v v14, (a0), v16, v0.t
+; RV32-NEXT: vmv4r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mgather_baseidx_nxv32i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v16, v8
+; RV64-NEXT: vsetvli a1, zero, e8,m1,tu,mu
+; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: srli a1, a1, 3
+; RV64-NEXT: vsetvli a2, zero, e8,mf4,ta,mu
+; RV64-NEXT: vslidedown.vx v25, v0, a1
+; RV64-NEXT: vmv1r.v v26, v0
+; RV64-NEXT: vsetvli a2, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v16, v9
+; RV64-NEXT: vsetvli a2, zero, e8,m1,tu,mu
+; RV64-NEXT: vmv1r.v v0, v25
+; RV64-NEXT: vloxei64.v v13, (a0), v16, v0.t
+; RV64-NEXT: slli a2, a1, 1
+; RV64-NEXT: vsetvli a3, zero, e8,mf2,ta,mu
+; RV64-NEXT: vslidedown.vx v26, v26, a2
+; RV64-NEXT: vsetvli a2, zero, e8,mf4,ta,mu
+; RV64-NEXT: vslidedown.vx v0, v26, a1
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v16, v11
+; RV64-NEXT: vsetvli a1, zero, e8,m1,tu,mu
+; RV64-NEXT: vloxei64.v v15, (a0), v16, v0.t
+; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu
+; RV64-NEXT: vsext.vf8 v16, v10
+; RV64-NEXT: vsetvli a1, zero, e8,m1,tu,mu
+; RV64-NEXT: vmv1r.v v0, v26
+; RV64-NEXT: vloxei64.v v14, (a0), v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v12
+; RV64-NEXT: ret
+ %ptrs = getelementptr inbounds i8, i8* %base, <vscale x 32 x i8> %idxs
+ %v = call <vscale x 32 x i8> @llvm.masked.gather.nxv32i8.nxv32p0i8(<vscale x 32 x i8*> %ptrs, i32 2, <vscale x 32 x i1> %m, <vscale x 32 x i8> %passthru)
+ ret <vscale x 32 x i8> %v
+}
More information about the llvm-commits
mailing list