[llvm] 4ea734e - [RISCV] Unify scalable- and fixed-vector INSERT_SUBVECTOR lowering
Fraser Cormack via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 1 03:45:07 PST 2021
Author: Fraser Cormack
Date: 2021-03-01T11:38:47Z
New Revision: 4ea734e6ec9da0587da733424fe616b7e401cf8c
URL: https://github.com/llvm/llvm-project/commit/4ea734e6ec9da0587da733424fe616b7e401cf8c
DIFF: https://github.com/llvm/llvm-project/commit/4ea734e6ec9da0587da733424fe616b7e401cf8c.diff
LOG: [RISCV] Unify scalable- and fixed-vector INSERT_SUBVECTOR lowering
This patch unifies the two disparate paths for lowering INSERT_SUBVECTOR
operations under one roof. Consequently, it is now possible to
support any fixed-length subvector insertion, not just "cast-like" ones.
As before, support for the insertion of mask vectors will come in a
separate patch.
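
For illustration, a minimal IR sketch of the kind of insertion this enables
(the function name insert_at_2 is illustrative; it mirrors the
insert_nxv8i32_v2i32_2 case in the new test below, where the subvector is
instead loaded from memory):

  declare <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32>, <2 x i32>, i64)

  define <vscale x 8 x i32> @insert_at_2(<vscale x 8 x i32> %vec, <2 x i32> %sv) {
    ; A fixed <2 x i32> is inserted at element index 2 of a scalable
    ; <vscale x 8 x i32>; previously only "cast-like" insertions (index 0 into
    ; an undef vector) of fixed-length subvectors were supported.
    ; With this patch it lowers to a vslideup: slide amount = 2 (the insert
    ; index) and VL = 2 + 2 = 4 (index plus subvector length), with a
    ; tail-undisturbed vtype so the remaining elements of %vec are preserved.
    %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 2)
    ret <vscale x 8 x i32> %v
  }
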
Reviewed By: craig.topper
Differential Revision: https://reviews.llvm.org/D97543
Added:
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
Modified:
llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 1fe026a1fedd..552495cc864c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -945,68 +945,53 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
auto Idx = Node->getConstantOperandVal(2);
MVT SubVecVT = SubV.getSimpleValueType();
- // TODO: This method of selecting INSERT_SUBVECTOR should work
- // with any type of insertion (fixed <-> scalable) but we don't yet
- // correctly identify the canonical register class for fixed-length types.
- // For now, keep the two paths separate.
- if (VT.isScalableVector() && SubVecVT.isScalableVector()) {
- const auto *TRI = Subtarget->getRegisterInfo();
- unsigned SubRegIdx;
- std::tie(SubRegIdx, Idx) =
- RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
- VT, SubVecVT, Idx, TRI);
-
- // If the Idx hasn't been completely eliminated then this is a subvector
- // insert which doesn't naturally align to a vector register. These must
- // be handled using instructions to manipulate the vector registers.
- if (Idx != 0)
- break;
-
- RISCVVLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
- bool IsSubVecPartReg = SubVecLMUL == RISCVVLMUL::LMUL_F2 ||
- SubVecLMUL == RISCVVLMUL::LMUL_F4 ||
- SubVecLMUL == RISCVVLMUL::LMUL_F8;
- (void)IsSubVecPartReg; // Silence unused variable warning without asserts.
- assert((!IsSubVecPartReg || V.isUndef()) &&
- "Expecting lowering to have created legal INSERT_SUBVECTORs when "
- "the subvector is smaller than a full-sized register");
-
- // If we haven't set a SubRegIdx, then we must be going between LMUL<=1
- // types (VR -> VR). This can be done as a copy.
- if (SubRegIdx == RISCV::NoSubRegister) {
- unsigned InRegClassID = RISCVTargetLowering::getRegClassIDForVecVT(VT);
- assert(RISCVTargetLowering::getRegClassIDForVecVT(SubVecVT) ==
- RISCV::VRRegClassID &&
- InRegClassID == RISCV::VRRegClassID &&
- "Unexpected subvector extraction");
- SDValue RC = CurDAG->getTargetConstant(InRegClassID, DL, XLenVT);
- SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
- DL, VT, SubV, RC);
- return ReplaceNode(Node, NewNode);
- }
+ MVT SubVecContainerVT = SubVecVT;
+ // Establish the correct scalable-vector types for any fixed-length type.
+ if (SubVecVT.isFixedLengthVector())
+ SubVecContainerVT = RISCVTargetLowering::getContainerForFixedLengthVector(
+ *CurDAG, SubVecVT, *Subtarget);
+ if (VT.isFixedLengthVector())
+ VT = RISCVTargetLowering::getContainerForFixedLengthVector(*CurDAG, VT,
+ *Subtarget);
- SDNode *NewNode = CurDAG->getMachineNode(
- TargetOpcode::INSERT_SUBREG, DL, VT, V, SubV,
- CurDAG->getTargetConstant(SubRegIdx, DL, XLenVT));
- return ReplaceNode(Node, NewNode);
- }
+ const auto *TRI = Subtarget->getRegisterInfo();
+ unsigned SubRegIdx;
+ std::tie(SubRegIdx, Idx) =
+ RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+ VT, SubVecContainerVT, Idx, TRI);
- if (VT.isScalableVector() && SubVecVT.isFixedLengthVector()) {
- // Bail when not a "cast" like insert_subvector.
- if (Idx != 0)
- break;
- if (!Node->getOperand(0).isUndef())
- break;
+ // If the Idx hasn't been completely eliminated then this is a subvector
+ // insert which doesn't naturally align to a vector register. These must
+ // be handled using instructions to manipulate the vector registers.
+ if (Idx != 0)
+ break;
- unsigned RegClassID = RISCVTargetLowering::getRegClassIDForVecVT(VT);
+ RISCVVLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
+ bool IsSubVecPartReg = SubVecLMUL == RISCVVLMUL::LMUL_F2 ||
+ SubVecLMUL == RISCVVLMUL::LMUL_F4 ||
+ SubVecLMUL == RISCVVLMUL::LMUL_F8;
+ (void)IsSubVecPartReg; // Silence unused variable warning without asserts.
+ assert((!IsSubVecPartReg || V.isUndef()) &&
+ "Expecting lowering to have created legal INSERT_SUBVECTORs when "
+ "the subvector is smaller than a full-sized register");
- SDValue RC = CurDAG->getTargetConstant(RegClassID, DL, XLenVT);
+ // If we haven't set a SubRegIdx, then we must be going between
+ // equally-sized LMUL groups (e.g. VR -> VR). This can be done as a copy.
+ if (SubRegIdx == RISCV::NoSubRegister) {
+ unsigned InRegClassID = RISCVTargetLowering::getRegClassIDForVecVT(VT);
+ assert(RISCVTargetLowering::getRegClassIDForVecVT(SubVecContainerVT) ==
+ InRegClassID &&
+ "Unexpected subvector extraction");
+ SDValue RC = CurDAG->getTargetConstant(InRegClassID, DL, XLenVT);
SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
DL, VT, SubV, RC);
- ReplaceNode(Node, NewNode);
- return;
+ return ReplaceNode(Node, NewNode);
}
- break;
+
+ SDNode *NewNode = CurDAG->getMachineNode(
+ TargetOpcode::INSERT_SUBREG, DL, VT, V, SubV,
+ CurDAG->getTargetConstant(SubRegIdx, DL, XLenVT));
+ return ReplaceNode(Node, NewNode);
}
case ISD::EXTRACT_SUBVECTOR: {
SDValue V = Node->getOperand(0);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 197d8189b1ef..b3bacef0b083 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -531,6 +531,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setTruncStoreAction(VT, OtherVT, Expand);
// We use EXTRACT_SUBVECTOR as a "cast" from scalable to fixed.
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
@@ -602,6 +603,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
// We use EXTRACT_SUBVECTOR as a "cast" from scalable to fixed.
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
@@ -2436,15 +2438,42 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
MVT VecVT = Vec.getSimpleValueType();
MVT SubVecVT = SubVec.getSimpleValueType();
- // TODO: Only handle scalable->scalable inserts for now, and revisit this for
- // fixed-length vectors later.
- if (!SubVecVT.isScalableVector() || !VecVT.isScalableVector())
- return Op;
-
SDLoc DL(Op);
+ MVT XLenVT = Subtarget.getXLenVT();
unsigned OrigIdx = Op.getConstantOperandVal(2);
const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ // If the subvector is a fixed-length type, we cannot use subregister
+ // manipulation to simplify the codegen; we don't know which register of a
+ // LMUL group contains the specific subvector as we only know the minimum
+ // register size. Therefore we must slide the vector group up the full
+ // amount.
+ if (SubVecVT.isFixedLengthVector()) {
+ if (OrigIdx == 0 && Vec.isUndef())
+ return Op;
+ MVT ContainerVT = VecVT;
+ if (VecVT.isFixedLengthVector()) {
+ ContainerVT = RISCVTargetLowering::getContainerForFixedLengthVector(
+ DAG, VecVT, Subtarget);
+ Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+ }
+ SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), SubVec,
+ DAG.getConstant(0, DL, XLenVT));
+ SDValue Mask =
+ getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget).first;
+ // Set the vector length to only the number of elements we care about. Note
+ // that for slideup this includes the offset.
+ SDValue VL =
+ DAG.getConstant(OrigIdx + SubVecVT.getVectorNumElements(), DL, XLenVT);
+ SDValue SlideupAmt = DAG.getConstant(OrigIdx, DL, XLenVT);
+ SDValue Slideup = DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, ContainerVT, Vec,
+ SubVec, SlideupAmt, Mask, VL);
+ if (!VecVT.isFixedLengthVector())
+ return Slideup;
+ return convertFromScalableVector(VecVT, Slideup, DAG, Subtarget);
+ }
+
unsigned SubRegIdx, RemIdx;
std::tie(SubRegIdx, RemIdx) =
RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
@@ -2455,11 +2484,11 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
SubVecLMUL == RISCVVLMUL::LMUL_F4 ||
SubVecLMUL == RISCVVLMUL::LMUL_F8;
- // If the Idx has been completely eliminated and this subvector's size is a
- // vector register or a multiple thereof, or the surrounding elements are
+ // 1. If the Idx has been completely eliminated and this subvector's size is
+ // a vector register or a multiple thereof, or the surrounding elements are
// undef, then this is a subvector insert which naturally aligns to a vector
// register. These can easily be handled using subregister manipulation.
- // If the subvector is smaller than a vector register, then the insertion
+ // 2. If the subvector is smaller than a vector register, then the insertion
// must preserve the undisturbed elements of the register. We do this by
// lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type
// (which resolves to a subregister copy), performing a VSLIDEUP to place the
@@ -2475,7 +2504,6 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
// (in our case undisturbed). This means we can set up a subvector insertion
// where OFFSET is the insertion offset, and the VL is the OFFSET plus the
// size of the subvector.
- MVT XLenVT = Subtarget.getXLenVT();
MVT InterSubVT = getLMUL1VT(VecVT);
// Extract a subvector equal to the nearest full vector register type. This
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
new file mode 100644
index 000000000000..aa43d6006bb4
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -0,0 +1,331 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2
+; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1
+
+define <vscale x 8 x i32> @insert_nxv8i32_v2i32_0(<vscale x 8 x i32> %vec, <2 x i32>* %svp) {
+; CHECK-LABEL: insert_nxv8i32_v2i32_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli a1, 2, e32,m1,ta,mu
+; CHECK-NEXT: vle32.v v28, (a0)
+; CHECK-NEXT: vsetivli a0, 2, e32,m4,tu,mu
+; CHECK-NEXT: vslideup.vi v8, v28, 0
+; CHECK-NEXT: ret
+ %sv = load <2 x i32>, <2 x i32>* %svp
+ %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 0)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, <2 x i32>* %svp) {
+; CHECK-LABEL: insert_nxv8i32_v2i32_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli a1, 2, e32,m1,ta,mu
+; CHECK-NEXT: vle32.v v28, (a0)
+; CHECK-NEXT: vsetivli a0, 4, e32,m4,tu,mu
+; CHECK-NEXT: vslideup.vi v8, v28, 2
+; CHECK-NEXT: ret
+ %sv = load <2 x i32>, <2 x i32>* %svp
+ %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 2)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, <2 x i32>* %svp) {
+; CHECK-LABEL: insert_nxv8i32_v2i32_6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli a1, 2, e32,m1,ta,mu
+; CHECK-NEXT: vle32.v v28, (a0)
+; CHECK-NEXT: vsetivli a0, 8, e32,m4,tu,mu
+; CHECK-NEXT: vslideup.vi v8, v28, 6
+; CHECK-NEXT: ret
+ %sv = load <2 x i32>, <2 x i32>* %svp
+ %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 6)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, <8 x i32>* %svp) {
+; LMULMAX2-LABEL: insert_nxv8i32_v8i32_0:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT: vle32.v v28, (a0)
+; LMULMAX2-NEXT: vsetivli a0, 8, e32,m4,tu,mu
+; LMULMAX2-NEXT: vslideup.vi v8, v28, 0
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-LABEL: insert_nxv8i32_v8i32_0:
+; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT: vle32.v v28, (a0)
+; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: vle32.v v12, (a0)
+; LMULMAX1-NEXT: vsetivli a0, 4, e32,m4,tu,mu
+; LMULMAX1-NEXT: vslideup.vi v8, v28, 0
+; LMULMAX1-NEXT: vsetivli a0, 8, e32,m4,tu,mu
+; LMULMAX1-NEXT: vslideup.vi v8, v12, 4
+; LMULMAX1-NEXT: ret
+ %sv = load <8 x i32>, <8 x i32>* %svp
+ %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 0)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @insert_nxv8i32_v8i32_4(<vscale x 8 x i32> %vec, <8 x i32>* %svp) {
+; LMULMAX2-LABEL: insert_nxv8i32_v8i32_4:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT: vle32.v v28, (a0)
+; LMULMAX2-NEXT: vsetivli a0, 12, e32,m4,tu,mu
+; LMULMAX2-NEXT: vslideup.vi v8, v28, 4
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-LABEL: insert_nxv8i32_v8i32_4:
+; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT: vle32.v v28, (a0)
+; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: vle32.v v12, (a0)
+; LMULMAX1-NEXT: vsetivli a0, 8, e32,m4,tu,mu
+; LMULMAX1-NEXT: vslideup.vi v8, v28, 4
+; LMULMAX1-NEXT: vsetivli a0, 12, e32,m4,tu,mu
+; LMULMAX1-NEXT: vslideup.vi v8, v12, 8
+; LMULMAX1-NEXT: ret
+ %sv = load <8 x i32>, <8 x i32>* %svp
+ %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 4)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @insert_nxv8i32_v8i32_8(<vscale x 8 x i32> %vec, <8 x i32>* %svp) {
+; LMULMAX2-LABEL: insert_nxv8i32_v8i32_8:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT: vle32.v v28, (a0)
+; LMULMAX2-NEXT: vsetivli a0, 16, e32,m4,tu,mu
+; LMULMAX2-NEXT: vslideup.vi v8, v28, 8
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-LABEL: insert_nxv8i32_v8i32_8:
+; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT: vle32.v v28, (a0)
+; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: vle32.v v12, (a0)
+; LMULMAX1-NEXT: vsetivli a0, 12, e32,m4,tu,mu
+; LMULMAX1-NEXT: vslideup.vi v8, v28, 8
+; LMULMAX1-NEXT: vsetivli a0, 16, e32,m4,tu,mu
+; LMULMAX1-NEXT: vslideup.vi v8, v12, 12
+; LMULMAX1-NEXT: ret
+ %sv = load <8 x i32>, <8 x i32>* %svp
+ %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 8)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @insert_nxv8i32_undef_v2i32_0(<2 x i32>* %svp) {
+; CHECK-LABEL: insert_nxv8i32_undef_v2i32_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli a1, 2, e32,m1,ta,mu
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: ret
+ %sv = load <2 x i32>, <2 x i32>* %svp
+ %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> undef, <2 x i32> %sv, i64 0)
+ ret <vscale x 8 x i32> %v
+}
+
+define void @insert_v4i32_v2i32_0(<4 x i32>* %vp, <2 x i32>* %svp) {
+; CHECK-LABEL: insert_v4i32_v2i32_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli a2, 2, e32,m1,ta,mu
+; CHECK-NEXT: vle32.v v25, (a1)
+; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu
+; CHECK-NEXT: vle32.v v26, (a0)
+; CHECK-NEXT: vsetivli a1, 2, e32,m1,tu,mu
+; CHECK-NEXT: vslideup.vi v26, v25, 0
+; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu
+; CHECK-NEXT: vse32.v v26, (a0)
+; CHECK-NEXT: ret
+ %sv = load <2 x i32>, <2 x i32>* %svp
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %v = call <4 x i32> @llvm.experimental.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 0)
+ store <4 x i32> %v, <4 x i32>* %vp
+ ret void
+}
+
+define void @insert_v4i32_v2i32_2(<4 x i32>* %vp, <2 x i32>* %svp) {
+; CHECK-LABEL: insert_v4i32_v2i32_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli a2, 2, e32,m1,ta,mu
+; CHECK-NEXT: vle32.v v25, (a1)
+; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu
+; CHECK-NEXT: vle32.v v26, (a0)
+; CHECK-NEXT: vsetivli a1, 4, e32,m1,tu,mu
+; CHECK-NEXT: vslideup.vi v26, v25, 2
+; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu
+; CHECK-NEXT: vse32.v v26, (a0)
+; CHECK-NEXT: ret
+ %sv = load <2 x i32>, <2 x i32>* %svp
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %v = call <4 x i32> @llvm.experimental.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 2)
+ store <4 x i32> %v, <4 x i32>* %vp
+ ret void
+}
+
+define void @insert_v4i32_undef_v2i32_0(<4 x i32>* %vp, <2 x i32>* %svp) {
+; CHECK-LABEL: insert_v4i32_undef_v2i32_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli a2, 2, e32,m1,ta,mu
+; CHECK-NEXT: vle32.v v25, (a1)
+; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu
+; CHECK-NEXT: vmv.v.i v26, 0
+; CHECK-NEXT: vsetivli a1, 2, e32,m1,tu,mu
+; CHECK-NEXT: vslideup.vi v26, v25, 0
+; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu
+; CHECK-NEXT: vse32.v v26, (a0)
+; CHECK-NEXT: ret
+ %sv = load <2 x i32>, <2 x i32>* %svp
+ %v = call <4 x i32> @llvm.experimental.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %sv, i64 0)
+ store <4 x i32> %v, <4 x i32>* %vp
+ ret void
+}
+
+define void @insert_v8i32_v2i32_0(<8 x i32>* %vp, <2 x i32>* %svp) {
+; LMULMAX2-LABEL: insert_v8i32_v2i32_0:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX2-NEXT: vle32.v v26, (a1)
+; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT: vle32.v v28, (a0)
+; LMULMAX2-NEXT: vsetivli a1, 2, e32,m2,tu,mu
+; LMULMAX2-NEXT: vslideup.vi v28, v26, 0
+; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT: vse32.v v28, (a0)
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-LABEL: insert_v8i32_v2i32_0:
+; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX1-NEXT: vle32.v v25, (a1)
+; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT: vle32.v v26, (a0)
+; LMULMAX1-NEXT: vsetivli a1, 2, e32,m1,tu,mu
+; LMULMAX1-NEXT: vslideup.vi v26, v25, 0
+; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT: vse32.v v26, (a0)
+; LMULMAX1-NEXT: ret
+ %sv = load <2 x i32>, <2 x i32>* %svp
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %v = call <8 x i32> @llvm.experimental.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 0)
+ store <8 x i32> %v, <8 x i32>* %vp
+ ret void
+}
+
+define void @insert_v8i32_v2i32_2(<8 x i32>* %vp, <2 x i32>* %svp) {
+; LMULMAX2-LABEL: insert_v8i32_v2i32_2:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX2-NEXT: vle32.v v26, (a1)
+; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT: vle32.v v28, (a0)
+; LMULMAX2-NEXT: vsetivli a1, 4, e32,m2,tu,mu
+; LMULMAX2-NEXT: vslideup.vi v28, v26, 2
+; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT: vse32.v v28, (a0)
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-LABEL: insert_v8i32_v2i32_2:
+; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: addi sp, sp, -32
+; LMULMAX1-NEXT: .cfi_def_cfa_offset 32
+; LMULMAX1-NEXT: vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX1-NEXT: vle32.v v25, (a1)
+; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT: vle32.v v26, (a0)
+; LMULMAX1-NEXT: vse32.v v26, (sp)
+; LMULMAX1-NEXT: addi a1, sp, 8
+; LMULMAX1-NEXT: vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX1-NEXT: vse32.v v25, (a1)
+; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT: vle32.v v25, (sp)
+; LMULMAX1-NEXT: vse32.v v25, (a0)
+; LMULMAX1-NEXT: addi sp, sp, 32
+; LMULMAX1-NEXT: ret
+ %sv = load <2 x i32>, <2 x i32>* %svp
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %v = call <8 x i32> @llvm.experimental.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 2)
+ store <8 x i32> %v, <8 x i32>* %vp
+ ret void
+}
+
+define void @insert_v8i32_v2i32_6(<8 x i32>* %vp, <2 x i32>* %svp) {
+; LMULMAX2-LABEL: insert_v8i32_v2i32_6:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX2-NEXT: vle32.v v26, (a1)
+; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT: vle32.v v28, (a0)
+; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,tu,mu
+; LMULMAX2-NEXT: vslideup.vi v28, v26, 6
+; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT: vse32.v v28, (a0)
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-LABEL: insert_v8i32_v2i32_6:
+; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: addi sp, sp, -32
+; LMULMAX1-NEXT: .cfi_def_cfa_offset 32
+; LMULMAX1-NEXT: vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX1-NEXT: vle32.v v25, (a1)
+; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT: vle32.v v26, (a0)
+; LMULMAX1-NEXT: addi a1, sp, 16
+; LMULMAX1-NEXT: vse32.v v26, (a1)
+; LMULMAX1-NEXT: addi a2, sp, 24
+; LMULMAX1-NEXT: vsetivli a3, 2, e32,m1,ta,mu
+; LMULMAX1-NEXT: vse32.v v25, (a2)
+; LMULMAX1-NEXT: vsetivli a2, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT: vle32.v v25, (a1)
+; LMULMAX1-NEXT: vse32.v v25, (a0)
+; LMULMAX1-NEXT: addi sp, sp, 32
+; LMULMAX1-NEXT: ret
+ %sv = load <2 x i32>, <2 x i32>* %svp
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %v = call <8 x i32> @llvm.experimental.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 6)
+ store <8 x i32> %v, <8 x i32>* %vp
+ ret void
+}
+
+define void @insert_v8i32_undef_v2i32_6(<8 x i32>* %vp, <2 x i32>* %svp) {
+; LMULMAX2-LABEL: insert_v8i32_undef_v2i32_6:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX2-NEXT: vle32.v v26, (a1)
+; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT: vslideup.vi v28, v26, 6
+; LMULMAX2-NEXT: vse32.v v28, (a0)
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-LABEL: insert_v8i32_undef_v2i32_6:
+; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: addi sp, sp, -32
+; LMULMAX1-NEXT: .cfi_def_cfa_offset 32
+; LMULMAX1-NEXT: vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX1-NEXT: vle32.v v25, (a1)
+; LMULMAX1-NEXT: addi a1, sp, 24
+; LMULMAX1-NEXT: vse32.v v25, (a1)
+; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT: vle32.v v25, (sp)
+; LMULMAX1-NEXT: addi a1, sp, 16
+; LMULMAX1-NEXT: vle32.v v26, (a1)
+; LMULMAX1-NEXT: vse32.v v25, (a0)
+; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: vse32.v v26, (a0)
+; LMULMAX1-NEXT: addi sp, sp, 32
+; LMULMAX1-NEXT: ret
+ %sv = load <2 x i32>, <2 x i32>* %svp
+ %v = call <8 x i32> @llvm.experimental.vector.insert.v2i32.v8i32(<8 x i32> undef, <2 x i32> %sv, i64 6)
+ store <8 x i32> %v, <8 x i32>* %vp
+ ret void
+}
+
+declare <4 x i32> @llvm.experimental.vector.insert.v2i32.v4i32(<4 x i32>, <2 x i32>, i64)
+declare <8 x i32> @llvm.experimental.vector.insert.v2i32.v8i32(<8 x i32>, <2 x i32>, i64)
+
+declare <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32>, <2 x i32>, i64)
+declare <vscale x 8 x i32> @llvm.experimental.vector.insert.v4i32.nxv8i32(<vscale x 8 x i32>, <4 x i32>, i64)
+declare <vscale x 8 x i32> @llvm.experimental.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32>, <8 x i32>, i64)
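
Tying the new ISD::INSERT_SUBVECTOR setOperationAction hooks to a concrete
fixed-into-fixed case, here is a minimal IR sketch (the function name
insert_fixed_at_2 is illustrative; it mirrors insert_v4i32_v2i32_2 above,
where the operands are loaded from and stored to memory):

  declare <4 x i32> @llvm.experimental.vector.insert.v2i32.v4i32(<4 x i32>, <2 x i32>, i64)

  define <4 x i32> @insert_fixed_at_2(<4 x i32> %vec, <2 x i32> %sv) {
    ; Both fixed-length operands are converted to their scalable container
    ; type, the subvector is slid up by the insert index (2) with a
    ; tail-undisturbed vtype and VL = 2 + 2 = 4, and the result is converted
    ; back to <4 x i32> (see the vslideup.vi by 2 in the checks above).
    %v = call <4 x i32> @llvm.experimental.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 2)
    ret <4 x i32> %v
  }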