[llvm] [RISCV] Lower SEW<=32 vector_deinterleave(2) via vunzip2{a, b} (PR #136463)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 19 16:37:47 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-risc-v
Author: Philip Reames (preames)
This is a continuation of 22d5890c and adds the necessary logic to handle SEW!=64 profitably. The interesting case is a single m1 value that has been split via extract_subvector into the two operands; we need to recognize that and form a single m1 operation on the original source, instead of letting the vslidedown-by-vlenb/constant sequence be generated. This is analogous to the getSingleShuffleSrc handling for vnsrl, and we can share a bunch of code.
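
As a concrete sketch of that interesting case (the function name below is invented; the type and intrinsic mirror the nxv4f16 test in vector-deinterleave.ll): the two fractional (mf2) results come from the low and high halves of one m1-sized source, so the ZIP lowering can emit a single ri.vunzip2a.vv/ri.vunzip2b.vv pair at m1 instead of a vslidedown.vx by vlenb to split the input.

```llvm
; Deinterleave an m1-sized <vscale x 4 x half> into two fractional (mf2)
; <vscale x 2 x half> halves. During lowering, both VECTOR_DEINTERLEAVE
; operands are extract_subvectors of %vec, which FoldConcatVector folds
; back into the single m1 source.
define {<vscale x 2 x half>, <vscale x 2 x half>} @deinterleave_mf2(<vscale x 4 x half> %vec) {
  %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
  ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
}
```

The updated nxv2f16/nxv2bf16 ZIP check lines in the test diff below show exactly this pattern.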
---
Patch is 29.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/136463.diff
2 Files Affected:
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+45-18)
- (modified) llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll (+384-184)
``````````diff
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 98fba9e86e88a..7cf0d2db42ba1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4569,12 +4569,13 @@ static SDValue lowerScalarInsert(SDValue Scalar, SDValue VL, MVT VT,
VL);
}
-// Can this shuffle be performed on exactly one (possibly larger) input?
-static SDValue getSingleShuffleSrc(MVT VT, SDValue V1, SDValue V2) {
-
- if (V2.isUndef())
- return V1;
-
+/// If concat_vector(V1,V2) could be folded away to some existing
+/// vector source, return it. Note that the source may be larger
+/// than the requested concat_vector (i.e. an extract_subvector
+/// might be required).
+static SDValue FoldConcatVector(SDValue V1, SDValue V2) {
+ EVT VT = V1.getValueType();
+  assert(VT == V2.getValueType() && "precondition");
// Both inputs must be extracts.
if (V1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
V2.getOpcode() != ISD::EXTRACT_SUBVECTOR)
@@ -4582,23 +4583,34 @@ static SDValue getSingleShuffleSrc(MVT VT, SDValue V1, SDValue V2) {
// Extracting from the same source.
SDValue Src = V1.getOperand(0);
- if (Src != V2.getOperand(0))
- return SDValue();
-
- // Src needs to have twice the number of elements.
- unsigned NumElts = VT.getVectorNumElements();
- if (!Src.getValueType().isFixedLengthVector() ||
- Src.getValueType().getVectorNumElements() != (NumElts * 2))
+ if (Src != V2.getOperand(0) ||
+ VT.isScalableVector() != Src.getValueType().isScalableVector())
return SDValue();
// The extracts must extract the two halves of the source.
if (V1.getConstantOperandVal(1) != 0 ||
- V2.getConstantOperandVal(1) != NumElts)
+ V2.getConstantOperandVal(1) != VT.getVectorMinNumElements())
return SDValue();
return Src;
}
+// Can this shuffle be performed on exactly one (possibly larger) input?
+static SDValue getSingleShuffleSrc(MVT VT, SDValue V1, SDValue V2) {
+
+ if (V2.isUndef())
+ return V1;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ // Src needs to have twice the number of elements.
+ // TODO: Update shuffle lowering to add the extract subvector
+ if (SDValue Src = FoldConcatVector(V1, V2);
+ Src && Src.getValueType().getVectorNumElements() == (NumElts * 2))
+ return Src;
+
+ return SDValue();
+}
+
/// Is this shuffle interleaving contiguous elements from one vector into the
/// even elements and contiguous elements from another vector into the odd
/// elements. \p EvenSrc will contain the element that should be in the first
@@ -11510,12 +11522,27 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
return DAG.getMergeValues(Res, DL);
}
- // TODO: Remove the e64 restriction once the fractional LMUL lowering
- // is improved to always beat the vnsrl lowering below.
- if (Subtarget.hasVendorXRivosVizip() && Factor == 2 &&
- VecVT.getVectorElementType().getSizeInBits() == 64) {
+ if (Subtarget.hasVendorXRivosVizip() && Factor == 2) {
+ MVT VT = Op->getSimpleValueType(0);
SDValue V1 = Op->getOperand(0);
SDValue V2 = Op->getOperand(1);
+
+ // For fractional LMUL, check if we can use a higher LMUL
+ // instruction to avoid a vslidedown.
+ if (SDValue Src = FoldConcatVector(V1, V2);
+ Src && getLMUL1VT(VT).bitsGT(VT)) {
+ EVT NewVT = VT.getDoubleNumVectorElementsVT();
+ SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
+ Src = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewVT, Src, ZeroIdx);
+ SDValue Even = lowerVZIP(RISCVISD::RI_VUNZIP2A_VL, Src,
+ DAG.getUNDEF(NewVT), DL, DAG, Subtarget);
+ SDValue Odd = lowerVZIP(RISCVISD::RI_VUNZIP2B_VL, Src,
+ DAG.getUNDEF(NewVT), DL, DAG, Subtarget);
+ Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Even, ZeroIdx);
+ Odd = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Odd, ZeroIdx);
+ return DAG.getMergeValues({Even, Odd}, DL);
+ }
+
SDValue Even =
lowerVZIP(RISCVISD::RI_VUNZIP2A_VL, V1, V2, DL, DAG, Subtarget);
SDValue Odd =
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index b0b2390b1de37..8a71cd0826672 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -6,62 +6,106 @@
; Integers
define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv32i1(<vscale x 32 x i1> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v0, a0
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmerge.vim v10, v10, 1, v0
-; CHECK-NEXT: vnsrl.wi v12, v8, 0
-; CHECK-NEXT: vnsrl.wi v14, v8, 8
-; CHECK-NEXT: vmsne.vi v0, v12, 0
-; CHECK-NEXT: vmsne.vi v8, v14, 0
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv16i1_nxv32i1:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; V-NEXT: vmv.v.i v10, 0
+; V-NEXT: csrr a0, vlenb
+; V-NEXT: vmerge.vim v8, v10, 1, v0
+; V-NEXT: srli a0, a0, 2
+; V-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; V-NEXT: vslidedown.vx v0, v0, a0
+; V-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; V-NEXT: vmerge.vim v10, v10, 1, v0
+; V-NEXT: vnsrl.wi v12, v8, 0
+; V-NEXT: vnsrl.wi v14, v8, 8
+; V-NEXT: vmsne.vi v0, v12, 0
+; V-NEXT: vmsne.vi v8, v14, 0
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv16i1_nxv32i1:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZIP-NEXT: vmv.v.i v8, 0
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: vmerge.vim v10, v8, 1, v0
+; ZIP-NEXT: srli a0, a0, 2
+; ZIP-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; ZIP-NEXT: vslidedown.vx v0, v0, a0
+; ZIP-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZIP-NEXT: vmerge.vim v8, v8, 1, v0
+; ZIP-NEXT: ri.vunzip2a.vv v12, v10, v8
+; ZIP-NEXT: ri.vunzip2b.vv v14, v10, v8
+; ZIP-NEXT: vmsne.vi v0, v12, 0
+; ZIP-NEXT: vmsne.vi v8, v14, 0
+; ZIP-NEXT: ret
%retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
}
define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv32i8(<vscale x 32 x i8> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv32i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vnsrl.wi v12, v8, 0
-; CHECK-NEXT: vnsrl.wi v14, v8, 8
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: vmv.v.v v10, v14
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv16i8_nxv32i8:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; V-NEXT: vnsrl.wi v12, v8, 0
+; V-NEXT: vnsrl.wi v14, v8, 8
+; V-NEXT: vmv.v.v v8, v12
+; V-NEXT: vmv.v.v v10, v14
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv16i8_nxv32i8:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v12, v8, v10
+; ZIP-NEXT: ri.vunzip2b.vv v14, v8, v10
+; ZIP-NEXT: vmv.v.v v8, v12
+; ZIP-NEXT: vmv.v.v v10, v14
+; ZIP-NEXT: ret
%retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval
}
define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv16i16(<vscale x 16 x i16> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv16i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vnsrl.wi v12, v8, 0
-; CHECK-NEXT: vnsrl.wi v14, v8, 16
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: vmv.v.v v10, v14
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv8i16_nxv16i16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; V-NEXT: vnsrl.wi v12, v8, 0
+; V-NEXT: vnsrl.wi v14, v8, 16
+; V-NEXT: vmv.v.v v8, v12
+; V-NEXT: vmv.v.v v10, v14
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv8i16_nxv16i16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v12, v8, v10
+; ZIP-NEXT: ri.vunzip2b.vv v14, v8, v10
+; ZIP-NEXT: vmv.v.v v8, v12
+; ZIP-NEXT: vmv.v.v v10, v14
+; ZIP-NEXT: ret
%retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
}
define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxvv8i32(<vscale x 8 x i32> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4i32_nxvv8i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT: vnsrl.wx v12, v8, a0
-; CHECK-NEXT: vnsrl.wi v14, v8, 0
-; CHECK-NEXT: vmv.v.v v8, v14
-; CHECK-NEXT: vmv.v.v v10, v12
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv4i32_nxvv8i32:
+; V: # %bb.0:
+; V-NEXT: li a0, 32
+; V-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; V-NEXT: vnsrl.wx v12, v8, a0
+; V-NEXT: vnsrl.wi v14, v8, 0
+; V-NEXT: vmv.v.v v8, v14
+; V-NEXT: vmv.v.v v10, v12
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv4i32_nxvv8i32:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v12, v8, v10
+; ZIP-NEXT: ri.vunzip2b.vv v14, v8, v10
+; ZIP-NEXT: vmv.v.v v8, v12
+; ZIP-NEXT: vmv.v.v v10, v14
+; ZIP-NEXT: ret
%retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
}
@@ -122,69 +166,122 @@ ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
define {<vscale x 64 x i1>, <vscale x 64 x i1>} @vector_deinterleave_nxv64i1_nxv128i1(<vscale x 128 x i1> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv64i1_nxv128i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v24, 0
-; CHECK-NEXT: vmerge.vim v16, v24, 1, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vim v24, v24, 1, v0
-; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT: vnsrl.wi v8, v16, 0
-; CHECK-NEXT: vnsrl.wi v0, v16, 8
-; CHECK-NEXT: vnsrl.wi v12, v24, 0
-; CHECK-NEXT: vnsrl.wi v4, v24, 8
-; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT: vmsne.vi v16, v8, 0
-; CHECK-NEXT: vmsne.vi v8, v0, 0
-; CHECK-NEXT: vmv1r.v v0, v16
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv64i1_nxv128i1:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; V-NEXT: vmv.v.i v24, 0
+; V-NEXT: vmerge.vim v16, v24, 1, v0
+; V-NEXT: vmv1r.v v0, v8
+; V-NEXT: vmerge.vim v24, v24, 1, v0
+; V-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; V-NEXT: vnsrl.wi v8, v16, 0
+; V-NEXT: vnsrl.wi v0, v16, 8
+; V-NEXT: vnsrl.wi v12, v24, 0
+; V-NEXT: vnsrl.wi v4, v24, 8
+; V-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; V-NEXT: vmsne.vi v16, v8, 0
+; V-NEXT: vmsne.vi v8, v0, 0
+; V-NEXT: vmv1r.v v0, v16
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv64i1_nxv128i1:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; ZIP-NEXT: vmv1r.v v9, v0
+; ZIP-NEXT: vmv1r.v v0, v8
+; ZIP-NEXT: vmv.v.i v24, 0
+; ZIP-NEXT: vmerge.vim v16, v24, 1, v0
+; ZIP-NEXT: vmv1r.v v0, v9
+; ZIP-NEXT: vmerge.vim v24, v24, 1, v0
+; ZIP-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v12, v16, v20
+; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20
+; ZIP-NEXT: ri.vunzip2a.vv v8, v24, v28
+; ZIP-NEXT: ri.vunzip2b.vv v0, v24, v28
+; ZIP-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; ZIP-NEXT: vmsne.vi v16, v8, 0
+; ZIP-NEXT: vmsne.vi v8, v0, 0
+; ZIP-NEXT: vmv1r.v v0, v16
+; ZIP-NEXT: ret
%retval = call {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.vector.deinterleave2.nxv128i1(<vscale x 128 x i1> %vec)
ret {<vscale x 64 x i1>, <vscale x 64 x i1>} %retval
}
define {<vscale x 64 x i8>, <vscale x 64 x i8>} @vector_deinterleave_nxv64i8_nxv128i8(<vscale x 128 x i8> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv64i8_nxv128i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT: vmv8r.v v24, v8
-; CHECK-NEXT: vnsrl.wi v8, v24, 0
-; CHECK-NEXT: vnsrl.wi v0, v24, 8
-; CHECK-NEXT: vnsrl.wi v12, v16, 0
-; CHECK-NEXT: vnsrl.wi v4, v16, 8
-; CHECK-NEXT: vmv8r.v v16, v0
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv64i8_nxv128i8:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; V-NEXT: vmv8r.v v24, v8
+; V-NEXT: vnsrl.wi v8, v24, 0
+; V-NEXT: vnsrl.wi v0, v24, 8
+; V-NEXT: vnsrl.wi v12, v16, 0
+; V-NEXT: vnsrl.wi v4, v16, 8
+; V-NEXT: vmv8r.v v16, v0
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv64i8_nxv128i8:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v28, v16, v20
+; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20
+; ZIP-NEXT: ri.vunzip2a.vv v24, v8, v12
+; ZIP-NEXT: ri.vunzip2b.vv v0, v8, v12
+; ZIP-NEXT: vmv8r.v v8, v24
+; ZIP-NEXT: vmv8r.v v16, v0
+; ZIP-NEXT: ret
%retval = call {<vscale x 64 x i8>, <vscale x 64 x i8>} @llvm.vector.deinterleave2.nxv128i8(<vscale x 128 x i8> %vec)
ret {<vscale x 64 x i8>, <vscale x 64 x i8>} %retval
}
define {<vscale x 32 x i16>, <vscale x 32 x i16>} @vector_deinterleave_nxv32i16_nxv64i16(<vscale x 64 x i16> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv32i16_nxv64i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vmv8r.v v24, v8
-; CHECK-NEXT: vnsrl.wi v8, v24, 0
-; CHECK-NEXT: vnsrl.wi v0, v24, 16
-; CHECK-NEXT: vnsrl.wi v12, v16, 0
-; CHECK-NEXT: vnsrl.wi v4, v16, 16
-; CHECK-NEXT: vmv8r.v v16, v0
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv32i16_nxv64i16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; V-NEXT: vmv8r.v v24, v8
+; V-NEXT: vnsrl.wi v8, v24, 0
+; V-NEXT: vnsrl.wi v0, v24, 16
+; V-NEXT: vnsrl.wi v12, v16, 0
+; V-NEXT: vnsrl.wi v4, v16, 16
+; V-NEXT: vmv8r.v v16, v0
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv32i16_nxv64i16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v28, v16, v20
+; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20
+; ZIP-NEXT: ri.vunzip2a.vv v24, v8, v12
+; ZIP-NEXT: ri.vunzip2b.vv v0, v8, v12
+; ZIP-NEXT: vmv8r.v v8, v24
+; ZIP-NEXT: vmv8r.v v16, v0
+; ZIP-NEXT: ret
%retval = call {<vscale x 32 x i16>, <vscale x 32 x i16>} @llvm.vector.deinterleave2.nxv64i16(<vscale x 64 x i16> %vec)
ret {<vscale x 32 x i16>, <vscale x 32 x i16>} %retval
}
define {<vscale x 16 x i32>, <vscale x 16 x i32>} @vector_deinterleave_nxv16i32_nxvv32i32(<vscale x 32 x i32> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv16i32_nxvv32i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT: vmv8r.v v24, v16
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vnsrl.wx v20, v24, a0
-; CHECK-NEXT: vnsrl.wx v16, v8, a0
-; CHECK-NEXT: vnsrl.wi v0, v8, 0
-; CHECK-NEXT: vnsrl.wi v4, v24, 0
-; CHECK-NEXT: vmv8r.v v8, v0
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv16i32_nxvv32i32:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; V-NEXT: vmv8r.v v24, v16
+; V-NEXT: li a0, 32
+; V-NEXT: vnsrl.wx v20, v24, a0
+; V-NEXT: vnsrl.wx v16, v8, a0
+; V-NEXT: vnsrl.wi v0, v8, 0
+; V-NEXT: vnsrl.wi v4, v24, 0
+; V-NEXT: vmv8r.v v8, v0
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv16i32_nxvv32i32:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v28, v16, v20
+; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20
+; ZIP-NEXT: ri.vunzip2a.vv v24, v8, v12
+; ZIP-NEXT: ri.vunzip2b.vv v0, v8, v12
+; ZIP-NEXT: vmv8r.v v8, v24
+; ZIP-NEXT: vmv8r.v v16, v0
+; ZIP-NEXT: ret
%retval = call {<vscale x 16 x i32>, <vscale x 16 x i32>} @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %vec)
ret {<vscale x 16 x i32>, <vscale x 16 x i32>} %retval
}
@@ -254,105 +351,175 @@ ret {<vscale x 8 x i64>, <vscale x 8 x i64>} %retval
; Floats
define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv4bf16(<vscale x 4 x bfloat> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vnsrl.wi v10, v8, 0
-; CHECK-NEXT: vnsrl.wi v9, v8, 16
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; V-NEXT: vnsrl.wi v10, v8, 0
+; V-NEXT: vnsrl.wi v9, v8, 16
+; V-NEXT: vmv1r.v v8, v10
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT: ri.vunzip2b.vv v9, v8, v11
+; ZIP-NEXT: vmv.v.v v8, v10
+; ZIP-NEXT: ret
%retval = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave2.nxv4bf16(<vscale x 4 x bfloat> %vec)
ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %retval
}
define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv4f16(<vscale x 4 x half> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vnsrl.wi v10, v8, 0
-; CHECK-NEXT: vnsrl.wi v9, v8, 16
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; V-NEXT: vnsrl.wi v10, v8, 0
+; V-NEXT: vnsrl.wi v9, v8, 16
+; V-NEXT: vmv1r.v v8, v10
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT: ri.vunzip2b.vv v9, v8, v11
+; ZIP-NEXT: vmv.v.v v8, v10
+; ZIP-NEXT: ret
%retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
}
define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv8bf16(<vscale x 8 x bfloat> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vnsrl.wi v10, v8, 0
-; CHECK-NEXT: vnsrl.wi v11, v8, 16
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: vmv.v.v v9, v11
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv4bf16_nxv8bf16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; V-NEXT: vnsrl.wi v10, v8, 0
+; V-NEXT: vnsrl.wi v11, v8, 16
+; V-NEXT: vmv.v.v v8, v10
+; V-NEXT: vmv.v.v v9, v11
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv4bf16_nxv8bf16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT: ri.vunzip2b.vv v11, v8, v9
+; ZIP-NEXT: vmv.v.v v8, v10
+; ZIP-NEXT: vmv.v.v v9, v11
+; ZIP-NEXT: ret
%retval = call {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @llvm.vector.deinterleave2.nxv8bf16(<vscale x 8 x bfloat> %vec)
ret {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>} %retval
}
define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv8f16(<vscale x 8 x half> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv8f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vnsrl.wi v10, v8, 0
-; CHECK-NEXT: ...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/136463