[llvm] [RISCV] Optimize two source deinterleave2 via ri.vunzip2{a, b} (PR #142667)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 3 13:20:06 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-risc-v
Author: Philip Reames (preames)
Changes:
As done for the existing vnsrl cases, we can split a two-source deinterleave2
into two single-source deinterleave2 operations and a slideup. Additionally, if
we happen to know the exact VLEN and our fixed vectors span a whole number of
vector registers, we can avoid the split entirely and just use both register
sources directly.

In the review, I included these as separate changes since I find that slightly
easier to follow. I can either land them squashed or individually, as reviewers
prefer.
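For illustration, the general case is visible in the updated
`unzip2a_dual_v4i64` test below: a two-source deinterleave2 shuffle is lowered
as one single-source `ri.vunzip2a.vv` per operand, with the two narrowed
results joined by a `vslideup.vi`. A condensed sketch of that test follows
(only the trailing `ret` is added here; the rest is taken from the diff):

```llvm
; Even elements of %a followed by the even elements of %b.
define <4 x i64> @unzip2a_dual_v4i64(<4 x i64> %a, <4 x i64> %b) {
entry:
  %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x i64> %c
}
```

When the exact VLEN is known and the type fills a whole number of vector
registers (the `_exact` variants in the test diff), no split is needed and a
single two-source `ri.vunzip2a.vv` suffices.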
---
Full diff: https://github.com/llvm/llvm-project/pull/142667.diff
2 Files Affected:
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+17)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll (+21-61)
``````````diff
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f74ca2a1c5492..777f4f91908d4 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5830,6 +5830,9 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
Index == 0 ? RISCVISD::RI_VUNZIP2A_VL : RISCVISD::RI_VUNZIP2B_VL;
if (V2.isUndef())
return lowerVZIP(Opc, V1, V2, DL, DAG, Subtarget);
+ if (auto VLEN = Subtarget.getRealVLen();
+ VLEN && VT.getSizeInBits().getKnownMinValue() % *VLEN == 0)
+ return lowerVZIP(Opc, V1, V2, DL, DAG, Subtarget);
if (SDValue Src = foldConcatVector(V1, V2)) {
EVT NewVT = VT.getDoubleNumVectorElementsVT();
Src = DAG.getExtractSubvector(DL, NewVT, Src, 0);
@@ -5837,6 +5840,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
lowerVZIP(Opc, Src, DAG.getUNDEF(NewVT), DL, DAG, Subtarget);
return DAG.getExtractSubvector(DL, VT, Res, 0);
}
+ // Narrow each source and concatenate them.
+ // FIXME: For small LMUL it is better to concatenate first.
+ if (1 < count_if(Mask,
+ [&Mask](int Idx) { return Idx < (int)Mask.size(); }) &&
+ 1 < count_if(Mask,
+ [&Mask](int Idx) { return Idx >= (int)Mask.size(); })) {
+ SDValue Lo = lowerVZIP(Opc, V1, DAG.getUNDEF(VT), DL, DAG, Subtarget);
+ SDValue Hi = lowerVZIP(Opc, V2, DAG.getUNDEF(VT), DL, DAG, Subtarget);
+
+ MVT SubVT = VT.getHalfNumVectorElementsVT();
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+ DAG.getExtractSubvector(DL, SubVT, Lo, 0),
+ DAG.getExtractSubvector(DL, SubVT, Hi, 0));
+ }
}
if (SDValue V =
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll
index 9c884454aa025..14b0e8352efa3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll
@@ -1364,13 +1364,11 @@ define <4 x i64> @unzip2a_dual_v4i64(<4 x i64> %a, <4 x i64> %b) {
;
; ZIP-LABEL: unzip2a_dual_v4i64:
; ZIP: # %bb.0: # %entry
-; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, mu
-; ZIP-NEXT: vmv.v.i v0, 8
-; ZIP-NEXT: vslideup.vi v10, v9, 2
-; ZIP-NEXT: vslideup.vi v10, v9, 1, v0.t
-; ZIP-NEXT: vmv.v.i v0, 12
-; ZIP-NEXT: ri.vunzip2a.vv v11, v8, v9
-; ZIP-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v11, v9, v10
+; ZIP-NEXT: ri.vunzip2a.vv v9, v8, v10
+; ZIP-NEXT: vslideup.vi v9, v11, 2
+; ZIP-NEXT: vmv.v.v v8, v9
; ZIP-NEXT: ret
entry:
%c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1502,16 +1500,11 @@ define <16 x i64> @unzip2a_dual_v16i64(<16 x i64> %a, <16 x i64> %b) {
; ZIP-LABEL: unzip2a_dual_v16i64:
; ZIP: # %bb.0: # %entry
; ZIP-NEXT: vsetivli zero, 8, e64, m2, ta, ma
-; ZIP-NEXT: ri.vunzip2a.vv v16, v8, v10
-; ZIP-NEXT: vsetivli zero, 16, e16, m1, ta, ma
-; ZIP-NEXT: vid.v v8
-; ZIP-NEXT: li a0, -256
-; ZIP-NEXT: vadd.vv v8, v8, v8
-; ZIP-NEXT: vmv.s.x v0, a0
-; ZIP-NEXT: vadd.vi v8, v8, -16
-; ZIP-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; ZIP-NEXT: vrgatherei16.vv v16, v12, v8, v0.t
-; ZIP-NEXT: vmv.v.v v8, v16
+; ZIP-NEXT: ri.vunzip2a.vv v16, v12, v14
+; ZIP-NEXT: ri.vunzip2a.vv v12, v8, v10
+; ZIP-NEXT: vsetivli zero, 16, e64, m4, ta, ma
+; ZIP-NEXT: vslideup.vi v12, v16, 8
+; ZIP-NEXT: vmv.v.v v8, v12
; ZIP-NEXT: ret
entry:
%c = shufflevector <16 x i64> %a, <16 x i64> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1557,13 +1550,9 @@ define <4 x i64> @unzip2a_dual_v4i64_exact(<4 x i64> %a, <4 x i64> %b) vscale_ra
;
; ZIP-LABEL: unzip2a_dual_v4i64_exact:
; ZIP: # %bb.0: # %entry
-; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, mu
-; ZIP-NEXT: vmv.v.i v0, 8
-; ZIP-NEXT: vslideup.vi v10, v9, 2
-; ZIP-NEXT: vslideup.vi v10, v9, 1, v0.t
-; ZIP-NEXT: vmv.v.i v0, 12
-; ZIP-NEXT: ri.vunzip2a.vv v11, v8, v9
-; ZIP-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT: vmv.v.v v8, v10
; ZIP-NEXT: ret
entry:
%c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1609,13 +1598,12 @@ define <4 x i64> @unzip2a_dual_v4i64_exact_nf2(<4 x i64> %a, <4 x i64> %b) vscal
;
; ZIP-LABEL: unzip2a_dual_v4i64_exact_nf2:
; ZIP: # %bb.0: # %entry
-; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, mu
-; ZIP-NEXT: vmv.v.i v0, 8
-; ZIP-NEXT: vslideup.vi v10, v9, 2
-; ZIP-NEXT: vslideup.vi v10, v9, 1, v0.t
-; ZIP-NEXT: vmv.v.i v0, 12
-; ZIP-NEXT: ri.vunzip2a.vv v11, v8, v9
-; ZIP-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v11, v9, v10
+; ZIP-NEXT: ri.vunzip2a.vv v9, v8, v10
+; ZIP-NEXT: vsetvli zero, zero, e64, m1, tu, ma
+; ZIP-NEXT: vslideup.vi v9, v11, 2
+; ZIP-NEXT: vmv1r.v v8, v9
; ZIP-NEXT: ret
entry:
%c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1740,37 +1728,9 @@ define <16 x i64> @unzip2a_dual_v16i64_exact(<16 x i64> %a, <16 x i64> %b) vscal
;
; ZIP-LABEL: unzip2a_dual_v16i64_exact:
; ZIP: # %bb.0: # %entry
-; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, mu
-; ZIP-NEXT: vslideup.vi v18, v15, 2
-; ZIP-NEXT: vmv.v.i v16, 8
-; ZIP-NEXT: vmv.v.i v17, 12
-; ZIP-NEXT: vslideup.vi v20, v13, 2
-; ZIP-NEXT: vmv.v.v v0, v16
-; ZIP-NEXT: vslideup.vi v18, v15, 1, v0.t
-; ZIP-NEXT: ri.vunzip2a.vv v15, v14, v19
-; ZIP-NEXT: vmv.v.v v0, v17
-; ZIP-NEXT: vmerge.vvm v15, v15, v18, v0
-; ZIP-NEXT: vmv.v.v v0, v16
-; ZIP-NEXT: vslideup.vi v20, v13, 1, v0.t
-; ZIP-NEXT: ri.vunzip2a.vv v14, v12, v13
-; ZIP-NEXT: vslideup.vi v12, v11, 2
-; ZIP-NEXT: vslideup.vi v18, v9, 2
-; ZIP-NEXT: vmv.v.v v0, v17
-; ZIP-NEXT: vmerge.vvm v14, v14, v20, v0
-; ZIP-NEXT: li a0, -256
-; ZIP-NEXT: ri.vunzip2a.vv v20, v10, v13
-; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v19
-; ZIP-NEXT: vmv.v.v v0, v16
-; ZIP-NEXT: vslideup.vi v12, v11, 1, v0.t
-; ZIP-NEXT: vmv.v.v v0, v17
-; ZIP-NEXT: vmerge.vvm v13, v20, v12, v0
-; ZIP-NEXT: vmv.v.v v0, v16
-; ZIP-NEXT: vslideup.vi v18, v9, 1, v0.t
-; ZIP-NEXT: vmv.v.v v0, v17
-; ZIP-NEXT: vmerge.vvm v12, v10, v18, v0
-; ZIP-NEXT: vmv.s.x v0, a0
; ZIP-NEXT: vsetivli zero, 16, e64, m4, ta, ma
-; ZIP-NEXT: vmerge.vvm v8, v12, v12, v0
+; ZIP-NEXT: ri.vunzip2a.vv v16, v8, v12
+; ZIP-NEXT: vmv.v.v v8, v16
; ZIP-NEXT: ret
entry:
%c = shufflevector <16 x i64> %a, <16 x i64> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
``````````
https://github.com/llvm/llvm-project/pull/142667