[llvm] [RISCV] Recursively split concat_vector into smaller LMULs (PR #83035)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 26 09:18:06 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-risc-v
Author: Luke Lau (lukel97)
Changes:
This is the concat_vector equivalent of #81312, in that we recursively split concat_vectors with more than two operands into smaller concat_vectors.
This allows us to break up the chain of vslideups and to perform the vslideups at a smaller LMUL, which in turn reduces register pressure: the previous lowering performed N vslideups at the highest result LMUL.
This is done as a DAG combine so that any undef operands are combined away: if we did this during lowering, we would end up with unnecessary vslideups of undefs.
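As an illustration of the splitting strategy, here is a minimal standalone C++ sketch. It is illustrative only: the real combine in the diff below operates on SDNodes, does one level of splitting per combine invocation, and lowers the outer level to insert_subvectors directly. The `splitConcat` helper and operand names are hypothetical.

```cpp
#include <iostream>
#include <string>
#include <vector>

// Recursively split an N-way concat into a balanced tree of 2-way concats.
// Each level works on half as many operands, so in the real lowering each
// vslideup can run at half the LMUL of its parent instead of all N slideups
// running at the full result LMUL.
static std::string splitConcat(const std::vector<std::string> &Ops) {
  if (Ops.size() == 1)
    return Ops.front();
  if (Ops.size() == 2)
    return "concat_vector(" + Ops[0] + ", " + Ops[1] + ")";
  // Split the operand list in half, rounding the bottom half up just like
  // the combine's (NumOperands + 1) / 2, and recurse on each half.
  size_t Half = (Ops.size() + 1) / 2;
  std::vector<std::string> Bot(Ops.begin(), Ops.begin() + Half);
  std::vector<std::string> Top(Ops.begin() + Half, Ops.end());
  return "concat_vector(" + splitConcat(Bot) + ", " + splitConcat(Top) + ")";
}

int main() {
  // A 4-operand concat becomes two half-width concats combined at the end:
  // concat_vector(concat_vector(op1, op2), concat_vector(op3, op4))
  std::cout << splitConcat({"op1", "op2", "op3", "op4"}) << "\n";
}
```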
---
Patch is 114.13 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/83035.diff
11 Files Affected:
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+48-4)
- (modified) llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll (+57-61)
- (modified) llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll (+1-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll (+20-23)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll (+56-128)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll (+47-79)
- (modified) llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll (+617-449)
- (modified) llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll (+7-6)
- (modified) llvm/test/CodeGen/RISCV/rvv/pr63596.ll (+18-19)
- (modified) llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll (+88-117)
``````````diff
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0c98642748d4ec..64661ff26f5444 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -15262,13 +15262,54 @@ static SDValue performINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
}
+// Recursively split up concat_vectors with more than 2 operands:
+//
+// concat_vector op1, op2, op3, op4
+// ->
+// concat_vector (concat_vector op1, op2), (concat_vector op3, op4)
+//
+// This reduces the length of the chain of vslideups and allows us to perform
+// the vslideups at a smaller LMUL.
+//
+// We do this as a DAG combine rather than during lowering so that any undef
+// operands can get combined away.
+static SDValue
+performCONCAT_VECTORSSplitCombine(SDNode *N, SelectionDAG &DAG,
+ const RISCVTargetLowering &TLI) {
+ SDLoc DL(N);
+
+ if (N->getNumOperands() <= 2)
+ return SDValue();
+
+ if (!TLI.isTypeLegal(N->getValueType(0)))
+ return SDValue();
+ MVT VT = N->getSimpleValueType(0);
+
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ size_t HalfNumOps = (N->getNumOperands() + 1) / 2;
+ SDValue BotSubConcat = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
+ N->ops().take_front(HalfNumOps));
+ SDValue TopSubConcat = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
+ N->ops().drop_front(HalfNumOps));
+
+ // Lower to an insert_subvector directly so the concat_vectors don't get
+ // recombined.
+ SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
+ BotSubConcat, DAG.getVectorIdxConstant(0, DL));
+ Vec = DAG.getNode(
+ ISD::INSERT_SUBVECTOR, DL, VT, Vec, TopSubConcat,
+ DAG.getVectorIdxConstant(HalfVT.getVectorMinNumElements(), DL));
+ return Vec;
+}
+
// If we're concatenating a series of vector loads like
// concat_vectors (load v4i8, p+0), (load v4i8, p+n), (load v4i8, p+n*2) ...
// Then we can turn this into a strided load by widening the vector elements
// vlse32 p, stride=n
-static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget,
- const RISCVTargetLowering &TLI) {
+static SDValue
+performCONCAT_VECTORSStridedLoadCombine(SDNode *N, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget,
+ const RISCVTargetLowering &TLI) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
@@ -16373,7 +16414,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return V;
break;
case ISD::CONCAT_VECTORS:
- if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this))
+ if (SDValue V =
+ performCONCAT_VECTORSStridedLoadCombine(N, DAG, Subtarget, *this))
+ return V;
+ if (SDValue V = performCONCAT_VECTORSSplitCombine(N, DAG, *this))
return V;
break;
case ISD::INSERT_VECTOR_ELT:
diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
index 87d95d7596d4fa..fd087ba3a21d12 100644
--- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
@@ -123,36 +123,35 @@ define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) {
define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
; CHECK-LABEL: fv64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: lui a0, %hi(.LCPI9_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0)
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vid.v v16
-; CHECK-NEXT: vsaddu.vx v16, v16, a1
-; CHECK-NEXT: vmsltu.vx v0, v16, a2
-; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 2
; CHECK-NEXT: lui a0, %hi(.LCPI9_1)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 4
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsext.vf8 v16, v9
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v9, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 2
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: lui a0, %hi(.LCPI9_2)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vid.v v16
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v0, v16, a2
; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v0, v8, 2
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 6
+; CHECK-NEXT: vslideup.vi v0, v9, 4
; CHECK-NEXT: ret
%mask = call <64 x i1> @llvm.get.active.lane.mask.v64i1.i64(i64 %index, i64 %tc)
ret <64 x i1> %mask
@@ -161,72 +160,69 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
; CHECK-LABEL: fv128:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: lui a0, %hi(.LCPI10_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0)
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vid.v v16
-; CHECK-NEXT: vsaddu.vx v16, v16, a1
-; CHECK-NEXT: vmsltu.vx v0, v16, a2
-; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 2
; CHECK-NEXT: lui a0, %hi(.LCPI10_1)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_1)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 6, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 4
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsext.vf8 v16, v9
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v9, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 2
; CHECK-NEXT: lui a0, %hi(.LCPI10_2)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2)
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 8, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 6
; CHECK-NEXT: lui a0, %hi(.LCPI10_3)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v10, (a0)
; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 10, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 8
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsext.vf8 v16, v10
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v10, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v8, 2
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v9, 4
; CHECK-NEXT: lui a0, %hi(.LCPI10_4)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4)
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 12, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 10
; CHECK-NEXT: lui a0, %hi(.LCPI10_5)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 14, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 12
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsext.vf8 v16, v9
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v9, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 2
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: lui a0, %hi(.LCPI10_6)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vid.v v16
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v0, v16, a2
; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 14
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v0, v8, 2
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v0, v9, 4
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v0, v10, 8
; CHECK-NEXT: ret
%mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc)
ret <128 x i1> %mask
diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll b/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
index c64216180c2af7..ed434deea1a837 100644
--- a/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
@@ -19,7 +19,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
; RV32-NEXT: th.swia a0, (a1), 4, 0
; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; RV32-NEXT: vle8.v v10, (a3)
-; RV32-NEXT: vsetivli zero, 8, e8, m1, tu, ma
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; RV32-NEXT: vslideup.vi v10, v9, 4
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; RV32-NEXT: vzext.vf4 v12, v10
@@ -42,7 +42,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
; RV64-NEXT: th.swia a0, (a1), 4, 0
; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; RV64-NEXT: vle8.v v10, (a3)
-; RV64-NEXT: vsetivli zero, 8, e8, m1, tu, ma
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; RV64-NEXT: vslideup.vi v10, v9, 4
; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; RV64-NEXT: vzext.vf4 v12, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index a2d02b6bb641b2..418e842a6725e0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -469,9 +469,8 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vx v13, v10, a0
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v13, v10, a0
; CHECK-NEXT: vslidedown.vx v12, v9, a0
; CHECK-NEXT: add a1, a0, a0
; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
index 8acc70faaa1fc9..8f9d26ba001c22 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
@@ -7,14 +7,14 @@
define <4 x i1> @load_large_vector(ptr %p) {
; ZVE32X-LABEL: load_large_vector:
; ZVE32X: # %bb.0:
-; ZVE32X-NEXT: ld a1, 80(a0)
-; ZVE32X-NEXT: ld a2, 72(a0)
-; ZVE32X-NEXT: ld a3, 56(a0)
-; ZVE32X-NEXT: ld a4, 32(a0)
-; ZVE32X-NEXT: ld a5, 24(a0)
-; ZVE32X-NEXT: ld a6, 48(a0)
-; ZVE32X-NEXT: ld a7, 8(a0)
-; ZVE32X-NEXT: ld a0, 0(a0)
+; ZVE32X-NEXT: ld a1, 8(a0)
+; ZVE32X-NEXT: ld a2, 0(a0)
+; ZVE32X-NEXT: ld a3, 32(a0)
+; ZVE32X-NEXT: ld a4, 80(a0)
+; ZVE32X-NEXT: ld a5, 72(a0)
+; ZVE32X-NEXT: ld a6, 24(a0)
+; ZVE32X-NEXT: ld a7, 56(a0)
+; ZVE32X-NEXT: ld a0, 48(a0)
; ZVE32X-NEXT: xor a4, a5, a4
; ZVE32X-NEXT: snez a4, a4
; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
@@ -28,12 +28,10 @@ define <4 x i1> @load_large_vector(ptr %p) {
; ZVE32X-NEXT: vmv.s.x v10, a0
; ZVE32X-NEXT: vand.vi v10, v10, 1
; ZVE32X-NEXT: vmsne.vi v0, v10, 0
-; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; ZVE32X-NEXT: vmv.v.i v10, 0
; ZVE32X-NEXT: vmerge.vim v11, v10, 1, v0
-; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, tu, ma
; ZVE32X-NEXT: vslideup.vi v11, v9, 1
-; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; ZVE32X-NEXT: vmsne.vi v0, v11, 0
; ZVE32X-NEXT: vmerge.vim v9, v10, 1, v0
; ZVE32X-NEXT: xor a0, a6, a3
@@ -42,22 +40,21 @@ define <4 x i1> @load_large_vector(ptr %p) {
; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; ZVE32X-NEXT: vand.vi v11, v11, 1
; ZVE32X-NEXT: vmsne.vi v0, v11, 0
-; ZVE32X-NEXT: vmerge.vim v11, v8, 1, v0
-; ZVE32X-NEXT: vsetivli zero, 3, e8, mf4, tu, ma
-; ZVE32X-NEXT: vslideup.vi v9, v11, 2
-; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT: vmsne.vi v0, v9, 0
-; ZVE32X-NEXT: vmerge.vim v9, v10, 1, v0
+; ZVE32X-NEXT: vmerge.vim v8, v8, 1, v0
; ZVE32X-NEXT: xor a1, a2, a1
; ZVE32X-NEXT: snez a0, a1
-; ZVE32X-NEXT: vmv.s.x v10, a0
-; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32X-NEXT: vand.vi v10, v10, 1
+; ZVE32X-NEXT: vmv.s.x v11, a0
+; ZVE32X-NEXT: vand.vi v11, v11, 1
+; ZVE32X-NEXT: vmsne.vi v0, v11, 0
+; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; ZVE32X-NEXT: vmerge.vim v10, v10, 1, v0
+; ZVE32X-NEXT: vslideup.vi v10, v8, 1
; ZVE32X-NEXT: vmsne.vi v0, v10, 0
-; ZVE32X-NEXT: vmerge.vim v8, v8, 1, v0
; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT: vslideup.vi v9, v8, 3
-; ZVE32X-NEXT: vmsne.vi v0, v9, 0
+; ZVE32X-NEXT: vmv.v.i v8, 0
+; ZVE32X-NEXT: vmerge.vim v8, v8, 1, v0
+; ZVE32X-NEXT: vslideup.vi v8, v9, 2
+; ZVE32X-NEXT: vmsne.vi v0, v8, 0
; ZVE32X-NEXT: ret
;
; ZVE64X-LABEL: load_large_vector:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index 6ef5aa846d6d96..59fd864e191716 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -16,14 +16,11 @@ define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; CHECK-LABEL: concat_4xv2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmv1r.v v12, v11
-; CHECK-NEXT: vmv1r.v v14, v9
-; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v14, 2
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v11, 2
+; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v12, 6
+; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: ret
%ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -34,24 +31,18 @@ define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x i32> %d, <1 x i32> %e, <1 x i32> %f, <1 x i32> %g, <1 x i32> %h) {
; CHECK-LABEL: concat_8xv1i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmv1r.v v16, v15
-; CHECK-NEXT: vmv1r.v v18, v13
-; CHECK-NEXT: vmv1r.v v20, v11
-; CHECK-NEXT: vmv1r.v v22, v9
-; CHECK-NEXT: vsetivli zero, 2, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v22, 1
-; CHECK-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v14, v15, 1
+; CHECK-NEXT: vslideup.vi v12, v13, 1
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v12, v14, 2
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v11, 1
+; CHECK-NEXT: vslideup.vi v8, v9, 1
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 2
-; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v20, 3
-; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v12, 4
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v18, 5
-; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v14, 6
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v16, 7
+; CHECK-NEXT: vslideup.vi v8, v12, 4
; CHECK-NEXT: ret
%ab = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
%cd = shufflevector <1 x i32> %c, <1 x i32> %d, <2 x i32> <i32 0, i32 1>
@@ -77,15 +68,14 @@ define <16 x i32> @concat_2xv8i32(<8 x i32> %a, <8 x i32> %b) {
define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; CHECK-LABEL: concat_4xv4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmv1r.v v12, v11
-; CHECK-NEXT: vmv1r.v v16, v10
-; CHECK-NEXT: vmv1r.v v20, v9
-; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v20, 4
-; CHECK-NEXT: vsetivli zero, 12, e32, m4, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v16, 8
+; CHECK-NEXT: vmv1r.v v14, v11
+; CHECK-NEXT: vmv1r.v v12, v10
+; CHECK-NEXT: vmv1r.v v10, v9
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v12, v14, 4
+; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v12, 12
+; CHECK-NEXT: vslideup.vi v8, v12, 8
; CHECK-NEXT: ret
%ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -96,26 +86,18 @@ define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d, <2 x i32> %e, <2 x i32> %f, <2 x i32> %g, <2 x i32> %h) {
; CHECK-LABEL: concat_8xv2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmv1r.v v16, v15
-; CHECK-NEXT: vmv1r.v v20, v14
-; CHECK-NEXT: vmv1r.v v24, v13
-; CHECK-NEXT: vmv1r.v v28, v11
-; CHECK-NEXT: vmv1r.v v0, v10
-; CHECK-NEXT: vmv1r.v v4, v9
-; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v4, 2
-; CHECK-NEXT: vsetivli zero, 6, e32, m4, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v0, 4
-; CHECK-NEXT: ...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/83035
More information about the llvm-commits mailing list