[llvm] [RISCV] Recurse on second operand of two operand shuffles (PR #79197)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 23 11:55:48 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-risc-v
Author: Philip Reames (preames)
Changes
This builds on bdc41106ee48dce59c500c9a3957af947f30c8c3.
This change completes the migration to a recursive shuffle lowering strategy: when we encounter an unknown two-operand shuffle, we lower each operand as a single-source permute and then combine the results with a vselect (i.e. a vmerge). For code quality this relies on the post-isel combine, which aggressively folds that vmerge back into the materialization of the second operand whenever possible.
Note: This change includes only the most immediately obvious stylistic cleanup. There is additional code movement this enables, which I'll do as a separate patch since rolling it into this one would create an unreadable diff.
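To make the decomposition concrete, here is a minimal standalone sketch (plain C++ with an illustrative 4-element mask of my own choosing, not the patch's LLVM code) of how a two-operand shuffle mask splits into two single-source masks plus a per-element select, mirroring the ShuffleMaskLHS/ShuffleMaskRHS construction in the diff below:

```cpp
// Standalone illustration (not the patch's code): split a two-source shuffle
// mask into a single-source mask per operand plus a per-element select.
#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 4;
  // Hypothetical two-source mask: indices 0..3 pick from V1, 4..7 from V2,
  // and -1 is undef.
  std::vector<int> Mask = {0, 5, -1, 7};

  std::vector<int> MaskLHS, MaskRHS;
  std::vector<bool> UseRHS; // true -> take this lane from the RHS permute
  for (int M : Mask) {
    bool IsLHSOrUndef = M < NumElts; // undef (-1) also lands here
    MaskLHS.push_back(IsLHSOrUndef && M >= 0 ? M : -1);
    MaskRHS.push_back(IsLHSOrUndef ? -1 : M - NumElts);
    UseRHS.push_back(!IsLHSOrUndef);
  }

  // Result = vselect(UseRHS, shuffle(V2, MaskRHS), shuffle(V1, MaskLHS)).
  for (int I = 0; I < NumElts; ++I)
    std::printf("lane %d: LHS idx %2d, RHS idx %2d, select %s\n", I,
                MaskLHS[I], MaskRHS[I], UseRHS[I] ? "RHS" : "LHS");
  return 0;
}
```

Each half is then lowered as an independent single-source permute, and the vselect that recombines them is what the post-isel combine folds back into a masked vrgather or slide, as the test diffs below show.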
---
Patch is 57.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/79197.diff
9 Files Affected:
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+17-62)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll (+27-9)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll (+27-9)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll (+8-15)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll (+143-190)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll (+2-5)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll (+52-77)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll (+16-20)
- (modified) llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll (+4-4)
``````````diff
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e39888637062c6..c91a66e26f982b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4975,12 +4975,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
// As a backup, shuffles can be lowered via a vrgather instruction, possibly
// merged with a second vrgather.
- SmallVector<SDValue> GatherIndicesLHS, GatherIndicesRHS;
-
- // Keep a track of which non-undef indices are used by each LHS/RHS shuffle
- // half.
- DenseMap<int, unsigned> LHSIndexCounts, RHSIndexCounts;
-
+ SmallVector<int> ShuffleMaskLHS, ShuffleMaskRHS;
SmallVector<SDValue> MaskVals;
// Now construct the mask that will be used by the blended vrgather operation.
@@ -4989,28 +4984,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ !SwapOps;
MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
bool IsLHSOrUndefIndex = MaskIndex < (int)NumElts;
- GatherIndicesLHS.push_back(IsLHSOrUndefIndex && MaskIndex >= 0
- ? DAG.getConstant(MaskIndex, DL, XLenVT)
- : DAG.getUNDEF(XLenVT));
- GatherIndicesRHS.push_back(
- IsLHSOrUndefIndex ? DAG.getUNDEF(XLenVT)
- : DAG.getConstant(MaskIndex - NumElts, DL, XLenVT));
- if (IsLHSOrUndefIndex && MaskIndex >= 0)
- ++LHSIndexCounts[MaskIndex];
- if (!IsLHSOrUndefIndex)
- ++RHSIndexCounts[MaskIndex - NumElts];
+ ShuffleMaskLHS.push_back(IsLHSOrUndefIndex && MaskIndex >= 0
+ ? MaskIndex : -1);
+ ShuffleMaskRHS.push_back(IsLHSOrUndefIndex ? -1 : (MaskIndex - NumElts));
}
if (SwapOps) {
std::swap(V1, V2);
- std::swap(GatherIndicesLHS, GatherIndicesRHS);
+ std::swap(ShuffleMaskLHS, ShuffleMaskRHS);
}
assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle");
MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
- unsigned GatherVXOpc = RISCVISD::VRGATHER_VX_VL;
unsigned GatherVVOpc = RISCVISD::VRGATHER_VV_VL;
MVT IndexVT = VT.changeTypeToInteger();
// Since we can't introduce illegal index types at this stage, use i16 and
@@ -5038,6 +5025,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
// are handled above.
if (V2.isUndef()) {
V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
+ SmallVector<SDValue> GatherIndicesLHS;
+ for (int ShuffleIdx : ShuffleMaskLHS)
+ GatherIndicesLHS.push_back(ShuffleIdx != -1
+ ? DAG.getConstant(ShuffleIdx, DL, XLenVT)
+ : DAG.getUNDEF(XLenVT));
SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG,
Subtarget);
@@ -5046,50 +5038,13 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
return convertFromScalableVector(VT, Gather, DAG, Subtarget);
}
- // Translate the gather index we computed above (and possibly swapped)
- // back to a shuffle mask. This step should disappear once we complete
- // the migration to recursive design.
- SmallVector<int> ShuffleMaskLHS;
- ShuffleMaskLHS.reserve(GatherIndicesLHS.size());
- for (SDValue GatherIndex : GatherIndicesLHS) {
- if (GatherIndex.isUndef()) {
- ShuffleMaskLHS.push_back(-1);
- continue;
- }
- auto *IdxC = cast<ConstantSDNode>(GatherIndex);
- ShuffleMaskLHS.push_back(IdxC->getZExtValue());
- }
-
- // Recursively invoke lowering for the LHS as if there were no RHS.
- // This allows us to leverage all of our single source permute tricks.
- SDValue Gather =
- DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS);
- Gather = convertToScalableVector(ContainerVT, Gather, DAG, Subtarget);
-
- // Blend in second vector source with an additional vrgather.
- V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
-
- MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
- SelectMask =
- convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
-
- // If only one index is used, we can use a "splat" vrgather.
- // TODO: We can splat the most-common index and fix-up any stragglers, if
- // that's beneficial.
- if (RHSIndexCounts.size() == 1) {
- int SplatIndex = RHSIndexCounts.begin()->getFirst();
- Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
- DAG.getConstant(SplatIndex, DL, XLenVT), Gather,
- SelectMask, VL);
- } else {
- SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
- RHSIndices =
- convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
- Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather,
- SelectMask, VL);
- }
-
- return convertFromScalableVector(VT, Gather, DAG, Subtarget);
+ // Recursively invoke lowering for each operand if we had two
+ // independent single source permutes, and then combine the result via a
+ // vselect. Note that the vselect will likely be folded back into the
+ // second permute (vrgather, or other) by the post-isel combine.
+ V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS);
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), ShuffleMaskRHS);
+ return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V2, V1);
}
bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
index dab530751ef96b..6bfd0ac932672f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -238,26 +238,44 @@ define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) {
define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
; V128-LABEL: interleave_v32f32:
; V128: # %bb.0:
+; V128-NEXT: addi sp, sp, -16
+; V128-NEXT: .cfi_def_cfa_offset 16
+; V128-NEXT: csrr a0, vlenb
+; V128-NEXT: slli a0, a0, 3
+; V128-NEXT: sub sp, sp, a0
+; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; V128-NEXT: vmv8r.v v0, v16
+; V128-NEXT: addi a0, sp, 16
+; V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; V128-NEXT: vmv8r.v v16, v8
; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT: vslidedown.vi v0, v8, 16
+; V128-NEXT: vslidedown.vi v8, v0, 16
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT: vwaddu.vv v24, v0, v8
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v24, a0, v8
-; V128-NEXT: lui a1, %hi(.LCPI10_0)
-; V128-NEXT: addi a1, a1, %lo(.LCPI10_0)
-; V128-NEXT: li a2, 32
-; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu
-; V128-NEXT: vle16.v v12, (a1)
+; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; V128-NEXT: vslidedown.vi v0, v16, 16
+; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; V128-NEXT: vwaddu.vv v8, v0, v16
+; V128-NEXT: vwmaccu.vx v8, a0, v16
; V128-NEXT: lui a1, 699051
; V128-NEXT: addi a1, a1, -1366
+; V128-NEXT: li a2, 32
; V128-NEXT: vmv.s.x v0, a1
-; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t
+; V128-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; V128-NEXT: vmerge.vvm v24, v8, v24, v0
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT: vwaddu.vv v0, v8, v16
-; V128-NEXT: vwmaccu.vx v0, a0, v16
+; V128-NEXT: addi a1, sp, 16
+; V128-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; V128-NEXT: vwaddu.vv v0, v16, v8
+; V128-NEXT: vwmaccu.vx v0, a0, v8
; V128-NEXT: vmv8r.v v8, v0
; V128-NEXT: vmv8r.v v16, v24
+; V128-NEXT: csrr a0, vlenb
+; V128-NEXT: slli a0, a0, 3
+; V128-NEXT: add sp, sp, a0
+; V128-NEXT: addi sp, sp, 16
; V128-NEXT: ret
;
; V512-LABEL: interleave_v32f32:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
index 9e21cc9e3d624a..6da83644413bc2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -403,26 +403,44 @@ define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) {
define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
; V128-LABEL: interleave_v32i32:
; V128: # %bb.0:
+; V128-NEXT: addi sp, sp, -16
+; V128-NEXT: .cfi_def_cfa_offset 16
+; V128-NEXT: csrr a0, vlenb
+; V128-NEXT: slli a0, a0, 3
+; V128-NEXT: sub sp, sp, a0
+; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; V128-NEXT: vmv8r.v v0, v16
+; V128-NEXT: addi a0, sp, 16
+; V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; V128-NEXT: vmv8r.v v16, v8
; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT: vslidedown.vi v0, v8, 16
+; V128-NEXT: vslidedown.vi v8, v0, 16
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT: vwaddu.vv v24, v0, v8
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v24, a0, v8
-; V128-NEXT: lui a1, %hi(.LCPI17_0)
-; V128-NEXT: addi a1, a1, %lo(.LCPI17_0)
-; V128-NEXT: li a2, 32
-; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu
-; V128-NEXT: vle16.v v12, (a1)
+; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; V128-NEXT: vslidedown.vi v0, v16, 16
+; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; V128-NEXT: vwaddu.vv v8, v0, v16
+; V128-NEXT: vwmaccu.vx v8, a0, v16
; V128-NEXT: lui a1, 699051
; V128-NEXT: addi a1, a1, -1366
+; V128-NEXT: li a2, 32
; V128-NEXT: vmv.s.x v0, a1
-; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t
+; V128-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; V128-NEXT: vmerge.vvm v24, v8, v24, v0
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT: vwaddu.vv v0, v8, v16
-; V128-NEXT: vwmaccu.vx v0, a0, v16
+; V128-NEXT: addi a1, sp, 16
+; V128-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; V128-NEXT: vwaddu.vv v0, v16, v8
+; V128-NEXT: vwmaccu.vx v0, a0, v8
; V128-NEXT: vmv8r.v v8, v0
; V128-NEXT: vmv8r.v v16, v24
+; V128-NEXT: csrr a0, vlenb
+; V128-NEXT: slli a0, a0, 3
+; V128-NEXT: add sp, sp, a0
+; V128-NEXT: addi sp, sp, 16
; V128-NEXT: ret
;
; V512-LABEL: interleave_v32i32:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index a26a87a1f3c139..23629ace4cf48c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -611,12 +611,10 @@ define <8 x i8> @concat_4xi8_start_undef(<8 x i8> %v, <8 x i8> %w) {
define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: concat_4xi8_start_undef_at_start:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT: vid.v v10
; CHECK-NEXT: li a0, 224
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vadd.vi v10, v10, -4
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT: vslideup.vi v8, v9, 4, v0.t
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 9, i32 10, i32 11>
ret <8 x i8> %res
@@ -625,12 +623,10 @@ define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) {
define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: merge_start_into_end_non_contiguous:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT: vid.v v10
; CHECK-NEXT: li a0, 144
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vadd.vi v10, v10, -4
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT: vslideup.vi v8, v9, 4, v0.t
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 11>
ret <8 x i8> %res
@@ -670,12 +666,11 @@ define <8 x i8> @merge_start_into_start(<8 x i8> %v, <8 x i8> %w) {
define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: merge_slidedown:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT: vslidedown.vi v8, v8, 1
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: li a0, 195
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vid.v v10
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT: vslidedown.vi v8, v8, 1
+; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 14, i32 15>
ret <8 x i8> %res
@@ -686,12 +681,10 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w
; CHECK-LABEL: merge_non_contiguous_slideup_slidedown:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT: vid.v v10
-; CHECK-NEXT: vadd.vi v10, v10, -1
; CHECK-NEXT: li a0, 234
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 10, i32 6, i32 12, i32 13, i32 14>
ret <8 x i8> %res
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index f889041647b235..e27ff0a573d5f2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -159,69 +159,57 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 58
+; RV32-NEXT: li a3, 56
; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x3a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 58 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
; RV32-NEXT: addi a3, a1, 256
; RV32-NEXT: li a2, 32
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vle32.v v8, (a3)
+; RV32-NEXT: vle32.v v16, (a3)
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 25
-; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: addi a3, a1, 128
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vslideup.vi v16, v8, 4
+; RV32-NEXT: vslideup.vi v8, v16, 4
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 12
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs4r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT: vid.v v20
-; RV32-NEXT: vadd.vi v4, v20, -10
-; RV32-NEXT: vmv.v.v v2, v20
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a5, a4, 4
-; RV32-NEXT: add a4, a5, a4
+; RV32-NEXT: slli a4, a4, 4
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs2r.v v20, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill
; RV32-NEXT: lui a4, 12
; RV32-NEXT: vmv.s.x v1, a4
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 16
+; RV32-NEXT: vslidedown.vi v16, v16, 16
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a5, a4, 5
-; RV32-NEXT: add a4, a5, a4
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; RV32-NEXT: vmv1r.v v0, v1
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: slli a4, a4, 2
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs1r.v v1, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vrgatherei16.vv v16, v8, v4, v0.t
+; RV32-NEXT: vslideup.vi v8, v16, 10, v0.t
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 21
+; RV32-NEXT: li a5, 20
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs4r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill
; RV32-NEXT: lui a4, %hi(.LCPI6_0)
; RV32-NEXT: addi a4, a4, %lo(.LCPI6_0)
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu
; RV32-NEXT: vle16.v v8, (a4)
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 2
+; RV32-NEXT: slli a4, a4, 3
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill
@@ -233,14 +221,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vle32.v v16, (a1)
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a4, 49
+; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a1, a1, a4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vle32.v v24, (a3)
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 41
+; RV32-NEXT: li a3, 48
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
@@ -248,12 +236,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a1, a5, -64
; RV32-NEXT: vmv.s.x v0, a1
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: li a3, 12
+; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload
@@ -263,366 +252,330 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 21
+; RV32-NEXT: li a3, 20
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vmv.v.v v12, v8
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 21
+; RV32-NEXT: li a3, 20
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 25
-; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vslideup.vi v...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/79197