[llvm] [RISCV] Improve lowering of spread(2) shuffles (PR #118658)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 4 07:46:56 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-risc-v
Author: Philip Reames (preames)
Changes:
A spread(2) shuffle is just an interleave with an undef lane. The existing lowering reused the even lane for the undef value; this was entirely legal, but suboptimal.
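For reference, the single-source spread(2) case this patch improves is exercised directly by the tests below; a minimal standalone example (function name illustrative):

```llvm
define <8 x i32> @spread2_singlesrc(<8 x i32> %v) {
  ; spread(2): source element i lands in even lane 2*i; every odd lane is undef.
  %out = shufflevector <8 x i32> %v, <8 x i32> poison,
         <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef>
  ret <8 x i32> %out
}
```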
---
Full diff: https://github.com/llvm/llvm-project/pull/118658.diff
4 Files Affected:
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+28-9)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll (+15-21)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll (+29-41)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll (+7-10)
``````````diff
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4544a922def1a3..1efb9e7d52dbd2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5331,17 +5331,36 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
// Extract the halves of the vectors.
MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ // Recognize if one half is actually undef; the matching above will
+ // otherwise reuse the even stream for the undef one. This improves
+ // spread(2) shuffles.
+ bool EvenIsUndef = true, OddIsUndef = true;
+ for (unsigned i = 0; i < Mask.size(); i++) {
+ if (i % 2 == 0)
+ EvenIsUndef &= (Mask[i] == -1);
+ else
+ OddIsUndef &= (Mask[i] == -1);
+ }
+
int Size = Mask.size();
SDValue EvenV, OddV;
- assert(EvenSrc >= 0 && "Undef source?");
- EvenV = (EvenSrc / Size) == 0 ? V1 : V2;
- EvenV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, EvenV,
- DAG.getVectorIdxConstant(EvenSrc % Size, DL));
-
- assert(OddSrc >= 0 && "Undef source?");
- OddV = (OddSrc / Size) == 0 ? V1 : V2;
- OddV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, OddV,
- DAG.getVectorIdxConstant(OddSrc % Size, DL));
+ if (EvenIsUndef) {
+ EvenV = DAG.getUNDEF(HalfVT);
+ } else {
+ assert(EvenSrc >= 0 && "Undef source?");
+ EvenV = (EvenSrc / Size) == 0 ? V1 : V2;
+ EvenV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, EvenV,
+ DAG.getVectorIdxConstant(EvenSrc % Size, DL));
+ }
+
+ if (OddIsUndef) {
+ OddV = DAG.getUNDEF(HalfVT);
+ } else {
+ assert(OddSrc >= 0 && "Undef source?");
+ OddV = (OddSrc / Size) == 0 ? V1 : V2;
+ OddV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, OddV,
+ DAG.getVectorIdxConstant(OddSrc % Size, DL));
+ }
return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget);
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
index e4b8e9debad271..97e458e70565ce 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -242,33 +242,27 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
; V128-NEXT: slli a0, a0, 3
; V128-NEXT: sub sp, sp, a0
; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; V128-NEXT: vmv8r.v v24, v16
-; V128-NEXT: vmv8r.v v16, v8
-; V128-NEXT: vmv8r.v v8, v24
; V128-NEXT: addi a0, sp, 16
-; V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT: vslidedown.vi v0, v24, 16
-; V128-NEXT: li a0, -1
-; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT: vwaddu.vv v24, v8, v0
-; V128-NEXT: vwmaccu.vx v24, a0, v0
-; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT: vslidedown.vi v0, v16, 16
+; V128-NEXT: vslidedown.vi v24, v16, 16
+; V128-NEXT: li a0, 32
+; V128-NEXT: vslidedown.vi v0, v8, 16
; V128-NEXT: lui a1, 699051
-; V128-NEXT: li a2, 32
-; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT: vwaddu.vv v8, v0, v16
+; V128-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; V128-NEXT: vzext.vf2 v8, v24
+; V128-NEXT: vzext.vf2 v24, v0
; V128-NEXT: addi a1, a1, -1366
; V128-NEXT: vmv.s.x v0, a1
-; V128-NEXT: vwmaccu.vx v8, a0, v16
-; V128-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; V128-NEXT: vmerge.vvm v24, v8, v24, v0
-; V128-NEXT: addi a1, sp, 16
-; V128-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; V128-NEXT: vsll.vx v8, v8, a0
+; V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; V128-NEXT: vmerge.vvm v24, v24, v8, v0
+; V128-NEXT: addi a0, sp, 16
+; V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT: vwaddu.vv v0, v16, v8
-; V128-NEXT: vwmaccu.vx v0, a0, v8
+; V128-NEXT: vwaddu.vv v0, v8, v16
+; V128-NEXT: li a0, -1
+; V128-NEXT: vwmaccu.vx v0, a0, v16
; V128-NEXT: vmv8r.v v8, v0
; V128-NEXT: vmv8r.v v16, v24
; V128-NEXT: csrr a0, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
index 66af5718fb9dc5..a8eb1f97fd1a2c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -186,35 +186,29 @@ define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
; V128-LABEL: interleave_v4i32_offset_1:
; V128: # %bb.0:
-; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; V128-NEXT: vwaddu.vv v10, v8, v8
-; V128-NEXT: li a0, -1
; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; V128-NEXT: vid.v v11
-; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; V128-NEXT: vwmaccu.vx v10, a0, v8
-; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; V128-NEXT: vsrl.vi v8, v11, 1
+; V128-NEXT: vid.v v10
; V128-NEXT: vmv.v.i v0, 10
-; V128-NEXT: vadd.vi v8, v8, 1
-; V128-NEXT: vrgather.vv v10, v9, v8, v0.t
+; V128-NEXT: vsrl.vi v10, v10, 1
+; V128-NEXT: vadd.vi v11, v10, 1
+; V128-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; V128-NEXT: vzext.vf2 v10, v8
+; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; V128-NEXT: vrgather.vv v10, v9, v11, v0.t
; V128-NEXT: vmv.v.v v8, v10
; V128-NEXT: ret
;
; V512-LABEL: interleave_v4i32_offset_1:
; V512: # %bb.0:
-; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; V512-NEXT: vwaddu.vv v10, v8, v8
-; V512-NEXT: li a0, -1
; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma
-; V512-NEXT: vid.v v11
-; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; V512-NEXT: vwmaccu.vx v10, a0, v8
-; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu
-; V512-NEXT: vsrl.vi v8, v11, 1
+; V512-NEXT: vid.v v10
; V512-NEXT: vmv.v.i v0, 10
-; V512-NEXT: vadd.vi v8, v8, 1
-; V512-NEXT: vrgather.vv v10, v9, v8, v0.t
+; V512-NEXT: vsrl.vi v10, v10, 1
+; V512-NEXT: vadd.vi v11, v10, 1
+; V512-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; V512-NEXT: vzext.vf2 v10, v8
+; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu
+; V512-NEXT: vrgather.vv v10, v9, v11, v0.t
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 1, i32 6>
@@ -411,33 +405,27 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
; V128-NEXT: slli a0, a0, 3
; V128-NEXT: sub sp, sp, a0
; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; V128-NEXT: vmv8r.v v24, v16
-; V128-NEXT: vmv8r.v v16, v8
-; V128-NEXT: vmv8r.v v8, v24
; V128-NEXT: addi a0, sp, 16
-; V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT: vslidedown.vi v0, v24, 16
-; V128-NEXT: li a0, -1
-; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT: vwaddu.vv v24, v8, v0
-; V128-NEXT: vwmaccu.vx v24, a0, v0
+; V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT: vslidedown.vi v0, v16, 16
+; V128-NEXT: vslidedown.vi v24, v16, 16
+; V128-NEXT: li a0, 32
+; V128-NEXT: vslidedown.vi v0, v8, 16
; V128-NEXT: lui a1, 699051
-; V128-NEXT: li a2, 32
-; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT: vwaddu.vv v8, v0, v16
+; V128-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; V128-NEXT: vzext.vf2 v8, v24
+; V128-NEXT: vzext.vf2 v24, v0
; V128-NEXT: addi a1, a1, -1366
; V128-NEXT: vmv.s.x v0, a1
-; V128-NEXT: vwmaccu.vx v8, a0, v16
-; V128-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; V128-NEXT: vmerge.vvm v24, v8, v24, v0
-; V128-NEXT: addi a1, sp, 16
-; V128-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; V128-NEXT: vsll.vx v8, v8, a0
+; V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; V128-NEXT: vmerge.vvm v24, v24, v8, v0
+; V128-NEXT: addi a0, sp, 16
+; V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT: vwaddu.vv v0, v16, v8
-; V128-NEXT: vwmaccu.vx v0, a0, v8
+; V128-NEXT: vwaddu.vv v0, v8, v16
+; V128-NEXT: li a0, -1
+; V128-NEXT: vwmaccu.vx v0, a0, v16
; V128-NEXT: vmv8r.v v8, v0
; V128-NEXT: vmv8r.v v16, v24
; V128-NEXT: csrr a0, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index 5d307211ead6e6..a9ae2181333e88 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -801,11 +801,9 @@ define <8 x i32> @shuffle_compress_singlesrc_gaps_e32(<8 x i32> %v) {
define <8 x i32> @shuffle_spread2_singlesrc_e32(<8 x i32> %v) {
; CHECK-LABEL: shuffle_spread2_singlesrc_e32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vwaddu.vv v10, v8, v8
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vwmaccu.vx v10, a0, v8
-; CHECK-NEXT: vmv2r.v v8, v10
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf2 v10, v8
+; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
%out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef>
ret <8 x i32> %out
@@ -814,11 +812,10 @@ define <8 x i32> @shuffle_spread2_singlesrc_e32(<8 x i32> %v) {
define <8 x i32> @shuffle_spread2_singlesrc_e32_index1(<8 x i32> %v) {
; CHECK-LABEL: shuffle_spread2_singlesrc_e32_index1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vwaddu.vv v10, v8, v8
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vwmaccu.vx v10, a0, v8
-; CHECK-NEXT: vmv2r.v v8, v10
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf2 v10, v8
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsll.vx v8, v10, a0
; CHECK-NEXT: ret
%out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3>
ret <8 x i32> %out
``````````
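For intuition on the new lowering in the tests above (an explanatory sketch, not part of the patch; function name illustrative): `vzext.vf2` doubles each element's width, and on RVV's little-endian register layout the reinterpreted result holds each source element in an even lane with zero in the adjacent odd lane. Since those odd lanes are undef in the mask, zero is a legal value for them. In IR terms:

```llvm
define <8 x i32> @spread2_as_zext(<8 x i32> %v) {
  ; Take the low half of the source (the only elements the mask uses).
  %lo = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Zero-extend each element to 64 bits...
  %ext = zext <4 x i32> %lo to <4 x i64>
  ; ...and reinterpret at 32 bits: the little-endian layout gives
  ; <v0, 0, v1, 0, v2, 0, v3, 0>, matching the spread(2) mask.
  %cast = bitcast <4 x i64> %ext to <8 x i32>
  ret <8 x i32> %cast
}
```

The index-1 variant additionally shifts each extended element left by 32 (the `vsll.vx` in the last test above), so the source values land in the odd lanes instead.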
https://github.com/llvm/llvm-project/pull/118658