[llvm] [RISCV] Always combine scalarized shufflevectors back to vector_shuffle (PR #88147)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 9 09:17:43 PDT 2024
https://github.com/lukel97 created https://github.com/llvm/llvm-project/pull/88147
There is a loop in 526.blender_r that is vectorized with a fixed length VF and has a shufflevector emitted like this:
shufflevector <16 x i1> %0, <16 x i1> poison, <8 x i32> <...>
Because the mask is a different length from the source vectors, SelectionDAGBuilder emits this as a series of extract_vector_elts and build_vectors (since vector_shuffle requires the mask to be the same length as the source vectors).
99% of the time DAGCombiner will recombine this back into a vector_shuffle with the appropriate extract_subvector on the single source vector.
But that combine is conditional on isExtractSubvectorCheap returning true, and whenever it returns false the shuffle is left scalarized. The resulting code is an expensive sequence of vslidedowns and vector->scalar->vector domain crossings. This is what happens in the case above, since it's an i1 vector.
We've had to tweak isExtractSubvectorCheap before to prevent similar issues arising from this (8d7e73effe860 and b8545e1ece271), but there are other corner cases where we can't truthfully claim extract_subvector is cheap, e.g. for fixed vectors of length >= 32.
The shufflevector cost model also doesn't account for these cases where the combine fails.
This patch adds a hook so that we always perform the combine regardless of isExtractSubvectorCheap, since failing to perform the combine on RISC-V is more or less equivalent to N * extract_subvectors anyway. This is less relevant for other targets where extract_vector_elt is cheaper.
The tests were added in a separate commit in this PR so the diff should be viewable.
>From 5e380921e26a92ae64288941a356292df12a5192 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 9 Apr 2024 16:37:48 +0800
Subject: [PATCH 1/2] Add tests for shufflevectors that have a different length
from their source vectors
---
.../fixed-vectors-shuffle-different-length.ll | 316 ++++++++++++++++++
1 file changed, 316 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-different-length.ll
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-different-length.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-different-length.ll
new file mode 100644
index 00000000000000..3b17476ed58a1f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-different-length.ll
@@ -0,0 +1,316 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,RV32 %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,RV64 %s
+
+
+define <8 x i1> @v8i1_v16i1(<16 x i1>) {
+; RV32-LABEL: v8i1_v16i1:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.x.s a0, v0
+; RV32-NEXT: slli a1, a0, 19
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: slli a2, a0, 26
+; RV32-NEXT: srli a2, a2, 31
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: vslide1down.vx v8, v8, a1
+; RV32-NEXT: slli a1, a0, 24
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: vslide1down.vx v8, v8, a1
+; RV32-NEXT: slli a1, a0, 29
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: vslide1down.vx v8, v8, a1
+; RV32-NEXT: slli a1, a0, 18
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: slli a2, a0, 16
+; RV32-NEXT: srli a2, a2, 31
+; RV32-NEXT: vmv.v.x v9, a2
+; RV32-NEXT: vslide1down.vx v9, v9, a1
+; RV32-NEXT: slli a1, a0, 27
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: vslide1down.vx v9, v9, a1
+; RV32-NEXT: slli a0, a0, 28
+; RV32-NEXT: srli a0, a0, 31
+; RV32-NEXT: vmv.v.i v0, 15
+; RV32-NEXT: vslide1down.vx v9, v9, a0
+; RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t
+; RV32-NEXT: vand.vi v8, v9, 1
+; RV32-NEXT: vmsne.vi v0, v8, 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v8i1_v16i1:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.x.s a0, v0
+; RV64-NEXT: slli a1, a0, 51
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: slli a2, a0, 58
+; RV64-NEXT: srli a2, a2, 63
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; RV64-NEXT: vmv.v.x v8, a2
+; RV64-NEXT: vslide1down.vx v8, v8, a1
+; RV64-NEXT: slli a1, a0, 56
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: vslide1down.vx v8, v8, a1
+; RV64-NEXT: slli a1, a0, 61
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: vslide1down.vx v8, v8, a1
+; RV64-NEXT: slli a1, a0, 50
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: slli a2, a0, 48
+; RV64-NEXT: srli a2, a2, 63
+; RV64-NEXT: vmv.v.x v9, a2
+; RV64-NEXT: vslide1down.vx v9, v9, a1
+; RV64-NEXT: slli a1, a0, 59
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: vslide1down.vx v9, v9, a1
+; RV64-NEXT: slli a0, a0, 60
+; RV64-NEXT: srli a0, a0, 63
+; RV64-NEXT: vmv.v.i v0, 15
+; RV64-NEXT: vslide1down.vx v9, v9, a0
+; RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t
+; RV64-NEXT: vand.vi v8, v9, 1
+; RV64-NEXT: vmsne.vi v0, v8, 0
+; RV64-NEXT: ret
+ %2 = shufflevector <16 x i1> %0, <16 x i1> poison, <8 x i32> <i32 5, i32 12, i32 7, i32 2, i32 15, i32 13, i32 4, i32 3>
+ ret <8 x i1> %2
+}
+
+define <4 x i32> @v4i32_v8i32(<8 x i32>) {
+; CHECK-LABEL: v4i32_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vid.v v10
+; CHECK-NEXT: vsrl.vi v10, v10, 1
+; CHECK-NEXT: vrsub.vi v11, v10, 3
+; CHECK-NEXT: vrgather.vv v10, v8, v11
+; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 4
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 5
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT: vslidedown.vi v10, v8, 1, v0.t
+; CHECK-NEXT: vmv.v.v v8, v10
+; CHECK-NEXT: ret
+ %2 = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> <i32 5, i32 3, i32 7, i32 2>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @v4i32_v16i32(<16 x i32>) {
+; RV32-LABEL: v4i32_v16i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vmv.v.i v12, 1
+; RV32-NEXT: vmv.v.i v14, 6
+; RV32-NEXT: vsetivli zero, 2, e16, m1, tu, ma
+; RV32-NEXT: vslideup.vi v14, v12, 1
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vid.v v12
+; RV32-NEXT: vadd.vv v12, v12, v12
+; RV32-NEXT: vadd.vi v15, v12, 1
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT: vrgatherei16.vv v12, v8, v15
+; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT: vmv.v.i v0, 10
+; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 8
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV32-NEXT: vrgatherei16.vv v12, v8, v14, v0.t
+; RV32-NEXT: vmv1r.v v8, v12
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v4i32_v16i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64-NEXT: vid.v v12
+; RV64-NEXT: vadd.vv v12, v12, v12
+; RV64-NEXT: vadd.vi v14, v12, 1
+; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV64-NEXT: vrgatherei16.vv v12, v8, v14
+; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT: vmv.v.i v0, 10
+; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 8
+; RV64-NEXT: li a0, 3
+; RV64-NEXT: slli a0, a0, 33
+; RV64-NEXT: addi a0, a0, 1
+; RV64-NEXT: slli a0, a0, 16
+; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v10, a0
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV64-NEXT: vrgatherei16.vv v12, v8, v10, v0.t
+; RV64-NEXT: vmv1r.v v8, v12
+; RV64-NEXT: ret
+ %2 = shufflevector <16 x i32> %0, <16 x i32> poison, <4 x i32> <i32 1, i32 9, i32 5, i32 14>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @v4i32_v32i32(<32 x i32>) {
+; RV32-LABEL: v4i32_v32i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -256
+; RV32-NEXT: .cfi_def_cfa_offset 256
+; RV32-NEXT: sw ra, 252(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: addi s0, sp, 256
+; RV32-NEXT: .cfi_def_cfa s0, 0
+; RV32-NEXT: andi sp, sp, -128
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vse32.v v8, (a1)
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 1
+; RV32-NEXT: vmv.x.s a0, v10
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: lw a1, 36(sp)
+; RV32-NEXT: vmv.v.x v10, a0
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 4
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: lw a1, 120(sp)
+; RV32-NEXT: vslide1down.vx v8, v10, a0
+; RV32-NEXT: vslide1down.vx v8, v8, a1
+; RV32-NEXT: addi sp, s0, -256
+; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 248(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 256
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v4i32_v32i32:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -256
+; RV64-NEXT: .cfi_def_cfa_offset 256
+; RV64-NEXT: sd ra, 248(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: addi s0, sp, 256
+; RV64-NEXT: .cfi_def_cfa s0, 0
+; RV64-NEXT: andi sp, sp, -128
+; RV64-NEXT: li a0, 32
+; RV64-NEXT: mv a1, sp
+; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV64-NEXT: vse32.v v8, (a1)
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 1
+; RV64-NEXT: vmv.x.s a0, v10
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: lw a1, 36(sp)
+; RV64-NEXT: vmv.v.x v10, a0
+; RV64-NEXT: vslide1down.vx v10, v10, a1
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 4
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: lw a1, 120(sp)
+; RV64-NEXT: vslide1down.vx v8, v10, a0
+; RV64-NEXT: vslide1down.vx v8, v8, a1
+; RV64-NEXT: addi sp, s0, -256
+; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 256
+; RV64-NEXT: ret
+ %2 = shufflevector <32 x i32> %0, <32 x i32> poison, <4 x i32> <i32 1, i32 9, i32 4, i32 30>
+ ret <4 x i32> %2
+}
+
+define <16 x i1> @v16i1_v8i1(<8 x i1>) {
+; CHECK-LABEL: v16i1_v8i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: lui a0, %hi(.LCPI4_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_0)
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vmv.v.i v9, 0
+; CHECK-NEXT: vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT: vrgather.vv v10, v9, v8
+; CHECK-NEXT: vmsne.vi v0, v10, 0
+; CHECK-NEXT: ret
+ %2 = shufflevector <8 x i1> %0, <8 x i1> poison, <16 x i32> <i32 2, i32 3, i32 0, i32 5, i32 1, i32 2, i32 0, i32 6, i32 2, i32 3, i32 0, i32 7, i32 1, i32 2, i32 0, i32 4>
+ ret <16 x i1> %2
+}
+
+define <8 x i32> @v8i32_v4i32(<4 x i32>) {
+; CHECK-LABEL: v8i32_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI5_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI5_0)
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vle16.v v12, (a0)
+; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
+; CHECK-NEXT: vmv.v.v v8, v10
+; CHECK-NEXT: ret
+ %2 = shufflevector <4 x i32> %0, <4 x i32> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 1, i32 2, i32 0, i32 3>
+ ret <8 x i32> %2
+}
+
+define <16 x i32> @v16i32_v4i32(<4 x i32>) {
+; CHECK-LABEL: v16i32_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, 2
+; CHECK-NEXT: addi a1, a0, 265
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v9, 3
+; CHECK-NEXT: vmerge.vim v9, v9, 2, v0
+; CHECK-NEXT: lui a1, 4
+; CHECK-NEXT: addi a1, a1, 548
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmerge.vim v9, v9, 0, v0
+; CHECK-NEXT: addi a0, a0, -1856
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsext.vf2 v16, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
+; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: ret
+ %2 = shufflevector <4 x i32> %0, <4 x i32> poison, <16 x i32> <i32 2, i32 3, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 2, i32 0, i32 3, i32 1, i32 1, i32 2, i32 0, i32 3>
+ ret <16 x i32> %2
+}
+
+define <32 x i32> @v32i32_v4i32(<4 x i32>) {
+; CHECK-LABEL: v32i32_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 3
+; CHECK-NEXT: lui a0, 135432
+; CHECK-NEXT: addi a0, a0, 1161
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v10, v10, 2, v0
+; CHECK-NEXT: lui a0, 270865
+; CHECK-NEXT: addi a0, a0, 548
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v10, v10, 0, v0
+; CHECK-NEXT: lui a0, 100550
+; CHECK-NEXT: addi a0, a0, 64
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v10, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsext.vf2 v24, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v16, v8, v24
+; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: ret
+ %2 = shufflevector <4 x i32> %0, <4 x i32> poison, <32 x i32> <i32 2, i32 3, i32 0, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 2, i32 0, i32 3, i32 1, i32 1, i32 2, i32 0, i32 3, i32 1, i32 2, i32 0, i32 3, i32 1, i32 1, i32 2, i32 0, i32 3>
+ ret <32 x i32> %2
+}
>From 346a0df0154836751bcbd23596ce92e1a0ecfebb Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 9 Apr 2024 16:56:15 +0800
Subject: [PATCH 2/2] [RISCV] Always combine scalarized shufflevectors back to
vector_shuffle
There is a loop in 526.blender_r that is vectorized with a fixed length VF and has a shufflevector emitted like this:
shufflevector <16 x i1> %0, <16 x i1> poison, <8 x i32> <...>
Because the mask is a different length from the source vectors, SelectionDAGBuilder emits this as a series of extract_vector_elts and build_vectors (since vector_shuffle requires the mask to be the same length as the source vectors).
99% of the time DAGCombiner will recombine this back into a vector_shuffle with the appropriate extract_subvector on the single source vector.
But that combine is conditional on isExtractSubvectorCheap returning true, and whenever it returns false the shuffle is left scalarized. The resulting code is an expensive sequence of vslidedowns and vector->scalar->vector domain crossings. This is what happens in the case above, since it's an i1 vector.
We've had to tweak isExtractSubvectorCheap before to prevent similar issues arising from this (8d7e73effe860 and b8545e1ece271), but there are other corner cases where we can't truthfully claim extract_subvector is cheap, e.g. for fixed vectors of length >= 32.
The shufflevector cost model also doesn't account for these cases where the combine fails.
This patch adds a hook so that we always perform the combine regardless of isExtractSubvectorCheap, since failing to perform the combine on RISC-V is more or less equivalent to N * extract_subvectors anyway. This is less relevant for other targets where extract_vector_elt is cheaper.
The tests were added in a separate commit in this PR so the diff should be viewable.
---
llvm/include/llvm/CodeGen/TargetLowering.h | 7 +
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +-
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 +
llvm/lib/Target/RISCV/RISCVISelLowering.h | 1 +
.../rvv/fixed-vectors-interleaved-access.ll | 60 ++----
.../fixed-vectors-shuffle-different-length.ll | 179 +++++-------------
6 files changed, 81 insertions(+), 178 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a4dc097446186a..2d2ad0368beb75 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3344,6 +3344,13 @@ class TargetLoweringBase {
return false;
}
+ // Return true if a BUILD_VECTOR of EXTRACT_VECTOR_ELTs should always be
+ // combined into a VECTOR_SHUFFLE when possible regardless of
+ // isExtractSubvectorCheap.
+ virtual bool aggressivelyPreferVectorShuffle(EVT VecVT) const {
+ return false;
+ }
+
// Return true if CodeGenPrepare should consider splitting large offset of a
// GEP to make the GEP fit into the addressing mode and can be sunk into the
// same blocks of its users.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8fe074666a3dc9..731f4513abf2a1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22790,7 +22790,8 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
VecIn2 = SDValue();
} else if (InVT1Size == VTSize * 2) {
- if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems))
+ if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems) &&
+ !TLI.aggressivelyPreferVectorShuffle(VT))
return SDValue();
if (!VecIn2.getNode()) {
@@ -22830,7 +22831,8 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
ConcatOps[0] = VecIn2;
VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
} else if (InVT1Size / VTSize > 1 && InVT1Size % VTSize == 0) {
- if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems) ||
+ if ((!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems) &&
+ !TLI.aggressivelyPreferVectorShuffle(VT)) ||
!TLI.isTypeLegal(InVT1) || !TLI.isTypeLegal(InVT2))
return SDValue();
// If dest vector has less than two elements, then use shuffle and extract
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 80cc41b458ca81..93c7ba0c39d107 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2242,6 +2242,12 @@ bool RISCVTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
return Index == 0 || Index == ResElts;
}
+// Prefer a wider vector_shuffle over (build_vector extract_vector_elt, ...)
+// since extract_vector_elts are expensive
+bool RISCVTargetLowering::aggressivelyPreferVectorShuffle(EVT VT) const {
+ return true;
+}
+
MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index c28552354bf422..dd0998b65ff1b4 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -506,6 +506,7 @@ class RISCVTargetLowering : public TargetLowering {
bool ForCodeSize) const override;
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const override;
+ bool aggressivelyPreferVectorShuffle(EVT VT) const override;
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index f98cb343a2ab42..3adc8e036d6955 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -8,51 +8,21 @@
; FIXME: This should be widened to a vlseg2 of <4 x i32> with VL set to 3
define {<3 x i32>, <3 x i32>} @load_factor2_v3(ptr %ptr) {
-; RV32-LABEL: load_factor2_v3:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; RV32-NEXT: vle32.v v10, (a0)
-; RV32-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v10, 2
-; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT: vwaddu.vv v8, v10, v9
-; RV32-NEXT: li a0, -1
-; RV32-NEXT: vwmaccu.vx v8, a0, v9
-; RV32-NEXT: vmv.v.i v0, 4
-; RV32-NEXT: vsetivli zero, 4, e32, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v10, 4
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; RV32-NEXT: vrgather.vi v8, v12, 0, v0.t
-; RV32-NEXT: vid.v v9
-; RV32-NEXT: vadd.vv v9, v9, v9
-; RV32-NEXT: vadd.vi v11, v9, 1
-; RV32-NEXT: vrgather.vv v9, v10, v11
-; RV32-NEXT: vrgather.vi v9, v12, 1, v0.t
-; RV32-NEXT: ret
-;
-; RV64-LABEL: load_factor2_v3:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; RV64-NEXT: vle32.v v10, (a0)
-; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vid.v v8
-; RV64-NEXT: vadd.vv v8, v8, v8
-; RV64-NEXT: vadd.vi v8, v8, 1
-; RV64-NEXT: vrgather.vv v9, v10, v8
-; RV64-NEXT: vmv.v.i v0, 4
-; RV64-NEXT: vsetivli zero, 4, e32, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v10, 4
-; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; RV64-NEXT: vrgather.vi v9, v12, 1, v0.t
-; RV64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v11, v10, 2
-; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; RV64-NEXT: vwaddu.vv v8, v10, v11
-; RV64-NEXT: li a0, -1
-; RV64-NEXT: vwmaccu.vx v8, a0, v11
-; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; RV64-NEXT: vrgather.vi v8, v12, 0, v0.t
-; RV64-NEXT: ret
+; CHECK-LABEL: load_factor2_v3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; CHECK-NEXT: vle32.v v10, (a0)
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: vadd.vv v12, v8, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v8, v10, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vadd.vi v9, v12, 1
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v12, v10, v9
+; CHECK-NEXT: vmv1r.v v9, v12
+; CHECK-NEXT: ret
%interleaved.vec = load <6 x i32>, ptr %ptr
%v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> <i32 0, i32 2, i32 4>
%v1 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> <i32 1, i32 3, i32 5>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-different-length.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-different-length.ll
index 3b17476ed58a1f..7c83eb57f5777c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-different-length.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-different-length.ll
@@ -4,75 +4,28 @@
define <8 x i1> @v8i1_v16i1(<16 x i1>) {
-; RV32-LABEL: v8i1_v16i1:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vmv.x.s a0, v0
-; RV32-NEXT: slli a1, a0, 19
-; RV32-NEXT: srli a1, a1, 31
-; RV32-NEXT: slli a2, a0, 26
-; RV32-NEXT: srli a2, a2, 31
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: slli a1, a0, 24
-; RV32-NEXT: srli a1, a1, 31
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: slli a1, a0, 29
-; RV32-NEXT: srli a1, a1, 31
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: slli a1, a0, 18
-; RV32-NEXT: srli a1, a1, 31
-; RV32-NEXT: slli a2, a0, 16
-; RV32-NEXT: srli a2, a2, 31
-; RV32-NEXT: vmv.v.x v9, a2
-; RV32-NEXT: vslide1down.vx v9, v9, a1
-; RV32-NEXT: slli a1, a0, 27
-; RV32-NEXT: srli a1, a1, 31
-; RV32-NEXT: vslide1down.vx v9, v9, a1
-; RV32-NEXT: slli a0, a0, 28
-; RV32-NEXT: srli a0, a0, 31
-; RV32-NEXT: vmv.v.i v0, 15
-; RV32-NEXT: vslide1down.vx v9, v9, a0
-; RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t
-; RV32-NEXT: vand.vi v8, v9, 1
-; RV32-NEXT: vmsne.vi v0, v8, 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: v8i1_v16i1:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vmv.x.s a0, v0
-; RV64-NEXT: slli a1, a0, 51
-; RV64-NEXT: srli a1, a1, 63
-; RV64-NEXT: slli a2, a0, 58
-; RV64-NEXT: srli a2, a2, 63
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64-NEXT: vmv.v.x v8, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: slli a1, a0, 56
-; RV64-NEXT: srli a1, a1, 63
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: slli a1, a0, 61
-; RV64-NEXT: srli a1, a1, 63
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: slli a1, a0, 50
-; RV64-NEXT: srli a1, a1, 63
-; RV64-NEXT: slli a2, a0, 48
-; RV64-NEXT: srli a2, a2, 63
-; RV64-NEXT: vmv.v.x v9, a2
-; RV64-NEXT: vslide1down.vx v9, v9, a1
-; RV64-NEXT: slli a1, a0, 59
-; RV64-NEXT: srli a1, a1, 63
-; RV64-NEXT: vslide1down.vx v9, v9, a1
-; RV64-NEXT: slli a0, a0, 60
-; RV64-NEXT: srli a0, a0, 63
-; RV64-NEXT: vmv.v.i v0, 15
-; RV64-NEXT: vslide1down.vx v9, v9, a0
-; RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t
-; RV64-NEXT: vand.vi v8, v9, 1
-; RV64-NEXT: vmsne.vi v0, v8, 0
-; RV64-NEXT: ret
+; CHECK-LABEL: v8i1_v16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: lui a0, %hi(.LCPI0_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI0_0)
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vmv.v.i v9, 0
+; CHECK-NEXT: vmerge.vim v10, v9, 1, v0
+; CHECK-NEXT: vrgather.vv v11, v10, v8
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vslidedown.vi v0, v0, 1
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT: lui a0, %hi(.LCPI0_1)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI0_1)
+; CHECK-NEXT: vle8.v v10, (a0)
+; CHECK-NEXT: li a0, 50
+; CHECK-NEXT: vmv.s.x v8, a0
+; CHECK-NEXT: vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vrgather.vv v11, v9, v10, v0.t
+; CHECK-NEXT: vmsne.vi v0, v11, 0
+; CHECK-NEXT: ret
%2 = shufflevector <16 x i1> %0, <16 x i1> poison, <8 x i32> <i32 5, i32 12, i32 7, i32 2, i32 15, i32 13, i32 4, i32 3>
ret <8 x i1> %2
}
@@ -149,72 +102,36 @@ define <4 x i32> @v4i32_v16i32(<16 x i32>) {
define <4 x i32> @v4i32_v32i32(<32 x i32>) {
; RV32-LABEL: v4i32_v32i32:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -256
-; RV32-NEXT: .cfi_def_cfa_offset 256
-; RV32-NEXT: sw ra, 252(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 256
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -128
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV32-NEXT: vse32.v v8, (a1)
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 1
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: lw a1, 36(sp)
-; RV32-NEXT: vmv.v.x v10, a0
-; RV32-NEXT: vslide1down.vx v10, v10, a1
-; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 4
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: lw a1, 120(sp)
-; RV32-NEXT: vslide1down.vx v8, v10, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: addi sp, s0, -256
-; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 248(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 256
+; RV32-NEXT: lui a0, %hi(.LCPI3_0)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI3_0)
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vle16.v v24, (a0)
+; RV32-NEXT: vmv8r.v v16, v8
+; RV32-NEXT: vrgatherei16.vv v8, v16, v24
+; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT: vmv.v.i v0, 8
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v16, 16
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; RV32-NEXT: vrgather.vi v8, v16, 14, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: v4i32_v32i32:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -256
-; RV64-NEXT: .cfi_def_cfa_offset 256
-; RV64-NEXT: sd ra, 248(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 256
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -128
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: mv a1, sp
-; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV64-NEXT: vse32.v v8, (a1)
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 1
-; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: lw a1, 36(sp)
-; RV64-NEXT: vmv.v.x v10, a0
-; RV64-NEXT: vslide1down.vx v10, v10, a1
-; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 4
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: lw a1, 120(sp)
-; RV64-NEXT: vslide1down.vx v8, v10, a0
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: addi sp, s0, -256
-; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 256
+; RV64-NEXT: vmv8r.v v16, v8
+; RV64-NEXT: lui a0, 262153
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: addi a0, a0, 1
+; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v12, a0
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vrgatherei16.vv v8, v16, v12
+; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT: vmv.v.i v0, 8
+; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; RV64-NEXT: vslidedown.vi v16, v16, 16
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; RV64-NEXT: vrgather.vi v8, v16, 14, v0.t
; RV64-NEXT: ret
%2 = shufflevector <32 x i32> %0, <32 x i32> poison, <4 x i32> <i32 1, i32 9, i32 4, i32 30>
ret <4 x i32> %2
More information about the llvm-commits
mailing list