[llvm] [RISCV] Consider all subvector extracts within a single VREG cheap (PR #81032)
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 7 11:51:25 PST 2024
https://github.com/preames created https://github.com/llvm/llvm-project/pull/81032
This adjusts the isExtractSubvectorCheap callback to consider any extract which fits entirely within the first VLEN bits of the src vector (and uses a 5 bit immediate for the slide) as cheap. These can be done via a single m1 vslidedown.vi instruction.
This allows our generic DAG combine logic to kick in and recognize a few more cases where the shuffle source is longer than the dest, but where using a wider shuffle is still profitable. (Or, as shown in the test diff, we can split the wider source and do two narrower shuffles.)
From 4e6ff02c3d1327fda5406079aa84f0305d78ef16 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Wed, 7 Feb 2024 11:40:21 -0800
Subject: [PATCH] [RISCV] Consider all subvector extracts within a single VREG
cheap
This adjusts the isExtractSubvectorCheap callback to consider any
extract which fits entirely within the first VLEN bits of the src
vector (and uses a 5 bit immediate for the slide) as cheap. These
can be done via a single m1 vslidedown.vi instruction.
This allows our generic DAG combine logic to kick in and recognize
a few more cases where the shuffle source is longer than the dest,
but where using a wider shuffle is still profitable. (Or, as shown
in the test diff, we can split the wider source and do two narrower
shuffles.)
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 28 ++++-
.../RISCV/rvv/fixed-vectors-int-shuffles.ll | 110 +++---------------
2 files changed, 41 insertions(+), 97 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 27037f4d5c5c85..1f0e7fd8ada345 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2173,19 +2173,35 @@ bool RISCVTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
if (ResVT.isScalableVector() || SrcVT.isScalableVector())
return false;
+ EVT EltVT = ResVT.getVectorElementType();
+ if (EltVT != SrcVT.getVectorElementType())
+ return false;
+
+ // The smallest type we can slide is i8.
+ // TODO: We can extract index 0 from a mask vector without a slide.
+ if (EltVT == MVT::i1)
+ return false;
+
unsigned ResElts = ResVT.getVectorNumElements();
unsigned SrcElts = SrcVT.getVectorNumElements();
+ unsigned MinVLen = Subtarget.getRealMinVLen();
+ unsigned MinVLMAX = MinVLen / EltVT.getSizeInBits();
+
+ // If we're extracting only data from the first VLEN bits of the source
+ // then we can always do this with an m1 vslidedown.vx. Restricting the
+ // Index ensures we can use a vslidedown.vi.
+ // TODO: We can generalize this when the exact VLEN is known.
+ if (Index + ResElts <= MinVLMAX && Index < 31)
+ return true;
+
// Convervatively only handle extracting half of a vector.
- // TODO: Relax this.
+ // TODO: For sizes which aren't multiples of VLEN sizes, this may not be
+ // a cheap extract. However, this case is important in practice for
+ // shuffled extracts of longer vectors. How should this be resolved?
if ((ResElts * 2) != SrcElts)
return false;
- // The smallest type we can slide is i8.
- // TODO: We can extract index 0 from a mask vector without a slide.
- if (ResVT.getVectorElementType() == MVT::i1)
- return false;
-
// Slide can support arbitrary index, but we only treat vslidedown.vi as
// cheap.
if (Index >= 32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index acad71bb59590b..0e8d9cf0306690 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -722,97 +722,25 @@ define <8 x i32> @shuffle_v8i32_2(<8 x i32> %x, <8 x i32> %y) {
; FIXME: This could be expressed as a vrgather.vv
define <8 x i8> @shuffle_v64i8_v8i8(<64 x i8> %wide.vec) {
-; RV32-LABEL: shuffle_v64i8_v8i8:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -128
-; RV32-NEXT: .cfi_def_cfa_offset 128
-; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 128
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: li a0, 64
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; RV32-NEXT: vse8.v v8, (a1)
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 8
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: vmv.x.s a1, v8
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 16
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 24
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT: vslide1down.vx v8, v10, a0
-; RV32-NEXT: lbu a0, 32(sp)
-; RV32-NEXT: lbu a1, 40(sp)
-; RV32-NEXT: lbu a2, 48(sp)
-; RV32-NEXT: lbu a3, 56(sp)
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: vslide1down.vx v8, v8, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a3
-; RV32-NEXT: addi sp, s0, -128
-; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 128
-; RV32-NEXT: ret
-;
-; RV64-LABEL: shuffle_v64i8_v8i8:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -128
-; RV64-NEXT: .cfi_def_cfa_offset 128
-; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 128
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: li a0, 64
-; RV64-NEXT: mv a1, sp
-; RV64-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; RV64-NEXT: vse8.v v8, (a1)
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 8
-; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: vmv.x.s a1, v8
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT: vmv.v.x v10, a1
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 16
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 24
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT: vslide1down.vx v8, v10, a0
-; RV64-NEXT: lbu a0, 32(sp)
-; RV64-NEXT: lbu a1, 40(sp)
-; RV64-NEXT: lbu a2, 48(sp)
-; RV64-NEXT: lbu a3, 56(sp)
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: vslide1down.vx v8, v8, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a3
-; RV64-NEXT: addi sp, s0, -128
-; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 128
-; RV64-NEXT: ret
+; CHECK-LABEL: shuffle_v64i8_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vid.v v12
+; CHECK-NEXT: vsll.vi v14, v12, 3
+; CHECK-NEXT: vrgather.vv v12, v8, v14
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: li a1, 240
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: lui a1, 98561
+; CHECK-NEXT: addi a1, a1, -2048
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
+; CHECK-NEXT: vrgather.vv v12, v8, v10, v0.t
+; CHECK-NEXT: vmv1r.v v8, v12
+; CHECK-NEXT: ret
%s = shufflevector <64 x i8> %wide.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
ret <8 x i8> %s
}
More information about the llvm-commits
mailing list