[llvm] [RISCV] Shrink vslidedown when lowering fixed extract_subvector (PR #65598)

Luke Lau via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 7 04:24:26 PDT 2023


https://github.com/lukel97 created https://github.com/llvm/llvm-project/pull/65598:

As noted in
https://github.com/llvm/llvm-project/pull/65392#discussion_r1316259471, when
lowering an extract of a fixed-length vector from another vector, we don't
need to perform the vslidedown on the full vector type. Instead, we can
operate on the smallest subregister that contains the subvector to be
extracted and perform the vslidedown with a smaller LMUL. E.g., with +Zvl128b:

v2i64 = extract_subvector nxv4i64, 2

is currently lowered as

vsetivli zero, 2, e64, m4, ta, ma
vslidedown.vi v8, v8, 2

This patch shrinks the vslidedown to LMUL=2:

vsetivli zero, 2, e64, m2, ta, ma
vslidedown.vi v8, v8, 2

This is valid because we know v8 holds at least 128*2 = 256 bits at LMUL=2,
and extracting a v2i64 at index 2 only reads the first (2+2)*64 = 256 bits
of the source vector.
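
To make that arithmetic concrete, here is a minimal standalone C++ sketch of
the same computation for the example above. The constant names mirror the
patch, but the hardcoded values are assumptions for +Zvl128b with 64-bit
ELEN (RISCV::RVVBitsPerBlock is 64 on RISC-V):

#include <algorithm>
#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
  // Worked example: v2i64 = extract_subvector nxv4i64, 2 on +Zvl128b.
  // These values are assumptions for that configuration; in the patch they
  // come from RISCV::RVVBitsPerBlock, Subtarget.getELen() and
  // Subtarget.getRealMinVLen().
  const uint64_t RVVBitsPerBlock = 64; // LLVM's RVV block size for RISC-V
  const uint64_t ELen = 64;            // full 64-bit element support
  const uint64_t RealMinVLen = 128;    // +Zvl128b
  const uint64_t OrigIdx = 2;          // extract index
  const uint64_t SubVecNumElts = 2;    // v2i64
  const uint64_t EltSizeInBits = 64;   // i64

  // Smallest legal element count (e.g. nxv1i32 is not legal on Zve32x).
  const uint64_t MinLegalNumElts = RVVBitsPerBlock / ELen;  // 1
  const uint64_t MinVscale = RealMinVLen / RVVBitsPerBlock; // 2

  // The subvector lies within the first
  // (OrigIdx + SubVecNumElts) * EltSizeInBits = 256 bits of the source.
  uint64_t MinEltsNeeded = (OrigIdx + SubVecNumElts) / MinVscale; // 2
  // Round up to a power of 2 and clamp to the smallest legal type.
  MinEltsNeeded = std::max(MinLegalNumElts, std::bit_ceil(MinEltsNeeded));

  // LMUL of the shrunk container <vscale x MinEltsNeeded x i64>.
  uint64_t LMUL = MinEltsNeeded * EltSizeInBits / RVVBitsPerBlock;
  std::printf("container: nxv%llui64, LMUL = m%llu\n",
              (unsigned long long)MinEltsNeeded, (unsigned long long)LMUL);
  return 0;
}

For the worked example this prints "container: nxv2i64, LMUL = m2", matching
the e64/m2 vsetivli emitted after this patch.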

I've split this out into a separate PR rather than including it in #65392,
in the hope that we'll be able to generalize it later.


From 75ccc38b55ff7ce94204673d4fbb51a66ca04e1b Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 7 Sep 2023 11:01:05 +0100
Subject: [PATCH] [RISCV] Shrink vslidedown when lowering fixed
 extract_subvector

As noted in
https://github.com/llvm/llvm-project/pull/65392#discussion_r1316259471, when
lowering an extract of a fixed-length vector from another vector, we don't
need to perform the vslidedown on the full vector type. Instead, we can
operate on the smallest subregister that contains the subvector to be
extracted and perform the vslidedown with a smaller LMUL. E.g., with +Zvl128b:

v2i64 = extract_subvector nxv4i64, 2

is currently lowered as

vsetivli zero, 2, e64, m4, ta, ma
vslidedown.vi v8, v8, 2

This patch shrinks the vslidedown to LMUL=2:

vsetivli zero, 2, e64, m2, ta, ma
vslidedown.vi v8, v8, 2

This is valid because we know v8 holds at least 128*2 = 256 bits at LMUL=2,
and extracting a v2i64 at index 2 only reads the first (2+2)*64 = 256 bits
of the source vector.

I've split this out into a separate PR rather than including it in #65392,
in the hope that we'll be able to generalize it later.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 33 ++++++++++++++++++
 .../rvv/fixed-vectors-extract-subvector.ll    | 34 +++++++++----------
 2 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 05e656ac817027c..e80ea0b4d98ae6d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -8684,6 +8684,39 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
       ContainerVT = getContainerForFixedLengthVector(VecVT);
       Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
     }
+
+    // The minimum number of elements for a scalable vector type, e.g. nxv1i32
+    // is not legal on Zve32x.
+    const uint64_t MinLegalNumElts =
+        RISCV::RVVBitsPerBlock / Subtarget.getELen();
+    const uint64_t MinVscale =
+        Subtarget.getRealMinVLen() / RISCV::RVVBitsPerBlock;
+
+    // Even if we don't know the exact subregister the subvector is going to
+    // reside in, we know that the subvector is located within the first N bits
+    // of Vec:
+    //
+    // N = (OrigIdx + SubVecVT.getVectorNumElements()) * EltSizeInBits
+    //   = MinVscale * MinEltsNeeded * EltSizeInBits
+    //
+    // From this we can work out the smallest type that contains everything we
+    // need to extract, <vscale x MinEltsNeeded x Elt>
+    uint64_t MinEltsNeeded =
+        (OrigIdx + SubVecVT.getVectorNumElements()) / MinVscale;
+
+    // Round up the number of elements so it's a valid power of 2 scalable
+    // vector type, and make sure it's not less than smallest legal vector type.
+    MinEltsNeeded = std::max(MinLegalNumElts, PowerOf2Ceil(MinEltsNeeded));
+
+    assert(MinEltsNeeded <= ContainerVT.getVectorMinNumElements());
+
+    // Shrink down Vec so we're performing the slidedown on the smallest
+    // possible type.
+    ContainerVT = MVT::getScalableVectorVT(ContainerVT.getVectorElementType(),
+                                           MinEltsNeeded);
+    Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ContainerVT, Vec,
+                      DAG.getVectorIdxConstant(0, DL));
+
     SDValue Mask =
         getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget).first;
     // Set the vector length to only the number of elements we care about. This
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
index aee931408cc6146..a34ed145fef57dd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
@@ -89,7 +89,7 @@ define void @extract_v2i32_v8i32_2(ptr %x, ptr %y) {
 ; LMULMAX2:       # %bb.0:
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT:    vle32.v v8, (a0)
-; LMULMAX2-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
+; LMULMAX2-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; LMULMAX2-NEXT:    vslidedown.vi v8, v8, 2
 ; LMULMAX2-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; LMULMAX2-NEXT:    vse32.v v8, (a1)
@@ -151,7 +151,7 @@ define void @extract_v2i32_nxv16i32_0(<vscale x 16 x i32> %x, ptr %y) {
 define void @extract_v2i32_nxv16i32_6(<vscale x 16 x i32> %x, ptr %y) {
 ; CHECK-LABEL: extract_v2i32_nxv16i32_6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, m8, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 6
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vse32.v v8, (a0)
@@ -188,7 +188,7 @@ define void @extract_v2i8_nxv2i8_2(<vscale x 2 x i8> %x, ptr %y) {
 define void @extract_v8i32_nxv16i32_8(<vscale x 16 x i32> %x, ptr %y) {
 ; LMULMAX2-LABEL: extract_v8i32_nxv16i32_8:
 ; LMULMAX2:       # %bb.0:
-; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m8, ta, ma
+; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
 ; LMULMAX2-NEXT:    vslidedown.vi v8, v8, 8
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT:    vse32.v v8, (a0)
@@ -196,13 +196,13 @@ define void @extract_v8i32_nxv16i32_8(<vscale x 16 x i32> %x, ptr %y) {
 ;
 ; LMULMAX1-LABEL: extract_v8i32_nxv16i32_8:
 ; LMULMAX1:       # %bb.0:
-; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m8, ta, ma
-; LMULMAX1-NEXT:    vslidedown.vi v16, v8, 8
+; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
+; LMULMAX1-NEXT:    vslidedown.vi v12, v8, 8
 ; LMULMAX1-NEXT:    vslidedown.vi v8, v8, 12
 ; LMULMAX1-NEXT:    addi a1, a0, 16
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; LMULMAX1-NEXT:    vse32.v v8, (a1)
-; LMULMAX1-NEXT:    vse32.v v16, (a0)
+; LMULMAX1-NEXT:    vse32.v v12, (a0)
 ; LMULMAX1-NEXT:    ret
   %c = call <8 x i32> @llvm.vector.extract.v8i32.nxv16i32(<vscale x 16 x i32> %x, i64 8)
   store <8 x i32> %c, ptr %y
@@ -238,7 +238,7 @@ define void @extract_v8i1_v64i1_8(ptr %x, ptr %y) {
 ; LMULMAX2-NEXT:    li a2, 32
 ; LMULMAX2-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; LMULMAX2-NEXT:    vlm.v v8, (a0)
-; LMULMAX2-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; LMULMAX2-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; LMULMAX2-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; LMULMAX2-NEXT:    vsm.v v8, (a1)
@@ -266,7 +266,7 @@ define void @extract_v8i1_v64i1_48(ptr %x, ptr %y) {
 ; LMULMAX2-NEXT:    li a2, 32
 ; LMULMAX2-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; LMULMAX2-NEXT:    vlm.v v8, (a0)
-; LMULMAX2-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; LMULMAX2-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; LMULMAX2-NEXT:    vslidedown.vi v8, v8, 2
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; LMULMAX2-NEXT:    vsm.v v8, (a1)
@@ -311,7 +311,7 @@ define void @extract_v8i1_nxv64i1_0(<vscale x 64 x i1> %x, ptr %y) {
 define void @extract_v8i1_nxv64i1_8(<vscale x 64 x i1> %x, ptr %y) {
 ; CHECK-LABEL: extract_v8i1_nxv64i1_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v0, 1
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsm.v v8, (a0)
@@ -324,7 +324,7 @@ define void @extract_v8i1_nxv64i1_8(<vscale x 64 x i1> %x, ptr %y) {
 define void @extract_v8i1_nxv64i1_48(<vscale x 64 x i1> %x, ptr %y) {
 ; CHECK-LABEL: extract_v8i1_nxv64i1_48:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v0, 6
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsm.v v8, (a0)
@@ -382,7 +382,7 @@ define void @extract_v2i1_v64i1_2(ptr %x, ptr %y) {
 ; LMULMAX2-NEXT:    vlm.v v0, (a0)
 ; LMULMAX2-NEXT:    vmv.v.i v8, 0
 ; LMULMAX2-NEXT:    vmerge.vim v8, v8, 1, v0
-; LMULMAX2-NEXT:    vsetivli zero, 2, e8, m2, ta, ma
+; LMULMAX2-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; LMULMAX2-NEXT:    vslidedown.vi v8, v8, 2
 ; LMULMAX2-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; LMULMAX2-NEXT:    vmsne.vi v0, v8, 0
@@ -403,7 +403,7 @@ define void @extract_v2i1_v64i1_2(ptr %x, ptr %y) {
 ; LMULMAX1-NEXT:    vlm.v v0, (a0)
 ; LMULMAX1-NEXT:    vmv.v.i v8, 0
 ; LMULMAX1-NEXT:    vmerge.vim v8, v8, 1, v0
-; LMULMAX1-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; LMULMAX1-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; LMULMAX1-NEXT:    vslidedown.vi v8, v8, 2
 ; LMULMAX1-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; LMULMAX1-NEXT:    vmsne.vi v0, v8, 0
@@ -432,7 +432,7 @@ define void @extract_v2i1_v64i1_42(ptr %x, ptr %y) {
 ; LMULMAX2-NEXT:    vlm.v v0, (a0)
 ; LMULMAX2-NEXT:    vmv.v.i v8, 0
 ; LMULMAX2-NEXT:    vmerge.vim v8, v8, 1, v0
-; LMULMAX2-NEXT:    vsetivli zero, 2, e8, m2, ta, ma
+; LMULMAX2-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
 ; LMULMAX2-NEXT:    vslidedown.vi v8, v8, 10
 ; LMULMAX2-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; LMULMAX2-NEXT:    vmsne.vi v0, v8, 0
@@ -543,7 +543,7 @@ define void @extract_v2i1_nxv64i1_2(<vscale x 64 x i1> %x, ptr %y) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    vsetivli zero, 2, e8, m8, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
@@ -569,7 +569,7 @@ define void @extract_v2i1_nxv64i1_42(<vscale x 64 x i1> %x, ptr %y) {
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    li a1, 42
-; CHECK-NEXT:    vsetivli zero, 2, e8, m8, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e8, m4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a1
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
@@ -594,7 +594,7 @@ define void @extract_v2i1_nxv32i1_26(<vscale x 32 x i1> %x, ptr %y) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    vsetivli zero, 2, e8, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e8, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 26
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
@@ -616,7 +616,7 @@ define void @extract_v2i1_nxv32i1_26(<vscale x 32 x i1> %x, ptr %y) {
 define void @extract_v8i1_nxv32i1_16(<vscale x 32 x i1> %x, ptr %y) {
 ; CHECK-LABEL: extract_v8i1_nxv32i1_16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v0, 2
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsm.v v8, (a0)


