[llvm] [RISCV] Prefer vsetivli for VLMAX when VLEN is exactly known (PR #75509)

Philip Reames via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 14 10:09:40 PST 2023


https://github.com/preames created https://github.com/llvm/llvm-project/pull/75509

If VLEN is exactly known, we may be able to use the vsetivli encoding instead of the vsetvli a0, zero, <vtype> encoding.  This slightly reduces register pressure, since the immediate form doesn't tie up a scratch GPR to receive the computed VL.

This builds on 632f1c5, but reverses course a bit.  It turns out to be quite complicated to canonicalize from VLMAX to an immediate early, because the VLMAX sentinel value is widely used in tablegen patterns that have no knowledge of LMUL.  Instead, we canonicalize towards the VLMAX representation, and then pick the immediate form during vsetvli insertion, where the LMUL information is available.

Within InsertVSETVLI, this could reasonably fit in a couple of places.  If reviewers want me to e.g. move it to emission, let me know.  Doing so may require a bit of extra code to e.g. handle comparisons of the two forms, but shouldn't be too complicated.
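
For illustration, here's a minimal standalone sketch (not the in-tree
code) of the selection logic this patch adds: compute VLMAX from an
exactly-known VLEN for a given SEW/LMUL pair, and prefer the vsetivli
form when VLMAX fits its 5-bit unsigned immediate (<= 31).  The
(LMul, Fractional) pair stands in for RISCVVType::decodeVLMUL's result.

// vlmax_sketch.cpp -- illustrative only; mirrors computeVLMAX from
// RISCVInsertVSETVLI.cpp, assuming VLEN is exactly known.
#include <cstdio>

static unsigned computeVLMAX(unsigned VLEN, unsigned SEW, unsigned LMul,
                             bool Fractional) {
  // VLMAX = (VLEN * LMUL) / SEW; fractional LMUL divides instead.
  if (Fractional)
    VLEN = VLEN / LMul;
  else
    VLEN = VLEN * LMul;
  return VLEN / SEW;
}

int main() {
  const unsigned VLEN = 128; // e.g. from vscale_range(2,2)
  struct { unsigned SEW, LMul; bool Frac; const char *Name; } Cases[] = {
      {8, 1, false, "e8,m1"},   // VLMAX = 16 -> vsetivli zero, 16, ...
      {8, 2, true, "e8,mf2"},   // VLMAX = 8  -> vsetivli zero, 8, ...
      {8, 2, false, "e8,m2"},   // VLMAX = 32 -> stays on the x0 form
      {32, 4, false, "e32,m4"}, // VLMAX = 16 -> vsetivli zero, 16, ...
  };
  for (const auto &C : Cases) {
    unsigned VLMAX = computeVLMAX(VLEN, C.SEW, C.LMul, C.Frac);
    printf("%-7s VLMAX=%2u -> %s\n", C.Name, VLMAX,
           VLMAX <= 31 ? "vsetivli (immediate)" : "vsetvli x0 (VLMAX)");
  }
  return 0;
}

With VLEN=128 this matches the test updates below: e8/m1 and e32/m4
become vsetivli zero, 16, while e8/m2 (VLMAX=32) keeps the vsetvli x0
encoding.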

From 6b97bc1be41b1db3427fd6a1fa2d52cf25979a9a Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Thu, 14 Dec 2023 07:44:52 -0800
Subject: [PATCH] [RISCV] Prefer vsetivli for VLMAX when VLEN is exactly known

If VLEN is exactly known, we may be able to use the vsetivli encoding
instead of the vsetvli a0, zero, <vtype> encoding.  This slightly
reduces register pressure, since the immediate form doesn't tie up a
scratch GPR to receive the computed VL.

This builds on 632f1c5, but reverses course a bit.  It turns out to be
quite complicated to canonicalize from VLMAX to an immediate early,
because the VLMAX sentinel value is widely used in tablegen patterns
that have no knowledge of LMUL.  Instead, we canonicalize towards the
VLMAX representation, and then pick the immediate form during vsetvli
insertion, where the LMUL information is available.

Within InsertVSETVLI, this could reasonably fit in a couple of places.
If reviewers want me to e.g. move it to emission, let me know.  Doing
so may require a bit of extra code to e.g. handle comparisons of the
two forms, but shouldn't be too complicated.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  7 ++-
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp  | 26 +++++++--
 .../rvv/fixed-vectors-extract-subvector.ll    | 58 +++++++++++++------
 llvm/test/CodeGen/RISCV/rvv/load-add-store.ll | 14 ++---
 4 files changed, 72 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 91bccc77f93fd0..3b07a7acf10a5d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2582,11 +2582,12 @@ static SDValue getAllOnesMask(MVT VecVT, SDValue VL, const SDLoc &DL,
 
 static SDValue getVLOp(uint64_t NumElts, MVT ContainerVT, const SDLoc &DL,
                        SelectionDAG &DAG, const RISCVSubtarget &Subtarget) {
-  // If we know the exact VLEN, our VL is exactly equal to VLMAX, and
-  // we can't encode the AVL as an immediate, use the VLMAX encoding.
+  // If we know the exact VLEN, and our VL is exactly equal to VLMAX,
+  // canonicalize the representation.  InsertVSETVLI will pick the immediate
+  // encoding later if profitable.
   const auto [MinVLMAX, MaxVLMAX] =
       RISCVTargetLowering::computeVLMAXBounds(ContainerVT, Subtarget);
-  if (MinVLMAX == MaxVLMAX && NumElts == MinVLMAX && NumElts > 31)
+  if (MinVLMAX == MaxVLMAX && NumElts == MinVLMAX)
     return DAG.getRegister(RISCV::X0, Subtarget.getXLenVT());
 
   return DAG.getConstant(NumElts, DL, Subtarget.getXLenVT());
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index b2d36b362b3a0f..1aa35f1c644a2f 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -800,7 +800,18 @@ static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) {
   return NewInfo;
 }
 
+static unsigned computeVLMAX(unsigned VLEN, unsigned SEW,
+                             RISCVII::VLMUL VLMul) {
+  auto [LMul, Fractional] = RISCVVType::decodeVLMUL(VLMul);
+  if (Fractional)
+    VLEN = VLEN / LMul;
+  else
+    VLEN = VLEN * LMul;
+  return VLEN / SEW;
+}
+
 static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
+                                       const RISCVSubtarget &ST,
                                        const MachineRegisterInfo *MRI) {
   VSETVLIInfo InstrInfo;
 
@@ -842,8 +853,15 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
     if (VLOp.isImm()) {
       int64_t Imm = VLOp.getImm();
       // Convert the VLMax sentinel to X0 register.
-      if (Imm == RISCV::VLMaxSentinel)
-        InstrInfo.setAVLReg(RISCV::X0);
+      if (Imm == RISCV::VLMaxSentinel) {
+        // If we know the exact VLEN, see if we can use the constant encoding
+        // for the VLMAX instead.  This reduces register pressure slightly.
+        const unsigned VLMAX = computeVLMAX(ST.getRealMaxVLen(), SEW, VLMul);
+        if (ST.getRealMinVLen() == ST.getRealMaxVLen() && VLMAX <= 31)
+          InstrInfo.setAVLImm(VLMAX);
+        else
+          InstrInfo.setAVLReg(RISCV::X0);
+      }
       else
         InstrInfo.setAVLImm(Imm);
     } else {
@@ -979,7 +997,7 @@ static bool isLMUL1OrSmaller(RISCVII::VLMUL LMUL) {
 bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
                                      const VSETVLIInfo &Require,
                                      const VSETVLIInfo &CurInfo) const {
-  assert(Require == computeInfoForInstr(MI, MI.getDesc().TSFlags, MRI));
+  assert(Require == computeInfoForInstr(MI, MI.getDesc().TSFlags, *ST, MRI));
 
   if (!CurInfo.isValid() || CurInfo.isUnknown() || CurInfo.hasSEWLMULRatioOnly())
     return true;
@@ -1067,7 +1085,7 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
   if (!RISCVII::hasSEWOp(TSFlags))
     return;
 
-  const VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags, MRI);
+  const VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags, *ST, MRI);
   assert(NewInfo.isValid() && !NewInfo.isUnknown());
   if (Info.isValid() && !needVSETVLI(MI, NewInfo, Info))
     return;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
index 0b9db09aab3a9c..69d71a1d0c5a9b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
@@ -593,25 +593,45 @@ define void @extract_v2i1_nxv2i1_0(<vscale x 2 x i1> %x, ptr %y) {
 }
 
 define void @extract_v2i1_nxv2i1_2(<vscale x 2 x i1> %x, ptr %y) {
-; CHECK-LABEL: extract_v2i1_nxv2i1_2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-NEXT:    vmv.v.v v9, v8
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmsne.vi v8, v9, 0
-; CHECK-NEXT:    vsm.v v8, (a0)
-; CHECK-NEXT:    ret
+; CHECK-V-LABEL: extract_v2i1_nxv2i1_2:
+; CHECK-V:       # %bb.0:
+; CHECK-V-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-V-NEXT:    vmv.v.i v8, 0
+; CHECK-V-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-V-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-V-NEXT:    vslidedown.vi v8, v8, 2
+; CHECK-V-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-V-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-V-NEXT:    vmv.v.i v8, 0
+; CHECK-V-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-V-NEXT:    vmv.v.i v9, 0
+; CHECK-V-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
+; CHECK-V-NEXT:    vmv.v.v v9, v8
+; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-V-NEXT:    vmsne.vi v8, v9, 0
+; CHECK-V-NEXT:    vsm.v v8, (a0)
+; CHECK-V-NEXT:    ret
+;
+; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_nxv2i1_2:
+; CHECK-KNOWNVLEN128:       # %bb.0:
+; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v8, 0
+; CHECK-KNOWNVLEN128-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-KNOWNVLEN128-NEXT:    vslidedown.vi v8, v8, 2
+; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-KNOWNVLEN128-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v8, 0
+; CHECK-KNOWNVLEN128-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v9, 0
+; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
+; CHECK-KNOWNVLEN128-NEXT:    vmv.v.v v9, v8
+; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-KNOWNVLEN128-NEXT:    vmsne.vi v8, v9, 0
+; CHECK-KNOWNVLEN128-NEXT:    vsm.v v8, (a0)
+; CHECK-KNOWNVLEN128-NEXT:    ret
   %c = call <2 x i1> @llvm.vector.extract.v2i1.nxv2i1(<vscale x 2 x i1> %x, i64 2)
   store <2 x i1> %c, ptr %y
   ret void
diff --git a/llvm/test/CodeGen/RISCV/rvv/load-add-store.ll b/llvm/test/CodeGen/RISCV/rvv/load-add-store.ll
index 3bb465ba998a23..4d72bc9af1903e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/load-add-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/load-add-store.ll
@@ -362,7 +362,7 @@ define void @exact_vlen_vadd_vint8m1(ptr %pc, ptr %pa, ptr %pb) nounwind vscale_
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl1r.v v8, (a1)
 ; CHECK-NEXT:    vl1r.v v9, (a2)
-; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
 ; CHECK-NEXT:    vs1r.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -392,7 +392,7 @@ define void @exact_vlen_vadd_vint8m2(ptr %pc, ptr %pa, ptr %pb) nounwind vscale_
 define void @exact_vlen_vadd_vint8mf2(ptr %pc, ptr %pa, ptr %pb) nounwind vscale_range(2,2) {
 ; CHECK-LABEL: exact_vlen_vadd_vint8mf2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vle8.v v9, (a2)
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
@@ -408,7 +408,7 @@ define void @exact_vlen_vadd_vint8mf2(ptr %pc, ptr %pa, ptr %pb) nounwind vscale
 define void @exact_vlen_vadd_vint8mf4(ptr %pc, ptr %pa, ptr %pb) nounwind vscale_range(2,2) {
 ; CHECK-LABEL: exact_vlen_vadd_vint8mf4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vle8.v v9, (a2)
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
@@ -424,7 +424,7 @@ define void @exact_vlen_vadd_vint8mf4(ptr %pc, ptr %pa, ptr %pb) nounwind vscale
 define void @exact_vlen_vadd_vint8mf8(ptr %pc, ptr %pa, ptr %pb) nounwind vscale_range(2,2) {
 ; CHECK-LABEL: exact_vlen_vadd_vint8mf8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vle8.v v9, (a2)
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
@@ -442,7 +442,7 @@ define void @exact_vlen_vadd_vint32m1(ptr %pc, ptr %pa, ptr %pb) nounwind vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl1re32.v v8, (a1)
 ; CHECK-NEXT:    vl1re32.v v9, (a2)
-; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
 ; CHECK-NEXT:    vs1r.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -458,7 +458,7 @@ define void @exact_vlen_vadd_vint32m2(ptr %pc, ptr %pa, ptr %pb) nounwind vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl2re32.v v8, (a1)
 ; CHECK-NEXT:    vl2re32.v v10, (a2)
-; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
 ; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -474,7 +474,7 @@ define void @exact_vlen_vadd_vint32m4(ptr %pc, ptr %pa, ptr %pb) nounwind vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl4re32.v v8, (a1)
 ; CHECK-NEXT:    vl4re32.v v12, (a2)
-; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
 ; CHECK-NEXT:    vs4r.v v8, (a0)
 ; CHECK-NEXT:    ret


