[llvm] [RISCV] Introduce local peephole to reduce VLs based on demanded VL (PR #104689)

Fri Aug 23 00:42:55 PDT 2024

================
@@ -81,6 +82,96 @@ char RISCVVectorPeephole::ID = 0;
 INITIALIZE_PASS(RISCVVectorPeephole, DEBUG_TYPE, "RISC-V Fold Masks", false,
                 false)
 
+/// Given two VL operands, do we know that LHS <= RHS?
+static bool isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) {
+  if (LHS.isReg() && RHS.isReg() && LHS.getReg().isVirtual() &&
+      LHS.getReg() == RHS.getReg())
+    return true;
+  if (RHS.isImm() && RHS.getImm() == RISCV::VLMaxSentinel)
+    return true;
+  if (LHS.isImm() && LHS.getImm() == RISCV::VLMaxSentinel)
+    return false;
+  if (!LHS.isImm() || !RHS.isImm())
+    return false;
+  return LHS.getImm() <= RHS.getImm();
+}
+
+static unsigned getSEWLMULRatio(const MachineInstr &MI) {
+  RISCVII::VLMUL LMUL = RISCVII::getLMul(MI.getDesc().TSFlags);
+  unsigned Log2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
+  return RISCVVType::getSEWLMULRatio(1 << Log2SEW, LMUL);
+}
+
+// Attempt to reduce the VL of an instruction whose sole use is feeding a
+// instruction with a narrower VL.  This currently works backwards from the
+// user instruction (which might have a smaller VL).
+bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const {
+  // Note that the goal here is a bit multifaceted.
+  // 1) For store's reducing the VL of the value being stored may help to
+  //    reduce VL toggles.  This is somewhat of an artifact of the fact we
+  //    promote arithmetic instructions but VL predicate stores.
+  // 2) For vmv.v.v reducing VL eagerly on the source instruction allows us
+  //    to share code with the foldVMV_V_V transform below.
+  //
+  // Note that to the best of our knowledge, reducing VL is generally not
+  // a significant win on real hardware unless we can also reduce LMUL which
+  // this code doesn't try to do.
+  //
+  // TODO: We can handle a bunch more instructions here, and probably
+  // recurse backwards through operands too.
+  unsigned SrcIdx = 0;
+  switch (RISCV::getRVVMCOpcode(MI.getOpcode())) {
+  default:
+    return false;
+  case RISCV::VSE8_V:
+  case RISCV::VSE16_V:
+  case RISCV::VSE32_V:
+  case RISCV::VSE64_V:
+    break;
+  case RISCV::VMV_V_V:
+    SrcIdx = 2;
+    break;
+  }
+
+  MachineOperand &VL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc()));
+  if (VL.isImm() && VL.getImm() == RISCV::VLMaxSentinel)
+    return false;
+
+  Register SrcReg = MI.getOperand(SrcIdx).getReg();
+  // Note: one *use*, not one *user*.
+  if (!MRI->hasOneUse(SrcReg))
+    return false;
+
+  MachineInstr *Src = MRI->getVRegDef(SrcReg);
+  if (!Src || Src->hasUnmodeledSideEffects() ||
+      Src->getParent() != MI.getParent() || Src->getNumDefs() != 1 ||
+      !RISCVII::hasVLOp(Src->getDesc().TSFlags) ||
+      !RISCVII::hasSEWOp(Src->getDesc().TSFlags))
+    return false;
+
+  // Src needs to have the same VLMAX as MI
+  if (getSEWLMULRatio(MI) != getSEWLMULRatio(*Src))
----------------
lukel97 wrote:

I think you're right: it's possible to have the same VLMAX but different EEWs at fractional LMULs. I think this was relying on the register classes to enforce that the LMULs are the same.

```llvm
define <vscale x 1 x i8> @unfoldable_mismatched_sew_2(<vscale x 1 x i8> %passthru, <vscale x 1 x i16> %x, <vscale x 1 x i16> %y, i64 %avl) {
  %a = call <vscale x 1 x i16> @llvm.riscv.vadd.nxv1i16.nxv1i16(<vscale x 1 x i16> poison, <vscale x 1 x i16> %x, <vscale x 1 x i16> %y, i64 %avl)
  %a.bitcast = bitcast <vscale x 1 x i16> %a to <vscale x 2 x i8>
  %a.insert = call <vscale x 1 x i8> @llvm.vector.extract(<vscale x 2 x i8> %a.bitcast, i64 0)
  %b = call <vscale x 1 x i8> @llvm.riscv.vmv.v.v.nx1i8(<vscale x 1 x i8> %passthru, <vscale x 1 x i8> %a.insert, i64 %avl)
  ret <vscale x 1 x i8> %b
}
```

However, it looks like we avoid miscompiling this only by a fluke, because a trivial copy gets in the way of the fold:

```
  %4:vr = PseudoVADD_VV_MF4 $noreg(tied-def 0), %1:vr, %2:vr, %3:gprnox0, 4, 0
  %5:vr = COPY %4:vr
  %6:vr = PseudoVMV_V_V_MF8 %0:vr(tied-def 0), killed %5:vr, %3:gprnox0, 3, 0
```

Last time I checked, we didn't encode the EEW of the operands anywhere.

https://github.com/llvm/llvm-project/pull/104689