[llvm] [RISCV] Introduce local peephole to reduce VLs based on demanded VL (PR #104689)

Thu Aug 22 23:54:32 PDT 2024

================
@@ -81,6 +82,96 @@ char RISCVVectorPeephole::ID = 0;
 INITIALIZE_PASS(RISCVVectorPeephole, DEBUG_TYPE, "RISC-V Fold Masks", false,
                 false)
 
+/// Given two VL operands, do we know that LHS <= RHS?
+static bool isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) {
+  if (LHS.isReg() && RHS.isReg() && LHS.getReg().isVirtual() &&
+      LHS.getReg() == RHS.getReg())
+    return true;
+  if (RHS.isImm() && RHS.getImm() == RISCV::VLMaxSentinel)
+    return true;
+  if (LHS.isImm() && LHS.getImm() == RISCV::VLMaxSentinel)
+    return false;
+  if (!LHS.isImm() || !RHS.isImm())
+    return false;
+  return LHS.getImm() <= RHS.getImm();
+}
+
+static unsigned getSEWLMULRatio(const MachineInstr &MI) {
+  RISCVII::VLMUL LMUL = RISCVII::getLMul(MI.getDesc().TSFlags);
+  unsigned Log2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
+  return RISCVVType::getSEWLMULRatio(1 << Log2SEW, LMUL);
+}
+
+// Attempt to reduce the VL of an instruction whose sole use is feeding a
+// instruction with a narrower VL.  This currently works backwards from the
+// user instruction (which might have a smaller VL).
+bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const {
+  // Note that the goal here is a bit multifaceted.
+  // 1) For store's reducing the VL of the value being stored may help to
+  //    reduce VL toggles.  This is somewhat of an artifact of the fact we
+  //    promote arithmetic instructions but VL predicate stores.
+  // 2) For vmv.v.v reducing VL eagerly on the source instruction allows us
+  //    to share code with the foldVMV_V_V transform below.
+  //
+  // Note that to the best of our knowledge, reducing VL is generally not
+  // a significant win on real hardware unless we can also reduce LMUL which
+  // this code doesn't try to do.
+  //
+  // TODO: We can handle a bunch more instructions here, and probably
+  // recurse backwards through operands too.
+  unsigned SrcIdx = 0;
+  switch (RISCV::getRVVMCOpcode(MI.getOpcode())) {
+  default:
+    return false;
+  case RISCV::VSE8_V:
+  case RISCV::VSE16_V:
+  case RISCV::VSE32_V:
+  case RISCV::VSE64_V:
+    break;
+  case RISCV::VMV_V_V:
+    SrcIdx = 2;
+    break;
+  }
+
+  MachineOperand &VL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc()));
+  if (VL.isImm() && VL.getImm() == RISCV::VLMaxSentinel)
+    return false;
+
+  Register SrcReg = MI.getOperand(SrcIdx).getReg();
+  // Note: one *use*, not one *user*.
+  if (!MRI->hasOneUse(SrcReg))
+    return false;
+
+  MachineInstr *Src = MRI->getVRegDef(SrcReg);
+  if (!Src || Src->hasUnmodeledSideEffects() ||
+      Src->getParent() != MI.getParent() || Src->getNumDefs() != 1 ||
+      !RISCVII::hasVLOp(Src->getDesc().TSFlags) ||
+      !RISCVII::hasSEWOp(Src->getDesc().TSFlags))
+    return false;
+
+  // Src needs to have the same VLMAX as MI
+  if (getSEWLMULRatio(MI) != getSEWLMULRatio(*Src))
----------------
topperc wrote:

I'm not sure this is sufficient. You need to know that the EEW and EMUL of Src's destination is identical to the EEW and EMUL of MI's operand. If there was a bitcast between them in IR, it's possible for them to be different. Bitcasts don't codegen to anything.

https://github.com/llvm/llvm-project/pull/104689