[llvm] [RISCV] Move performCombineVMergeAndVOps into RISCVFoldMasks (PR #71764)

Luke Lau via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 14 23:37:23 PST 2023


================
@@ -88,6 +98,304 @@ bool RISCVFoldMasks::isAllOnesMask(MachineInstr *MaskCopy) {
   }
 }
 
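+// Returns the all-ones PseudoVMSET opcode corresponding to the given LMUL.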
+static unsigned getVMSetForLMul(RISCVII::VLMUL LMUL) {
+  switch (LMUL) {
+  case RISCVII::LMUL_F8:
+    return RISCV::PseudoVMSET_M_B1;
+  case RISCVII::LMUL_F4:
+    return RISCV::PseudoVMSET_M_B2;
+  case RISCVII::LMUL_F2:
+    return RISCV::PseudoVMSET_M_B4;
+  case RISCVII::LMUL_1:
+    return RISCV::PseudoVMSET_M_B8;
+  case RISCVII::LMUL_2:
+    return RISCV::PseudoVMSET_M_B16;
+  case RISCVII::LMUL_4:
+    return RISCV::PseudoVMSET_M_B32;
+  case RISCVII::LMUL_8:
+    return RISCV::PseudoVMSET_M_B64;
+  case RISCVII::LMUL_RESERVED:
+    llvm_unreachable("Unexpected LMUL");
+  }
+  llvm_unreachable("Unknown VLMUL enum");
+}
+
+// Try to sink From to before To, also sinking any instructions between From and
+// To where there is a write-after-read dependency on a physical register.
+static bool sinkInstructionAndDeps(MachineInstr &From, MachineInstr &To) {
+  assert(From.getParent() == To.getParent());
+  SmallVector<MachineInstr *> Worklist, ToSink;
+  Worklist.push_back(&From);
+
+  // Rather than compute whether or not we saw a store for every instruction,
+  // just compute it once even if it's more conservative.
+  bool SawStore = false;
+  for (MachineBasicBlock::instr_iterator II = From.getIterator();
+       II != To.getIterator(); II++) {
+    if (II->mayStore()) {
+      SawStore = true;
+      break;
+    }
+  }
+
+  while (!Worklist.empty()) {
+    MachineInstr *MI = Worklist.pop_back_val();
+
+    if (!MI->isSafeToMove(nullptr, SawStore))
+      return false;
+
+    SmallSet<Register, 8> Defs, Uses;
+    for (MachineOperand &Def : MI->all_defs())
+      Defs.insert(Def.getReg());
+    for (MachineOperand &Use : MI->all_uses())
+      Uses.insert(Use.getReg());
+
+    // If anything in [MI, To) uses a definition of MI, we can't sink it.
+    for (MachineBasicBlock::instr_iterator II = MI->getIterator();
+         II != To.getIterator(); II++) {
+      for (MachineOperand &Use : II->all_uses()) {
+        if (Defs.contains(Use.getReg()))
+          return false;
+      }
+    }
+
+    // If MI uses any physical registers, we need to sink any instructions after
+    // it where there might be a write-after-read dependency.
+    for (MachineBasicBlock::instr_iterator II = MI->getIterator();
+         II != To.getIterator(); II++) {
+      bool NeedsSink = any_of(II->all_defs(), [&Uses](MachineOperand &Def) {
+        return Def.getReg().isPhysical() && Uses.contains(Def.getReg());
+      });
+      if (NeedsSink)
+        Worklist.push_back(&*II);
+    }
+
+    ToSink.push_back(MI);
+  }
+
+  for (MachineInstr *MI : ToSink)
+    MI->moveBefore(&To);
+  return true;
+}
+
+// Returns true if LHS is the same register as RHS, or if LHS is undefined.
+bool RISCVFoldMasks::isOpSameAs(const MachineOperand &LHS,
+                                const MachineOperand &RHS) {
+  if (LHS.getReg() == RISCV::NoRegister)
+    return true;
+  if (RHS.getReg() == RISCV::NoRegister)
+    return false;
+  return TRI->lookThruCopyLike(LHS.getReg(), MRI) ==
+         TRI->lookThruCopyLike(RHS.getReg(), MRI);
+}
+
+// Try to fold away VMERGE_VVM instructions. We handle these cases:
+// -Masked TU VMERGE_VVM combined with an unmasked TA instruction folds to a
+//  masked TU instruction. The VMERGE_VVM must have its merge operand the same
+//  as its false operand.
+// -Masked TA VMERGE_VVM combined with an unmasked TA instruction folds to a
+//  masked TA instruction.
+// -Unmasked TU VMERGE_VVM combined with a masked MU TA instruction folds to a
+//  masked TU instruction. Both instructions must have the same merge operand,
+//  and the VMERGE_VVM must have its merge operand the same as its false
+//  operand.
+// Note: The VMERGE_VVM forms above (TA and TU) refer to the policy implied,
+// not the pseudo name. That is, a TA VMERGE_VVM can be either the _TU pseudo
+// form with an IMPLICIT_DEF passthrough operand or the unsuffixed (TA) pseudo
+// form.
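+//
+// Schematically, for the first case (operand order and type suffixes elided):
+//   %true = PseudoVADD_VV %a, %b, vl, sew
+//   %x = PseudoVMERGE_VVM %false, %false, %true, $v0, vl, sew
+// becomes
+//   %x = PseudoVADD_VV_MASK %false, %a, %b, $v0, vl, sew, policy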
+bool RISCVFoldMasks::foldVMergeIntoOps(MachineInstr &MI,
+                                       MachineInstr *MaskDef) {
+  MachineOperand *True;
+  MachineOperand *Merge;
+  MachineOperand *False;
+
+  const unsigned BaseOpc = RISCV::getRVVMCOpcode(MI.getOpcode());
+  // A vmv.v.v is equivalent to a vmerge with an all-ones mask.
+  if (BaseOpc == RISCV::VMV_V_V) {
+    Merge = &MI.getOperand(1);
+    False = &MI.getOperand(1);
+    True = &MI.getOperand(2);
+  } else if (BaseOpc == RISCV::VMERGE_VVM) {
+    Merge = &MI.getOperand(1);
+    False = &MI.getOperand(2);
+    True = &MI.getOperand(3);
+  } else
+    return false;
+
+  MachineInstr &TrueMI = *MRI->getVRegDef(True->getReg());
+
+  // We require that merge and false are the same register, or that merge is
+  // undefined.
+  if (!isOpSameAs(*Merge, *False))
+    return false;
+
+  // MI must be the only user of True.
+  if (!MRI->hasOneUse(True->getReg()))
+    return false;
+
+  unsigned TrueOpc = TrueMI.getOpcode();
+  const MCInstrDesc &TrueMCID = TrueMI.getDesc();
+  bool HasTiedDest = RISCVII::isFirstDefTiedToFirstUse(TrueMCID);
+
+  bool IsMasked = false;
+  const RISCV::RISCVMaskedPseudoInfo *Info =
+      RISCV::lookupMaskedIntrinsicByUnmasked(TrueOpc);
+  if (!Info && HasTiedDest) {
+    Info = RISCV::getMaskedPseudoInfo(TrueOpc);
+    IsMasked = true;
+  }
+
+  if (!Info)
+    return false;
+
+  // When the mask is not a true (all-ones) mask, this transformation is
+  // illegal for some operations whose results are affected by the mask, like
+  // viota.m.
+  if (Info->MaskAffectsResult && BaseOpc == RISCV::VMERGE_VVM &&
+      !isAllOnesMask(MaskDef))
+    return false;
+
+  MachineOperand &TrueMergeOp = TrueMI.getOperand(1);
+  if (HasTiedDest && TrueMergeOp.getReg() != RISCV::NoRegister) {
+    // The vmerge instruction must be TU.
+    // FIXME: This could be relaxed, but we need to handle the policy for the
+    // resulting op correctly.
+    if (Merge->getReg() == RISCV::NoRegister)
+      return false;
+    // Both the vmerge instruction and the True instruction must have the same
+    // merge operand.
+    if (!isOpSameAs(TrueMergeOp, *False))
+      return false;
+  }
+
+  if (IsMasked) {
+    assert(HasTiedDest && "Expected tied dest");
+    // The vmerge instruction must be TU.
+    if (Merge->getReg() == RISCV::NoRegister)
+      return false;
+    // The vmerge instruction must have an all 1s mask since we're going to keep
+    // the mask from the True instruction.
+    // FIXME: Support mask agnostic True instruction which would have an
+    // undef merge operand.
+    if (BaseOpc == RISCV::VMERGE_VVM && !isAllOnesMask(MaskDef))
+      return false;
+  }
+
+  // Skip if True has side effects.
+  // TODO: Support vleff and vlsegff.
+  if (TII->get(TrueOpc).hasUnmodeledSideEffects())
+    return false;
+
+  // The vector policy operand may be present for masked intrinsics.
+  const MachineOperand &TrueVL =
+      TrueMI.getOperand(RISCVII::getVLOpNum(TrueMCID));
+
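+  // Returns the smaller of two VL operands when it can be determined
+  // statically, treating the VLMaxSentinel immediate as the largest VL.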
+  auto GetMinVL =
+      [](const MachineOperand &LHS,
+         const MachineOperand &RHS) -> std::optional<MachineOperand> {
+    if (LHS.isReg() && RHS.isReg() && LHS.getReg().isVirtual() &&
+        LHS.getReg() == RHS.getReg())
+      return LHS;
+    if (LHS.isImm() && LHS.getImm() == RISCV::VLMaxSentinel)
+      return RHS;
+    if (RHS.isImm() && RHS.getImm() == RISCV::VLMaxSentinel)
+      return LHS;
+    if (!LHS.isImm() || !RHS.isImm())
+      return std::nullopt;
+    return LHS.getImm() <= RHS.getImm() ? LHS : RHS;
+  };
+
+  // Because MI and True must have the same merge operand (or True's merge
+  // operand is undefined), the "effective" body is the minimum of their VLs.
+  const MachineOperand VL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc()));
+  auto MinVL = GetMinVL(TrueVL, VL);
+  if (!MinVL)
+    return false;
+  bool VLChanged = !MinVL->isIdenticalTo(VL);
+
+  // If we end up changing the VL or mask of True, then we need to make sure it
+  // doesn't raise any observable fp exceptions, since changing the active
+  // elements will affect how fflags is set.
+  if (VLChanged || !IsMasked)
+    if (TrueMCID.mayRaiseFPException() &&
+        !TrueMI.getFlag(MachineInstr::MIFlag::NoFPExcept))
+      return false;
+
+  unsigned MaskedOpc = Info->MaskedPseudo;
+  const MCInstrDesc &MaskedMCID = TII->get(MaskedOpc);
+#ifndef NDEBUG
+  assert(RISCVII::hasVecPolicyOp(MaskedMCID.TSFlags) &&
+         "Expected instructions with mask have policy operand.");
+  assert(MaskedMCID.getOperandConstraint(MaskedMCID.getNumDefs(),
+                                         MCOI::TIED_TO) == 0 &&
+         "Expected instructions with mask have a tied dest.");
+#endif
+
+  // Sink True down to MI so that it can access MI's operands.
+  if (!sinkInstructionAndDeps(TrueMI, MI))
----------------
lukel97 wrote:

> I don't think we need to (or want to) move TrueMI's operands at all.

I've named this method rather misleadingly: it only sinks write-after-read dependencies on physical registers, not any of TrueMI's operands, for reasons I'll explain below.

As for "hoist the mask" versus "sink TrueMI", I settled on the latter since it ended up being the simpler option.
The ordering of instructions from SelectionDAG typically looks like this:

```
%true = PseudoVADD ...
%mask = PseudoVMSET ...
$v0 = COPY %mask
PseudoVMERGE $v0, %true, ...
```

The mask is pretty much always between TrueMI and the vmerge because of the glue, so it never ends up dominating TrueMI in practice.

If we hoist the mask, the trivial case is when it's a VMSET with only immediate operands.
However, a lot of the time the mask will be computed, and a naïve implementation quickly hits a wall since there will be a bunch of instructions that need to be hoisted, e.g.:

```
%true = PseudoVADD ...
// All of these need to be hoisted
%m1 = PseudoVFOO ...
...
%mn = PseudoVBAR ...
%mask = PseudoVMSEQ %mn
$v0 = COPY %mask
PseudoVMERGE $v0, %true, ...
```

If we sink TrueMI, there's usually much less we need to do: we check beforehand that the vmerge is the only user of TrueMI, so 99.99% of the time we can just call `TrueMI.moveBefore(&MI)`.
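
A sketch of that common case (same instructions as the first example above, with nothing in between that blocks the move):

```
%true = PseudoVADD ...
%mask = PseudoVMSET ...
$v0 = COPY %mask
PseudoVMERGE $v0, %true, ...

// after TrueMI.moveBefore(&MI):

%mask = PseudoVMSET ...
$v0 = COPY %mask
%true = PseudoVADD ...
PseudoVMERGE $v0, %true, ...
```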

For loads/faulting instructions we also just need to check that there are no stores in between and that `isSafeToMove` returns true. However, the annoying case is pseudos that also read a physical register, e.g.:

```
$frm = FSRM foo
%true = PseudoVFCVT_RM_X_F ... implicit $frm
$frm = FSRM bar
%mask = PseudoVMSET ...
$v0 = COPY %mask
PseudoVMERGE $v0, %true, ...

// can't just sink %true:

$frm = FSRM foo
$frm = FSRM bar <-- WAR dependency
%mask = PseudoVMSET ...
$v0 = COPY %mask
%true = PseudoVFCVT_RM_X_F ... implicit $frm <-- incorrect $frm
PseudoVMERGE $v0, %true, ...
```

In that case we need to sink those write-after-read dependencies too. But this is a much rarer case than hoisting a mask with lots of dependencies, which is why I ended up settling on the sinking approach.
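
Sketching the fix on the example above (my reading of `sinkInstructionAndDeps`, same names as before), the WAR-dependent `FSRM bar` is sunk along with `%true`, so the ordering of reads and writes of `$frm` is preserved:

```
$frm = FSRM foo
%mask = PseudoVMSET ...
$v0 = COPY %mask
%true = PseudoVFCVT_RM_X_F ... implicit $frm <-- still reads foo's $frm
$frm = FSRM bar <-- sunk to preserve the WAR dependency
PseudoVMERGE $v0, %true, ...
```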

I am in agreement that I would like to make this simpler, though. I just wanted to clarify how I ended up with the sinking approach.

> One thing I can't help but think - all of this would be much easier without the copies to V0.

That thought crossed my mind as well. Is it because all the vector pseudos have their mask operands defined as `VMaskOp`?:

```tablegen
def VMaskOp : RegisterOperand<VMV0> {
  let ParserMatchClass = VMaskAsmOperand;
  let PrintMethod = "printVMaskReg";
  let EncoderMethod = "getVMaskReg";
  let DecoderMethod = "decodeVMaskReg";
}
```
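
For illustration only (hypothetical MIR, not what the pseudos accept today): if the mask were passed as a virtual register constrained to a `VMV0`-like class, instead of being copied into the physical `$v0`, the fold could look the mask up directly via `MRI->getVRegDef` with no copy-chasing:

```
%mask = PseudoVMSET ...
PseudoVMERGE %mask, %true, ...
```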

https://github.com/llvm/llvm-project/pull/71764

