[llvm] be2cb82 - [riscv] Remove mutation of prior vsetvli from insertion dataflow

Philip Reames via llvm-commits llvm-commits at lists.llvm.org
Wed May 25 10:51:25 PDT 2022


Author: Philip Reames
Date: 2022-05-25T10:51:14-07:00
New Revision: be2cb824d0d49a6483f310552dac47a35b72face

URL: https://github.com/llvm/llvm-project/commit/be2cb824d0d49a6483f310552dac47a35b72face
DIFF: https://github.com/llvm/llvm-project/commit/be2cb824d0d49a6483f310552dac47a35b72face.diff

LOG: [riscv] Remove mutation of prior vsetvli from insertion dataflow

This moves mutation entirely out of the main algorithm.

The immediate trigger is that we hit another case of the same issue I thought we'd fixed in 72925d9; it turns out we hadn't considered the cross-block case.

As a brief summary, the issue being fixed is that mutating a previous vsetvli in phase 3 can change the "compatibility" result for some later user of that vsetvli. In the cross_block_mutate test, that later use occurs in another block (and the result is thus visit-order dependent too!). This causes us to fail strict asserts. (To be explicit, the current on-by-default workaround should compensate; it's only when we turn that workaround off that we have problems.)
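To make that concrete, here's a hypothetical shape of the failure, modeled loosely on the cross_block_mutate test below (illustrative only, not taken from the patch; blocks, registers, and vtypes are made up):

  bb.0:
    # Phase 3 rewrites this vsetvli's vtype operand in place to satisfy
    # the instruction that immediately follows it...
    vsetvli a0, a1, e32, m2, tu, mu
    vmv.s.x v8, a0
  bb.1:
    # ...but this user's "compatibility" was computed against the old
    # vtype, so that answer is now stale.  Whether the strict assert
    # notices depends on the order in which the blocks are visited.
    vadd.vv v8, v8, v10, v0.t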

Now, I want to explicitly call out an alternate workaround. We could leave the mutation in phase 3 and simply restrict it to the case where the previous vsetvli's GPR result is unused. That covers the case we've actually seen. (I'll note that codegen regressions with a simple form of this were significant. We might have to check specifically for the use-outside-block case to keep them reasonable, which complicates the workaround slightly.)
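For the record, that alternative could have been as small as the following guard (a sketch under assumptions: the helper name is invented, and it assumes the destination is operand 0, as in the code in this file):

  // Hypothetical guard for the rejected workaround: only mutate the
  // prior vsetvli when nothing can observe its GPR result.
  static bool mayMutatePriorConfig(const MachineInstr &PrevVSETVLI,
                                   const MachineRegisterInfo &MRI) {
    const Register Dest = PrevVSETVLI.getOperand(0).getReg();
    // An X0 destination produces no usable GPR result; a virtual
    // register with no remaining (non-debug) uses can't feed a later
    // compatibility check, cross-block or otherwise.
    return Dest == RISCV::X0 ||
           (Dest.isVirtual() && MRI.use_nodbg_empty(Dest));
  }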

Personally, I'm at the point where I want the mutation pulled out purely for robustness' sake. I'm worried there's yet another form of this bug we haven't thought about.

The other motivation for this change is that it gives us a couple of minor codegen wins. None appear to be hugely significant, but improvements never hurt, right?
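The test17 hunk below is representative of those wins: the post-pass folds a VL-preserving `vsetvli zero, zero, vtype' into its predecessor, so two back-to-back configs become one:

  # before:
  vsetvli a0, a0, e32, mf2, ta, mu
  vsetvli zero, zero, e64, m1, ta, mu
  # after:
  vsetvli a0, a0, e64, m1, ta, mu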

Differential Revision: https://reviews.llvm.org/D125270

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
    llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
    llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 1bfa8371a259..2ff2a60cc961 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -462,6 +462,7 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
   void computeIncomingVLVTYPE(const MachineBasicBlock &MBB);
   void emitVSETVLIs(MachineBasicBlock &MBB);
   void doLocalPrepass(MachineBasicBlock &MBB);
+  void doLocalPostpass(MachineBasicBlock &MBB);
   void doPRE(MachineBasicBlock &MBB);
 };
 
@@ -478,6 +479,15 @@ static bool isVectorConfigInstr(const MachineInstr &MI) {
          MI.getOpcode() == RISCV::PseudoVSETIVLI;
 }
 
+/// Return true if this is 'vsetvli x0, x0, vtype' which preserves
+/// VL and only sets VTYPE.
+static bool isVLPreservingConfig(const MachineInstr &MI) {
+  if (MI.getOpcode() != RISCV::PseudoVSETVLIX0)
+    return false;
+  assert(RISCV::X0 == MI.getOperand(1).getReg());
+  return RISCV::X0 == MI.getOperand(0).getReg();
+}
+
 static MachineInstr *elideCopies(MachineInstr *MI,
                                  const MachineRegisterInfo *MRI) {
   while (true) {
@@ -1065,9 +1075,6 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
 
 void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
   VSETVLIInfo CurInfo;
-  // Only be set if current VSETVLIInfo is from an explicit VSET(I)VLI.
-  MachineInstr *PrevVSETVLIMI = nullptr;
-
   for (MachineInstr &MI : MBB) {
     // If this is an explicit VSETVLI or VSETIVLI, update our state.
     if (isVectorConfigInstr(MI)) {
@@ -1078,7 +1085,6 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
       MI.getOperand(3).setIsDead(false);
       MI.getOperand(4).setIsDead(false);
       CurInfo = getInfoForVSETVLI(MI);
-      PrevVSETVLIMI = &MI;
       continue;
     }
 
@@ -1125,29 +1131,10 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
         // vl/vtype for successor blocks.
         if (!canSkipVSETVLIForLoadStore(MI, NewInfo, CurInfo) &&
             needVSETVLI(NewInfo, CurInfo)) {
-          // If the previous VL/VTYPE is set by VSETVLI and do not use, Merge it
-          // with current VL/VTYPE.
-          bool NeedInsertVSETVLI = true;
-          if (PrevVSETVLIMI) {
-            // If these two VSETVLI have the same AVL and the same VLMAX,
-            // we could merge these two VSETVLI.
-            // TODO: If we remove this, we get a `vsetvli x0, x0, vtype'
-            // here.  We could simply let this be emitted, then remove
-            // the unused vsetvlis in a post-pass.
-            if (CurInfo.hasSameAVL(NewInfo) && CurInfo.hasSameVLMAX(NewInfo)) {
-              // WARNING: For correctness, it is essential the contents of VL
-              // and VTYPE stay the same after MI.  This greatly limits the
-              // mutation we can legally do here.
-              PrevVSETVLIMI->getOperand(2).setImm(NewInfo.encodeVTYPE());
-              NeedInsertVSETVLI = false;
-            }
-          }
-          if (NeedInsertVSETVLI)
-            insertVSETVLI(MBB, MI, NewInfo, CurInfo);
+          insertVSETVLI(MBB, MI, NewInfo, CurInfo);
           CurInfo = NewInfo;
         }
       }
-      PrevVSETVLIMI = nullptr;
     }
 
     // If this is something that updates VL/VTYPE that we don't know about, set
@@ -1155,7 +1142,6 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
     if (MI.isCall() || MI.isInlineAsm() || MI.modifiesRegister(RISCV::VL) ||
         MI.modifiesRegister(RISCV::VTYPE)) {
       CurInfo = VSETVLIInfo::getUnknown();
-      PrevVSETVLIMI = nullptr;
     }
   }
 
@@ -1378,6 +1364,52 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
                 AvailableInfo, OldInfo);
 }
 
+void RISCVInsertVSETVLI::doLocalPostpass(MachineBasicBlock &MBB) {
+  MachineInstr *PrevMI = nullptr;
+  bool UsedVL = false, UsedVTYPE = false;
+  SmallVector<MachineInstr*> ToDelete;
+  for (MachineInstr &MI : MBB) {
+    // Note: Must be *before* vsetvli handling to account for config cases
+    // which only change some subfields.
+    if (MI.isCall() || MI.isInlineAsm() || MI.readsRegister(RISCV::VL))
+      UsedVL = true;
+    if (MI.isCall() || MI.isInlineAsm() || MI.readsRegister(RISCV::VTYPE))
+      UsedVTYPE = true;
+
+    if (!isVectorConfigInstr(MI))
+      continue;
+
+    if (PrevMI) {
+      if (!UsedVL && !UsedVTYPE) {
+        ToDelete.push_back(PrevMI);
+        // fallthrough
+      } else if (!UsedVTYPE && isVLPreservingConfig(MI)) {
+        // Note: `vsetvli x0, x0, vtype' is the canonical instruction
+        // for this case.  If you find yourself wanting to add other forms
+        // to this "unused VTYPE" case, we're probably missing a
+        // canonicalization earlier.
+        // Note: We don't need to explicitly check vtype compatibility
+        // here because this form is only legal (per ISA) when not
+        // changing VL.
+        PrevMI->getOperand(2).setImm(MI.getOperand(2).getImm());
+        ToDelete.push_back(&MI);
+        // Leave PrevMI unchanged
+        continue;
+      }
+    }
+    PrevMI = &MI;
+    UsedVL = false;
+    UsedVTYPE = false;
+    Register VRegDef = MI.getOperand(0).getReg();
+    if (VRegDef != RISCV::X0 &&
+        !(VRegDef.isVirtual() && MRI->use_nodbg_empty(VRegDef)))
+      UsedVL = true;
+  }
+
+  for (auto *MI : ToDelete)
+    MI->eraseFromParent();
+}
+
 bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
   // Skip if the vector extension is not enabled.
   const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
@@ -1443,6 +1475,15 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
   for (MachineBasicBlock &MBB : MF)
     emitVSETVLIs(MBB);
 
+  // Now that all vsetvlis are explicit, go through and do block local
+  // DSE and peephole based demanded fields based transforms.  Note that
+  // this *must* be done outside the main dataflow so long as we allow
+  // any cross block analysis within the dataflow.  We can't have both
+  // demanded fields based mutation and non-local analysis in the
+  // dataflow at the same time without introducing inconsistencies.
+  for (MachineBasicBlock &MBB : MF)
+    doLocalPostpass(MBB);
+
   // Once we're fully done rewriting all the instructions, do a final pass
   // through to check for VSETVLIs which write to an unused destination.
   // For the non X0, X0 variant, we can replace the destination register

diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
index b9886aafe83c..9b92926b25f3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
@@ -747,7 +747,6 @@ define void @vector_init_vsetvli_fv2(i64 %N, double* %c) {
 ; CHECK-LABEL: vector_init_vsetvli_fv2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    li a2, 0
-; CHECK-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
 ; CHECK-NEXT:    vsetvli a3, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:  .LBB15_1: # %for.body
@@ -812,14 +811,13 @@ for.end:                                          ; preds = %for.body
 ; Demonstrates a case where mutation in phase3 is problematic.  We mutate the
 ; vsetvli without considering that it changes the compatibility result of the
 ; vadd in the second block.
-; FIXME: This currently crashes with strict asserts enabled.
 define <vscale x 4 x i32> @cross_block_mutate(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b,
 ; CHECK-LABEL: cross_block_mutate:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetivli a0, 6, e32, m2, tu, mu
 ; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
+; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
                                          <vscale x 4 x i1> %mask) {
 entry:

diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
index e625e446eea0..3fff538b5717 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
@@ -328,8 +328,7 @@ entry:
 define double @test17(i64 %avl, <vscale x 1 x double> %a, <vscale x 1 x double> %b) nounwind {
 ; CHECK-LABEL: test17:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; CHECK-NEXT:    vsetvli a0, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.f.s ft0, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
@@ -468,7 +467,6 @@ entry:
 define void @avl_forward4(<vscale x 2 x i32> %v, <vscale x 2 x i32>* %p, i64 %reg) nounwind {
 ; CHECK-LABEL: avl_forward4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -498,7 +496,6 @@ entry:
 define <vscale x 1 x i64> @vleNff(i64* %str, i64 %n, i64 %x) {
 ; CHECK-LABEL: vleNff:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
 ; CHECK-NEXT:    vle64ff.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vl
@@ -526,7 +523,6 @@ define <vscale x 2 x i32> @avl_forward5(<vscale x 2 x i32>* %addr) {
 ; CHECK-LABEL: avl_forward5:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    ret




More information about the llvm-commits mailing list