[llvm] [RISCV] Introduce local peephole to reduce VLs based on demanded VL (PR #104689)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Aug 17 18:17:19 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-risc-v
Author: Philip Reames (preames)
This is a fairly narrow transform (at the moment) to reduce the VLs of instructions feeding a store with a smaller VL. Note that the goal of this transform isn't really to reduce VL - it's to reduce VL *toggles*. To our knowledge, small reductions in VL without also changing LMUL are generally not profitable on existing hardware.
For a single-use instruction without side effects, FP exceptions, or a result dependency on VL, reducing VL is legal if only a subset of its elements are demanded. We'd already implemented this logic for vmv.v.v, and this patch simply applies it to stores as an alternate root.
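Concretely, taking the fadd_v6f16 test change from the diff below (a sketch of the generated code, not the full CHECK lines): before this patch the loads and store run at the demanded VL of 6 while the vfadd.vv has been promoted to VL=8, costing a toggle to 8 and back; once the single-use vfadd.vv is narrowed to the store's VL, both extra vsetivli toggles vanish.

```asm
# Before: the promoted vfadd forces two extra VL toggles
vsetivli zero, 6, e16, m1, ta, ma
vle16.v  v8, (a0)
vle16.v  v9, (a1)
vsetivli zero, 8, e16, m1, ta, ma
vfadd.vv v8, v8, v9
vsetivli zero, 6, e16, m1, ta, ma
vse16.v  v8, (a0)
ret

# After: the vfadd's VL is reduced to the store's demanded VL of 6
vsetivli zero, 6, e16, m1, ta, ma
vle16.v  v8, (a0)
vle16.v  v9, (a1)
vfadd.vv v8, v8, v9
vse16.v  v8, (a0)
ret
```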
Longer term, I plan to extend this to other root instructions (e.g. other kinds of stores, reductions, etc.) and to add a more general recursive walk back through operands.
One risk with the dataflow-based approach is that we could reduce the VL of an instruction scheduled in a region with the wider VL (i.e. mixed-mode computations), forcing an additional VL toggle. An example of this is the @insert_subvector_dag_loop test case, but it doesn't appear to happen widely. I think this is a risk we should accept.
---
Patch is 50.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/104689.diff
15 Files Affected:
- (modified) llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp (+102-48)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll (+1-1)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll (+5-59)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll (-4)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll (-4)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll (+4-3)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll (+1-1)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll (+1-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll (+6-56)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-non-power-of-2.ll (-16)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll (+1-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll (+8-16)
- (modified) llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll (+1-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll (+2-8)
- (modified) llvm/test/CodeGen/RISCV/rvv/zve32-types.ll (-8)
``````````diff
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index 2abed1ac984e35..b2891376ea5e47 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -61,6 +61,7 @@ class RISCVVectorPeephole : public MachineFunctionPass {
}
private:
+ bool tryToReduceVL(MachineInstr &MI) const;
bool convertToVLMAX(MachineInstr &MI) const;
bool convertToWholeRegister(MachineInstr &MI) const;
bool convertToUnmasked(MachineInstr &MI) const;
@@ -81,6 +82,101 @@ char RISCVVectorPeephole::ID = 0;
INITIALIZE_PASS(RISCVVectorPeephole, DEBUG_TYPE, "RISC-V Fold Masks", false,
false)
+
+/// Given two VL operands, returns the one known to be the smallest or nullptr
+/// if unknown.
+static const MachineOperand *getKnownMinVL(const MachineOperand *LHS,
+ const MachineOperand *RHS) {
+ if (LHS->isReg() && RHS->isReg() && LHS->getReg().isVirtual() &&
+ LHS->getReg() == RHS->getReg())
+ return LHS;
+ if (LHS->isImm() && LHS->getImm() == RISCV::VLMaxSentinel)
+ return RHS;
+ if (RHS->isImm() && RHS->getImm() == RISCV::VLMaxSentinel)
+ return LHS;
+ if (!LHS->isImm() || !RHS->isImm())
+ return nullptr;
+ return LHS->getImm() <= RHS->getImm() ? LHS : RHS;
+}
+
+static unsigned getSEWLMULRatio(const MachineInstr &MI) {
+ RISCVII::VLMUL LMUL = RISCVII::getLMul(MI.getDesc().TSFlags);
+ unsigned Log2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
+ return RISCVVType::getSEWLMULRatio(1 << Log2SEW, LMUL);
+}
+
+// Attempt to reduce the VL of an instruction whose sole use is feeding an
+// instruction with a narrower VL. This currently works backwards from the
+// user instruction (which might have a smaller VL).
+bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const {
+ // Note that the goal here is a bit multifaceted.
+ // 1) For stores, reducing the VL of the value being stored may help to
+ // reduce VL toggles. This is somewhat of an artifact of the fact we
+ // promote arithmetic instructions but VL predicate stores.
+ // 2) For vmv.v.v, reducing VL eagerly on the source instruction allows us
+ // to share code with the foldVMV_V_V transform below.
+ //
+ // Note that to the best of our knowledge, reducing VL is generally not
+ // a significant win on real hardware unless we can also reduce LMUL which
+ // this code doesn't try to do.
+ //
+ // TODO: We can handle a bunch more instructions here, and probably
+ // recurse backwards through operands too.
+ unsigned SrcIdx = 0;
+ switch (RISCV::getRVVMCOpcode(MI.getOpcode())) {
+ default:
+ return false;
+ case RISCV::VSE8_V:
+ case RISCV::VSE16_V:
+ case RISCV::VSE32_V:
+ case RISCV::VSE64_V:
+ break;
+ case RISCV::VMV_V_V:
+ SrcIdx = 2;
+ break;
+ }
+
+ MachineOperand &VL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc()));
+ if (VL.isImm() && VL.getImm() == RISCV::VLMaxSentinel)
+ return false;
+
+ Register SrcReg = MI.getOperand(SrcIdx).getReg();
+ // Note: one *use*, not one *user*.
+ if (!MRI->hasOneUse(SrcReg))
+ return false;
+
+ MachineInstr *Src = MRI->getVRegDef(SrcReg);
+ if (!Src || Src->hasUnmodeledSideEffects() ||
+ Src->getParent() != MI.getParent() || Src->getNumDefs() != 1 ||
+ !RISCVII::hasVLOp(Src->getDesc().TSFlags) ||
+ !RISCVII::hasSEWOp(Src->getDesc().TSFlags))
+ return false;
+
+ // Src needs to have the same VLMAX as MI
+ if (getSEWLMULRatio(MI) != getSEWLMULRatio(*Src))
+ return false;
+
+ bool ActiveElementsAffectResult = RISCVII::activeElementsAffectResult(
+ TII->get(RISCV::getRVVMCOpcode(Src->getOpcode())).TSFlags);
+ if (ActiveElementsAffectResult || Src->mayRaiseFPException())
+ return false;
+
+ MachineOperand &SrcVL =
+ Src->getOperand(RISCVII::getVLOpNum(Src->getDesc()));
+ const MachineOperand *MinVL = getKnownMinVL(&VL, &SrcVL);
+ if (!MinVL || MinVL == &SrcVL)
+ return false;
+
+ if (MinVL->isImm())
+ SrcVL.ChangeToImmediate(MinVL->getImm());
+ else if (MinVL->isReg())
+ SrcVL.ChangeToRegister(MinVL->getReg(), false);
+
+ // TODO: For instructions with a passthru, we could clear the passthru
+ // and tail policy since we've just proven the tail is not demanded.
+ return true;
+}
+
/// Check if an operand is an immediate or a materialized ADDI $x0, imm.
std::optional<unsigned>
RISCVVectorPeephole::getConstant(const MachineOperand &VL) const {
@@ -325,22 +421,6 @@ bool RISCVVectorPeephole::convertToUnmasked(MachineInstr &MI) const {
return true;
}
-/// Given two VL operands, returns the one known to be the smallest or nullptr
-/// if unknown.
-static const MachineOperand *getKnownMinVL(const MachineOperand *LHS,
- const MachineOperand *RHS) {
- if (LHS->isReg() && RHS->isReg() && LHS->getReg().isVirtual() &&
- LHS->getReg() == RHS->getReg())
- return LHS;
- if (LHS->isImm() && LHS->getImm() == RISCV::VLMaxSentinel)
- return RHS;
- if (RHS->isImm() && RHS->getImm() == RISCV::VLMaxSentinel)
- return LHS;
- if (!LHS->isImm() || !RHS->isImm())
- return nullptr;
- return LHS->getImm() <= RHS->getImm() ? LHS : RHS;
-}
-
/// Check if it's safe to move From down to To, checking that no physical
/// registers are clobbered.
static bool isSafeToMove(const MachineInstr &From, const MachineInstr &To) {
@@ -362,12 +442,6 @@ static bool isSafeToMove(const MachineInstr &From, const MachineInstr &To) {
return From.isSafeToMove(SawStore);
}
-static unsigned getSEWLMULRatio(const MachineInstr &MI) {
- RISCVII::VLMUL LMUL = RISCVII::getLMul(MI.getDesc().TSFlags);
- unsigned Log2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
- return RISCVVType::getSEWLMULRatio(1 << Log2SEW, LMUL);
-}
-
/// If a PseudoVMV_V_V is the only user of its input, fold its passthru and VL
/// into it.
///
@@ -404,33 +478,17 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) {
SrcPassthru.getReg() != Passthru.getReg())
return false;
- // Because Src and MI have the same passthru, we can use either AVL as long as
- // it's the smaller of the two.
- //
- // (src pt, ..., vl=5) x x x x x|. . .
- // (vmv.v.v pt, src, vl=3) x x x|. . . . .
- // ->
- // (src pt, ..., vl=3) x x x|. . . . .
- //
- // (src pt, ..., vl=3) x x x|. . . . .
- // (vmv.v.v pt, src, vl=6) x x x . . .|. .
- // ->
- // (src pt, ..., vl=3) x x x|. . . . .
+ // Src VL will have already been reduced if legal (see tryToReduceVL),
+ // so we don't need to handle a smaller source VL here. However, the
+ // user's VL may be larger.
MachineOperand &SrcVL = Src->getOperand(RISCVII::getVLOpNum(Src->getDesc()));
const MachineOperand *MinVL = getKnownMinVL(&MI.getOperand(3), &SrcVL);
- if (!MinVL)
- return false;
-
- bool VLChanged = !MinVL->isIdenticalTo(SrcVL);
- bool ActiveElementsAffectResult = RISCVII::activeElementsAffectResult(
- TII->get(RISCV::getRVVMCOpcode(Src->getOpcode())).TSFlags);
-
- if (VLChanged && (ActiveElementsAffectResult || Src->mayRaiseFPException()))
+ if (!MinVL || !MinVL->isIdenticalTo(SrcVL))
return false;
// If Src ends up using MI's passthru/VL, move it so it can access it.
// TODO: We don't need to do this if they already dominate Src.
- if (!SrcVL.isIdenticalTo(*MinVL) || !SrcPassthru.isIdenticalTo(Passthru)) {
+ if (!SrcPassthru.isIdenticalTo(Passthru)) {
if (!isSafeToMove(*Src, MI))
return false;
Src->moveBefore(&MI);
@@ -445,11 +503,6 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) {
*Src->getParent()->getParent()));
}
- if (MinVL->isImm())
- SrcVL.ChangeToImmediate(MinVL->getImm());
- else if (MinVL->isReg())
- SrcVL.ChangeToRegister(MinVL->getReg(), false);
-
// Use a conservative tu,mu policy, RISCVInsertVSETVLI will relax it if
// passthru is undef.
Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc()))
@@ -498,6 +551,7 @@ bool RISCVVectorPeephole::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : make_early_inc_range(MBB)) {
Changed |= convertToVLMAX(MI);
+ Changed |= tryToReduceVL(MI);
Changed |= convertToUnmasked(MI);
Changed |= convertToWholeRegister(MI);
Changed |= convertVMergeToVMv(MI);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll
index f607add17b4b9d..ac7d3d9109e39c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll
@@ -41,8 +41,8 @@ define void @abs_v6i16(ptr %x) {
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vrsub.vi v9, v8, 0
-; CHECK-NEXT: vmax.vv v8, v8, v9
; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
+; CHECK-NEXT: vmax.vv v8, v8, v9
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
%a = load <6 x i16>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
index d25312268ada62..a6e224d475a312 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
@@ -47,9 +47,7 @@ define void @fadd_v6f16(ptr %x, ptr %y) {
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
; ZVFH-NEXT: vle16.v v9, (a1)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfadd.vv v8, v8, v9
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -173,9 +171,7 @@ define void @fsub_v6f16(ptr %x, ptr %y) {
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
; ZVFH-NEXT: vle16.v v9, (a1)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfsub.vv v8, v8, v9
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -299,9 +295,7 @@ define void @fmul_v6f16(ptr %x, ptr %y) {
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
; ZVFH-NEXT: vle16.v v9, (a1)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfmul.vv v8, v8, v9
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -425,9 +419,7 @@ define void @fdiv_v6f16(ptr %x, ptr %y) {
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
; ZVFH-NEXT: vle16.v v9, (a1)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfdiv.vv v8, v8, v9
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -546,9 +538,7 @@ define void @fneg_v6f16(ptr %x) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfneg.v v8, v8
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -658,9 +648,7 @@ define void @fabs_v6f16(ptr %x) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v8, v8
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -778,9 +766,7 @@ define void @copysign_v6f16(ptr %x, ptr %y) {
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
; ZVFH-NEXT: vle16.v v9, (a1)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfsgnj.vv v8, v8, v9
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -911,9 +897,7 @@ define void @copysign_vf_v6f16(ptr %x, half %y) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfsgnj.vf v8, v8, fa0
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -1053,9 +1037,7 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) {
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
; ZVFH-NEXT: vle16.v v9, (a1)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfsgnjn.vv v8, v8, v9
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -1204,8 +1186,8 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) {
; ZVFH-NEXT: vle16.v v9, (a0)
; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVFH-NEXT: vfncvt.f.f.w v10, v8
-; ZVFH-NEXT: vfsgnjn.vv v8, v9, v10
; ZVFH-NEXT: vsetivli zero, 3, e16, mf2, ta, ma
+; ZVFH-NEXT: vfsgnjn.vv v8, v9, v10
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -1334,9 +1316,7 @@ define void @sqrt_v6f16(ptr %x) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfsqrt.v v8, v8
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -1459,9 +1439,7 @@ define void @fma_v6f16(ptr %x, ptr %y, ptr %z) {
; ZVFH-NEXT: vle16.v v8, (a0)
; ZVFH-NEXT: vle16.v v9, (a1)
; ZVFH-NEXT: vle16.v v10, (a2)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfmacc.vv v10, v8, v9
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v10, (a0)
; ZVFH-NEXT: ret
;
@@ -1609,9 +1587,7 @@ define void @fmsub_v6f16(ptr %x, ptr %y, ptr %z) {
; ZVFH-NEXT: vle16.v v8, (a0)
; ZVFH-NEXT: vle16.v v9, (a1)
; ZVFH-NEXT: vle16.v v10, (a2)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfmsac.vv v10, v8, v9
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v10, (a0)
; ZVFH-NEXT: ret
;
@@ -2246,9 +2222,7 @@ define void @fadd_vf_v6f16(ptr %x, half %y) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfadd.vf v8, v8, fa0
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -2386,9 +2360,7 @@ define void @fadd_fv_v6f16(ptr %x, half %y) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfadd.vf v8, v8, fa0
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -2526,9 +2498,7 @@ define void @fsub_vf_v6f16(ptr %x, half %y) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfsub.vf v8, v8, fa0
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -2666,9 +2636,7 @@ define void @fsub_fv_v6f16(ptr %x, half %y) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfrsub.vf v8, v8, fa0
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -2806,9 +2774,7 @@ define void @fmul_vf_v6f16(ptr %x, half %y) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfmul.vf v8, v8, fa0
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -2946,9 +2912,7 @@ define void @fmul_fv_v6f16(ptr %x, half %y) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfmul.vf v8, v8, fa0
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -3086,9 +3050,7 @@ define void @fdiv_vf_v6f16(ptr %x, half %y) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfdiv.vf v8, v8, fa0
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -3226,9 +3188,7 @@ define void @fdiv_fv_v6f16(ptr %x, half %y) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfrdiv.vf v8, v8, fa0
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -3371,9 +3331,7 @@ define void @fma_vf_v6f16(ptr %x, ptr %y, half %z) {
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
; ZVFH-NEXT: vle16.v v9, (a1)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfmacc.vf v9, fa0, v8
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v9, (a0)
; ZVFH-NEXT: ret
;
@@ -3526,9 +3484,7 @@ define void @fma_fv_v6f16(ptr %x, ptr %y, half %z) {
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
; ZVFH-NEXT: vle16.v v9, (a1)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfmacc.vf v9, fa0, v8
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v9, (a0)
; ZVFH-NEXT: ret
;
@@ -3687,9 +3643,7 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) {
; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vle16.v v8, (a0)
; ZVFH-NEXT: vle16.v v9, (a1)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfmsac.vf v9, fa0, v8
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v9, (a0)
; ZVFH-NEXT: ret
;
@@ -3893,9 +3847,8 @@ define void @trunc_v6f16(ptr %x) {
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
; ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t
; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, mu
; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
%a = load <6 x half>, ptr %x
@@ -4023,9 +3976,8 @@ define void @ceil_v6f16(ptr %x) {
; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
; ZVFH-NEXT: fsrm a1
; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, mu
; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -4210,9 +4162,8 @@ define void @floor_v6f16(ptr %x) {
; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
; ZVFH-NEXT: fsrm a1
; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, mu
; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -4397,9 +4348,8 @@ define void @round_v6f16(ptr %x) {
; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
; ZVFH-NEXT: fsrm a1
; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, mu
; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
@@ -4782,9 +4732,7 @@ define void @fmuladd_v6f16(ptr %x, ptr %y, ptr %z) {
; ZVFH-NEXT: vle16.v v8, (a0)
; ZVFH-NEXT: vle16.v v9, (a1)
; ZVFH-NEXT: vle16.v v10, (a2)
-; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vfmacc.vv v10, v8, v9
-; ZVFH-NEX...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/104689
More information about the llvm-commits
mailing list