[llvm] 1b56b2b - [RISCV] Transform VMERGE_VVM_<LMUL>_TU with all ones mask to VADD_VI_<LMUL>_TU.

Yeting Kuo via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 13 19:01:47 PDT 2022


Author: Yeting Kuo
Date: 2022-09-14T10:01:37+08:00
New Revision: 1b56b2b2678cde21f7c20e83f881ded9b96518e4

URL: https://github.com/llvm/llvm-project/commit/1b56b2b2678cde21f7c20e83f881ded9b96518e4
DIFF: https://github.com/llvm/llvm-project/commit/1b56b2b2678cde21f7c20e83f881ded9b96518e4.diff

LOG: [RISCV] Transform VMERGE_VVM_<LMUL>_TU with all ones mask to VADD_VI_<LMUL>_TU.

The transformation is beneficial because vmerge.vvm always needs a mask operand but
vadd.vi may not.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D133255

Added: 
    llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vadd.ll

Modified: 
    llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
    llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 52dd41d7798f..dcdcb155f904 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2505,18 +2505,8 @@ bool RISCVDAGToDAGISel::doPeepholeSExtW(SDNode *N) {
   return false;
 }
 
-// Optimize masked RVV pseudo instructions with a known all-ones mask to their
-// corresponding "unmasked" pseudo versions. The mask we're interested in will
-// take the form of a V0 physical register operand, with a glued
-// register-setting instruction.
-bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(SDNode *N) {
-  const RISCV::RISCVMaskedPseudoInfo *I =
-      RISCV::getMaskedPseudoInfo(N->getMachineOpcode());
-  if (!I)
-    return false;
-
-  unsigned MaskOpIdx = I->MaskOpIdx;
-
+// Return true if we can make sure mask of N is all-ones mask.
+static bool usesAllOnesMask(SDNode *N, unsigned MaskOpIdx) {
   // Check that we're using V0 as a mask register.
   if (!isa<RegisterSDNode>(N->getOperand(MaskOpIdx)) ||
       cast<RegisterSDNode>(N->getOperand(MaskOpIdx))->getReg() != RISCV::V0)
@@ -2546,7 +2536,23 @@ bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(SDNode *N) {
   // TODO: Check that the VMSET is the expected bitwidth? The pseudo has
   // undefined behaviour if it's the wrong bitwidth, so we could choose to
   // assume that it's all-ones? Same applies to its VL.
-  if (!MaskSetter->isMachineOpcode() || !IsVMSet(MaskSetter.getMachineOpcode()))
+  return MaskSetter->isMachineOpcode() &&
+         IsVMSet(MaskSetter.getMachineOpcode());
+}
+
+// Optimize masked RVV pseudo instructions with a known all-ones mask to their
+// corresponding "unmasked" pseudo versions. The mask we're interested in will
+// take the form of a V0 physical register operand, with a glued
+// register-setting instruction.
+bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(SDNode *N) {
+  const RISCV::RISCVMaskedPseudoInfo *I =
+      RISCV::getMaskedPseudoInfo(N->getMachineOpcode());
+  if (!I)
+    return false;
+
+  unsigned MaskOpIdx = I->MaskOpIdx;
+
+  if (!usesAllOnesMask(N, MaskOpIdx))
     return false;
 
   // Retrieve the tail policy operand index, if any.
@@ -2600,6 +2606,7 @@ bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(SDNode *N) {
   }
 
   // Transitively apply any node glued to our new node.
+  const auto *Glued = N->getGluedNode();
   if (auto *TGlued = Glued->getGluedNode())
     Ops.push_back(SDValue(TGlued, TGlued->getNumValues() - 1));
 
@@ -2614,121 +2621,167 @@ bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(SDNode *N) {
 // peephole only deals with VMERGE_VVM which is TU and has false operand same as
 // its true operand now. E.g. (VMERGE_VVM_M1_TU False, False, (VADD_M1 ...),
 // ...) -> (VADD_VV_M1_MASK)
-bool RISCVDAGToDAGISel::doPeepholeMergeVVMFold() {
-  bool MadeChange = false;
-  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
+  SDValue Merge = N->getOperand(0);
+  SDValue True = N->getOperand(2);
+  SDValue Mask = N->getOperand(3);
+  SDValue VL = N->getOperand(4);
 
-  while (Position != CurDAG->allnodes_begin()) {
-    SDNode *N = &*--Position;
-    if (N->use_empty() || !N->isMachineOpcode())
-      continue;
+  assert(True.getResNo() == 0 &&
+         "Expect True is the first output of an instruction.");
 
-    auto IsVMergeTU = [](unsigned Opcode) {
-      return Opcode == RISCV::PseudoVMERGE_VVM_MF8_TU ||
-             Opcode == RISCV::PseudoVMERGE_VVM_MF4_TU ||
-             Opcode == RISCV::PseudoVMERGE_VVM_MF2_TU ||
-             Opcode == RISCV::PseudoVMERGE_VVM_M1_TU ||
-             Opcode == RISCV::PseudoVMERGE_VVM_M2_TU ||
-             Opcode == RISCV::PseudoVMERGE_VVM_M4_TU ||
-             Opcode == RISCV::PseudoVMERGE_VVM_M8_TU;
-    };
+  // Need N is the exactly one using True.
+  if (!True.hasOneUse())
+    return false;
 
-    unsigned Opc = N->getMachineOpcode();
-    // TODO: Also deal with TA VMerge nodes.
-    if (!IsVMergeTU(Opc))
-      continue;
+  if (!True.isMachineOpcode())
+    return false;
 
-    SDValue Merge = N->getOperand(0);
-    SDValue False = N->getOperand(1);
-    SDValue True = N->getOperand(2);
-    SDValue Mask = N->getOperand(3);
-    SDValue VL = N->getOperand(4);
+  unsigned TrueOpc = True.getMachineOpcode();
 
-    if (Merge != False)
-      continue;
+  // Skip if True has merge operand.
+  // TODO: Deal with True having same merge operand with N.
+  if (RISCVII::hasMergeOp(TII->get(TrueOpc).TSFlags))
+    return false;
 
-    assert(True.getResNo() == 0 &&
-           "Expect True is the first output of an instruction.");
+  // Skip if True has side effect.
+  // TODO: Support velff and vlsegff.
+  if (TII->get(TrueOpc).hasUnmodeledSideEffects())
+    return false;
 
-    // Need N is the exactly one using True.
-    if (!True.hasOneUse())
-      continue;
+  // Only deal with True when True is unmasked intrinsic now.
+  const RISCV::RISCVMaskedPseudoInfo *Info =
+      RISCV::lookupMaskedIntrinsicByUnmaskedTA(TrueOpc);
 
-    if (!True.isMachineOpcode())
-      continue;
+  if (!Info)
+    return false;
 
-    unsigned TrueOpc = True.getMachineOpcode();
+  // The last operand of unmasked intrinsic should be sew or chain.
+  bool HasChainOp =
+      True.getOperand(True.getNumOperands() - 1).getValueType() == MVT::Other;
 
-    // Skip if True has merge operand.
-    // TODO: Deal with True having same merge operand with N.
-    if (RISCVII::hasMergeOp(TII->get(TrueOpc).TSFlags))
-      continue;
+  // Need True has same VL with N.
+  unsigned TrueVLIndex = True.getNumOperands() - HasChainOp - 2;
+  SDValue TrueVL = True.getOperand(TrueVLIndex);
 
-    // Skip if True has side effect.
-    // TODO: Support velff and vlsegff.
-    if (TII->get(TrueOpc).hasUnmodeledSideEffects())
-      continue;
+  auto IsNoFPExcept = [this](SDValue N) {
+    return !this->mayRaiseFPException(N.getNode()) ||
+           N->getFlags().hasNoFPExcept();
+  };
 
-    // Only deal with True when True is unmasked intrinsic now.
-    const RISCV::RISCVMaskedPseudoInfo *Info =
-        RISCV::lookupMaskedIntrinsicByUnmaskedTA(TrueOpc);
+  // Allow the peephole for non-exception True with VLMAX vector length, since
+  // all the values after VL of N are dependent on Merge. VLMAX should be
+  // lowered to (XLenVT -1).
+  if (TrueVL != VL && !(IsNoFPExcept(True) && isAllOnesConstant(TrueVL)))
+    return false;
 
-    if (!Info)
-      continue;
+  SDLoc DL(N);
+  unsigned MaskedOpc = Info->MaskedPseudo;
+  assert(RISCVII::hasVecPolicyOp(TII->get(MaskedOpc).TSFlags) &&
+         "Expected instructions with mask have policy operand.");
 
-    // The last operand of unmasked intrinsic should be sew or chain.
-    bool HasChainOp =
-        True.getOperand(True.getNumOperands() - 1).getValueType() == MVT::Other;
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Merge);
+  Ops.append(True->op_begin(), True->op_begin() + TrueVLIndex);
+  Ops.append({Mask, VL, /* SEW */ True.getOperand(TrueVLIndex + 1)});
+  Ops.push_back(
+      CurDAG->getTargetConstant(/* TUMU */ 0, DL, Subtarget->getXLenVT()));
 
-    // Need True has same VL with N.
-    unsigned TrueVLIndex = True.getNumOperands() - HasChainOp - 2;
-    SDValue TrueVL = True.getOperand(TrueVLIndex);
+  // Result node should have chain operand of True.
+  if (HasChainOp)
+    Ops.push_back(True.getOperand(True.getNumOperands() - 1));
 
-    auto IsNoFPExcept = [this](SDValue N) {
-      return !this->mayRaiseFPException(N.getNode()) ||
-             N->getFlags().hasNoFPExcept();
-    };
+  // Result node should take over glued node of N.
+  if (N->getGluedNode())
+    Ops.push_back(N->getOperand(N->getNumOperands() - 1));
 
-    // Allow the peephole for non-exception True with VLMAX vector length, since
-    // all the values after VL of N are dependent on Merge. VLMAX should be
-    // lowered to (XLenVT -1).
-    if (TrueVL != VL && !(IsNoFPExcept(True) && isAllOnesConstant(TrueVL)))
-      continue;
+  SDNode *Result =
+      CurDAG->getMachineNode(MaskedOpc, DL, True->getVTList(), Ops);
+  Result->setFlags(True->getFlags());
 
-    SDLoc DL(N);
-    unsigned MaskedOpc = Info->MaskedPseudo;
-    assert(RISCVII::hasVecPolicyOp(TII->get(MaskedOpc).TSFlags) &&
-           "Expected instructions with mask have policy operand.");
+  // Replace vmerge.vvm node by Result.
+  ReplaceUses(SDValue(N, 0), SDValue(Result, 0));
 
-    SmallVector<SDValue, 8> Ops;
-    Ops.push_back(Merge);
-    Ops.append(True->op_begin(), True->op_begin() + TrueVLIndex);
-    Ops.append({Mask, VL, /* SEW */ True.getOperand(TrueVLIndex + 1)});
-    Ops.push_back(
-        CurDAG->getTargetConstant(/* TUMU */ 0, DL, Subtarget->getXLenVT()));
+  // Replace another value of True. E.g. chain and VL.
+  for (unsigned Idx = 1; Idx < True->getNumValues(); ++Idx)
+    ReplaceUses(True.getValue(Idx), SDValue(Result, Idx));
 
-    // Result node should have chain operand of True.
-    if (HasChainOp)
-      Ops.push_back(True.getOperand(True.getNumOperands() - 1));
+  // Try to transform Result to unmasked intrinsic.
+  doPeepholeMaskedRVV(Result);
+  return true;
+}
 
-    // Result node should take over glued node of N.
-    if (N->getGluedNode())
-      Ops.push_back(N->getOperand(N->getNumOperands() - 1));
+// Transform (VMERGE_VVM_<LMUL>_TU false, false, true, allones, vl, sew) to
+// (VADD_VI_<LMUL>_TU false, true, 0, vl, sew). It may decrease uses of VMSET.
+bool RISCVDAGToDAGISel::performVMergeToVAdd(SDNode *N) {
+  unsigned NewOpc;
+  switch (N->getMachineOpcode()) {
+  default:
+    llvm_unreachable("Expected VMERGE_VVM_<LMUL>_TU instruction.");
+  case RISCV::PseudoVMERGE_VVM_MF8_TU:
+    NewOpc = RISCV::PseudoVADD_VI_MF8_TU;
+    break;
+  case RISCV::PseudoVMERGE_VVM_MF4_TU:
+    NewOpc = RISCV::PseudoVADD_VI_MF4_TU;
+    break;
+  case RISCV::PseudoVMERGE_VVM_MF2_TU:
+    NewOpc = RISCV::PseudoVADD_VI_MF2_TU;
+    break;
+  case RISCV::PseudoVMERGE_VVM_M1_TU:
+    NewOpc = RISCV::PseudoVADD_VI_M1_TU;
+    break;
+  case RISCV::PseudoVMERGE_VVM_M2_TU:
+    NewOpc = RISCV::PseudoVADD_VI_M2_TU;
+    break;
+  case RISCV::PseudoVMERGE_VVM_M4_TU:
+    NewOpc = RISCV::PseudoVADD_VI_M4_TU;
+    break;
+  case RISCV::PseudoVMERGE_VVM_M8_TU:
+    NewOpc = RISCV::PseudoVADD_VI_M8_TU;
+    break;
+  }
 
-    SDNode *Result =
-        CurDAG->getMachineNode(MaskedOpc, DL, True->getVTList(), Ops);
-    Result->setFlags(True->getFlags());
+  if (!usesAllOnesMask(N, /* MaskOpIdx */ 3))
+    return false;
 
-    // Replace vmerge.vvm node by Result.
-    ReplaceUses(SDValue(N, 0), SDValue(Result, 0));
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue Ops[] = {N->getOperand(1), N->getOperand(2),
+                   CurDAG->getTargetConstant(0, DL, Subtarget->getXLenVT()),
+                   N->getOperand(4), N->getOperand(5)};
+  SDNode *Result = CurDAG->getMachineNode(NewOpc, DL, VT, Ops);
+  ReplaceUses(N, Result);
+  return true;
+}
+
+bool RISCVDAGToDAGISel::doPeepholeMergeVVMFold() {
+  bool MadeChange = false;
+  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
 
-    // Replace another value of True. E.g. chain and VL.
-    for (unsigned Idx = 1; Idx < True->getNumValues(); ++Idx)
-      ReplaceUses(True.getValue(Idx), SDValue(Result, Idx));
+  while (Position != CurDAG->allnodes_begin()) {
+    SDNode *N = &*--Position;
+    if (N->use_empty() || !N->isMachineOpcode())
+      continue;
+
+    auto IsVMergeTU = [](unsigned Opcode) {
+      return Opcode == RISCV::PseudoVMERGE_VVM_MF8_TU ||
+             Opcode == RISCV::PseudoVMERGE_VVM_MF4_TU ||
+             Opcode == RISCV::PseudoVMERGE_VVM_MF2_TU ||
+             Opcode == RISCV::PseudoVMERGE_VVM_M1_TU ||
+             Opcode == RISCV::PseudoVMERGE_VVM_M2_TU ||
+             Opcode == RISCV::PseudoVMERGE_VVM_M4_TU ||
+             Opcode == RISCV::PseudoVMERGE_VVM_M8_TU;
+    };
+
+    unsigned Opc = N->getMachineOpcode();
+    // The following optimizations require that the merge operand of N is same
+    // as the false operand of N.
+    // TODO: Also deal with TA VMerge nodes.
+    if (!IsVMergeTU(Opc) || N->getOperand(0) != N->getOperand(1))
+      continue;
 
-    // Try to transform Result to unmasked intrinsic.
-    doPeepholeMaskedRVV(Result);
-    MadeChange = true;
+    MadeChange |= performCombineVMergeAndVOps(N);
+    MadeChange |= performVMergeToVAdd(N);
   }
   return MadeChange;
 }

diff  --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index cd8064bc056c..efe8b0ddb4e6 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -133,6 +133,8 @@ class RISCVDAGToDAGISel : public SelectionDAGISel {
   bool doPeepholeSExtW(SDNode *Node);
   bool doPeepholeMaskedRVV(SDNode *Node);
   bool doPeepholeMergeVVMFold();
+  bool performVMergeToVAdd(SDNode *N);
+  bool performCombineVMergeAndVOps(SDNode *N);
 };
 
 namespace RISCV {

diff  --git a/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vadd.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vadd.ll
new file mode 100644
index 000000000000..a2eac9956ad5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vadd.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s
+define <vscale x 1 x i8> @vpmerge_mf8(<vscale x 1 x i8> %x, <vscale x 1 x i8> %y, i32 zeroext %vl) {
+; CHECK-LABEL: vpmerge_mf8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, mu
+; CHECK-NEXT:    vadd.vi v8, v9, 0
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i8 0
+  %allones = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %1 = call <vscale x 1 x i8> @llvm.vp.merge.nxv1i8(<vscale x 1 x i1> %allones, <vscale x 1 x i8> %y, <vscale x 1 x i8> %x, i32 %vl)
+  ret <vscale x 1 x i8> %1
+}
+
+define <vscale x 2 x i8> @vpmerge_mf4(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, i32 zeroext %vl) {
+; CHECK-LABEL: vpmerge_mf4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, tu, mu
+; CHECK-NEXT:    vadd.vi v8, v9, 0
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 2 x i1> poison, i1 -1, i8 0
+  %allones = shufflevector <vscale x 2 x i1> %splat, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %1 = call <vscale x 2 x i8> @llvm.vp.merge.nxv2i8(<vscale x 2 x i1> %allones, <vscale x 2 x i8> %y, <vscale x 2 x i8> %x, i32 %vl)
+  ret <vscale x 2 x i8> %1
+}
+
+define <vscale x 4 x i8> @vpmerge_mf2(<vscale x 4 x i8> %x, <vscale x 4 x i8> %y, i32 zeroext %vl) {
+; CHECK-LABEL: vpmerge_mf2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, tu, mu
+; CHECK-NEXT:    vadd.vi v8, v9, 0
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 4 x i1> poison, i1 -1, i8 0
+  %allones = shufflevector <vscale x 4 x i1> %splat, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %1 = call <vscale x 4 x i8> @llvm.vp.merge.nxv4i8(<vscale x 4 x i1> %allones, <vscale x 4 x i8> %y, <vscale x 4 x i8> %x, i32 %vl)
+  ret <vscale x 4 x i8> %1
+}
+
+define <vscale x 8 x i8> @vpmerge_m1(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y, i32 zeroext %vl) {
+; CHECK-LABEL: vpmerge_m1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, tu, mu
+; CHECK-NEXT:    vadd.vi v8, v9, 0
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 8 x i1> poison, i1 -1, i8 0
+  %allones = shufflevector <vscale x 8 x i1> %splat, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %1 = call <vscale x 8 x i8> @llvm.vp.merge.nxv8i8(<vscale x 8 x i1> %allones, <vscale x 8 x i8> %y, <vscale x 8 x i8> %x, i32 %vl)
+  ret <vscale x 8 x i8> %1
+}
+
+define <vscale x 8 x i16> @vpmerge_m2(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y, i32 zeroext %vl) {
+; CHECK-LABEL: vpmerge_m2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
+; CHECK-NEXT:    vadd.vi v8, v10, 0
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 8 x i1> poison, i1 -1, i16 0
+  %allones = shufflevector <vscale x 8 x i1> %splat, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %1 = call <vscale x 8 x i16> @llvm.vp.merge.nxv8i16(<vscale x 8 x i1> %allones, <vscale x 8 x i16> %y, <vscale x 8 x i16> %x, i32 %vl)
+  ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i32> @vpmerge_m4(<vscale x 8 x i32> %x, <vscale x 8 x i32> %y, i32 zeroext %vl) {
+; CHECK-LABEL: vpmerge_m4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
+; CHECK-NEXT:    vadd.vi v8, v12, 0
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 8 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 8 x i1> %splat, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %1 = call <vscale x 8 x i32> @llvm.vp.merge.nxv8i32(<vscale x 8 x i1> %allones, <vscale x 8 x i32> %y, <vscale x 8 x i32> %x, i32 %vl)
+  ret <vscale x 8 x i32> %1
+}
+
+define <vscale x 8 x i64> @vpmerge_m8(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y, i32 zeroext %vl) {
+; CHECK-LABEL: vpmerge_m8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, tu, mu
+; CHECK-NEXT:    vadd.vi v8, v16, 0
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 8 x i1> poison, i1 -1, i64 0
+  %allones = shufflevector <vscale x 8 x i1> %splat, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %1 = call <vscale x 8 x i64> @llvm.vp.merge.nxv8i64(<vscale x 8 x i1> %allones, <vscale x 8 x i64> %y, <vscale x 8 x i64> %x, i32 %vl)
+  ret <vscale x 8 x i64> %1
+}
+
+declare <vscale x 1 x i8> @llvm.vp.merge.nxv1i8(<vscale x 1 x i1>, <vscale x 1 x i8>, <vscale x 1 x i8>, i32)
+declare <vscale x 2 x i8> @llvm.vp.merge.nxv2i8(<vscale x 2 x i1>, <vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 4 x i8> @llvm.vp.merge.nxv4i8(<vscale x 4 x i1>, <vscale x 4 x i8>, <vscale x 4 x i8>, i32)
+declare <vscale x 8 x i8> @llvm.vp.merge.nxv8i8(<vscale x 8 x i1>, <vscale x 8 x i8>, <vscale x 8 x i8>, i32)
+declare <vscale x 8 x i16> @llvm.vp.merge.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 8 x i32> @llvm.vp.merge.nxv8i32(<vscale x 8 x i1>, <vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 8 x i64> @llvm.vp.merge.nxv8i64(<vscale x 8 x i1>, <vscale x 8 x i64>, <vscale x 8 x i64>, i32)


        


More information about the llvm-commits mailing list