[llvm] 3297571 - [VE] v256f32|64 fma isel

Simon Moll via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 14 08:00:26 PDT 2022


Author: Simon Moll
Date: 2022-03-14T15:59:13+01:00
New Revision: 3297571e325a73bad615c57ec117a12f190bd6bc

URL: https://github.com/llvm/llvm-project/commit/3297571e325a73bad615c57ec117a12f190bd6bc
DIFF: https://github.com/llvm/llvm-project/commit/3297571e325a73bad615c57ec117a12f190bd6bc.diff

LOG: [VE] v256f32|64 fma isel

Instruction selection and tests for the llvm.fma, llvm.fmuladd, and vp.fma intrinsics.

Reviewed By: kaz7

Differential Revision: https://reviews.llvm.org/D121477

Added: 
    llvm/test/CodeGen/VE/Vector/vec_fma.ll
    llvm/test/CodeGen/VE/Vector/vp_fma.ll
    llvm/test/CodeGen/VE/Vector/vp_fma_merge.ll

Modified: 
    llvm/lib/Target/VE/VVPISelLowering.cpp
    llvm/lib/Target/VE/VVPInstrInfo.td
    llvm/lib/Target/VE/VVPInstrPatternsVec.td
    llvm/lib/Target/VE/VVPNodes.def

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp
index ab6d2aef6f0a1..881efe3568d22 100644
--- a/llvm/lib/Target/VE/VVPISelLowering.cpp
+++ b/llvm/lib/Target/VE/VVPISelLowering.cpp
@@ -92,20 +92,31 @@ SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
                                        VectorV, Mask, AVL, Op->getFlags());
   }
 
-  if (VVPOpcode == VEISD::VVP_SELECT) {
+  switch (VVPOpcode) {
+  default:
+    llvm_unreachable("lowerToVVP called for unexpected SDNode.");
+  case VEISD::VVP_FFMA: {
+    // VE has a swizzled operand order in FMA (compared to LLVM IR and
+    // SDNodes).
+    auto X = Op->getOperand(2);
+    auto Y = Op->getOperand(0);
+    auto Z = Op->getOperand(1);
+    return CDAG.getNode(VVPOpcode, LegalVecVT, {X, Y, Z, Mask, AVL});
+  }
+  case VEISD::VVP_SELECT: {
     auto Mask = Op->getOperand(0);
     auto OnTrue = Op->getOperand(1);
     auto OnFalse = Op->getOperand(2);
     return CDAG.getNode(VVPOpcode, LegalVecVT, {OnTrue, OnFalse, Mask, AVL});
   }
-  if (VVPOpcode == VEISD::VVP_SETCC) {
+  case VEISD::VVP_SETCC: {
     EVT LegalResVT = getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
     auto LHS = Op->getOperand(0);
     auto RHS = Op->getOperand(1);
     auto Pred = Op->getOperand(2);
     return CDAG.getNode(VVPOpcode, LegalResVT, {LHS, RHS, Pred, Mask, AVL});
   }
-  llvm_unreachable("lowerToVVP called for unexpected SDNode.");
+  }
 }
 
 SDValue VETargetLowering::lowerVVP_LOAD_STORE(SDValue Op,

diff  --git a/llvm/lib/Target/VE/VVPInstrInfo.td b/llvm/lib/Target/VE/VVPInstrInfo.td
index 8257033e42d16..594aa613a411f 100644
--- a/llvm/lib/Target/VE/VVPInstrInfo.td
+++ b/llvm/lib/Target/VE/VVPInstrInfo.td
@@ -72,6 +72,17 @@ def SDTFPBinOpVVP : SDTypeProfile<1, 4, [      // vvp_fadd, etc.
   IsVLVT<4>
 ]>;
 
+// TernaryFPOp(x,y,z,mask,vl)
+def SDTFPTernaryOpVVP : SDTypeProfile<1, 5, [
+  SDTCisSameAs<0, 1>,
+  SDTCisSameAs<0, 2>,
+  SDTCisSameAs<0, 3>,
+  SDTCisFP<0>,
+  SDTCisInt<4>,
+  SDTCisSameNumEltsAs<0, 4>,
+  IsVLVT<5>
+]>;
+
 // Select(OnTrue, OnFalse, SelMask, vl)
 def SDTSelectVVP : SDTypeProfile<1, 4, [       // vp_select, vp_merge
   SDTCisVec<0>,
@@ -110,6 +121,12 @@ class vvp_commutative<SDNode RootOp> :
   [(RootOp node:$lhs, node:$rhs, node:$mask, node:$vlen),
    (RootOp node:$rhs, node:$lhs, node:$mask, node:$vlen)]>;
 
+class vvp_fma_commutative<SDNode RootOp> :
+  PatFrags<
+  (ops node:$X, node:$Y, node:$Z, node:$mask, node:$vlen),
+  [(RootOp node:$X, node:$Y, node:$Z, node:$mask, node:$vlen),
+   (RootOp node:$X, node:$Z, node:$Y, node:$mask, node:$vlen)]>;
+
 // VVP node definitions.
 def vvp_add    : SDNode<"VEISD::VVP_ADD",  SDTIntBinOpVVP>;
 def c_vvp_add  : vvp_commutative<vvp_add>;
@@ -142,6 +159,9 @@ def vvp_fmul    : SDNode<"VEISD::VVP_FMUL",  SDTFPBinOpVVP>;
 def c_vvp_fmul  : vvp_commutative<vvp_fmul>;
 def vvp_fdiv    : SDNode<"VEISD::VVP_FDIV",  SDTFPBinOpVVP>;
 
+def vvp_ffma    : SDNode<"VEISD::VVP_FFMA",  SDTFPTernaryOpVVP>;
+def c_vvp_ffma  : vvp_fma_commutative<vvp_ffma>;
+
 def vvp_scatter : SDNode<"VEISD::VVP_SCATTER",  SDTScatterVVP,
                          [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 def vvp_gather  : SDNode<"VEISD::VVP_GATHER",  SDTGatherVVP,

diff  --git a/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
index f25fe6561d5cd..0efbb4d3a1a6e 100644
--- a/llvm/lib/Target/VE/VVPInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
@@ -376,6 +376,118 @@ defm : Binary_rv_vv<c_vvp_fmul,
 defm : Binary_rv_vv<vvp_fsub,
                     i64, v512f32, v512i1, "PVFSUB">;
 
+multiclass Ternary_vvv<
+    SDPatternOperator OpNode, ValueType DataVT,
+    ValueType MaskVT, string OpBaseName> {
+  // Masked with passthru.
+  def : Pat<(vvp_select
+              (OpNode DataVT:$vx, DataVT:$vy, DataVT:$vz,
+                (MaskVT srcvalue), (i32 srcvalue)),
+              DataVT:$vfalse,
+              MaskVT:$mask,
+              i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vvvml_v")
+              $vx, $vy, $vz, $mask, $avl, $vfalse)>;
+
+  // Unmasked.
+  def : Pat<(OpNode DataVT:$vx, DataVT:$vy, DataVT:$vz,
+              (MaskVT true_mask), i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vvvl")
+              $vx, $vy, $vz, $avl)>;
+
+  // Masked.
+  def : Pat<(OpNode DataVT:$vx, DataVT:$vy, DataVT:$vz,
+              MaskVT:$mask, i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vvvml")
+              $vx, $vy, $vz, $mask, $avl)>;
+}
+
+multiclass Ternary_rvv<
+    SDPatternOperator OpNode,
+    ValueType ScalarVT, ValueType DataVT,
+    ValueType MaskVT, string OpBaseName> {
+  // Masked with passthru, broadcast first.
+  def : Pat<(vvp_select
+              (OpNode
+                (any_broadcast ScalarVT:$sx), DataVT:$vy, DataVT:$vz,
+                (MaskVT srcvalue), (i32 srcvalue)),
+              DataVT:$vfalse,
+              MaskVT:$mask,
+              i32:$avl),
+            (!cast<Instruction>(OpBaseName#"rvvml_v")
+              $sx, $vy, $vz, $mask, $avl, $vfalse)>;
+
+  // Unmasked, broadcast first.
+  def : Pat<(OpNode
+              (any_broadcast ScalarVT:$sx), DataVT:$vy, DataVT:$vz,
+              (MaskVT true_mask), i32:$avl),
+            (!cast<Instruction>(OpBaseName#"rvvl")
+              $sx, $vy, $vz, $avl)>;
+
+  // Masked, broadcast first.
+  def : Pat<(OpNode 
+              (any_broadcast ScalarVT:$sx), DataVT:$vy, DataVT:$vz,
+              MaskVT:$mask, i32:$avl),
+            (!cast<Instruction>(OpBaseName#"rvvml")
+              $sx, $vy, $vz, $mask, $avl)>;
+}
+
+multiclass Ternary_vrv<
+    SDPatternOperator OpNode,
+    ValueType ScalarVT, ValueType DataVT,
+    ValueType MaskVT, string OpBaseName> {
+  // Masked with passthru, broadcast second.
+  def : Pat<(vvp_select
+              (OpNode
+                DataVT:$vx, (any_broadcast ScalarVT:$sy), DataVT:$vz,
+                (MaskVT srcvalue), (i32 srcvalue)),
+              DataVT:$vfalse,
+              MaskVT:$mask,
+              i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vrvml_v")
+              $vx, $sy, $vz,
+              $mask, $avl, $vfalse)>;
+
+  // Unmasked, broadcast second.
+  def : Pat<(OpNode
+              DataVT:$vx, (any_broadcast ScalarVT:$sy), DataVT:$vz,
+              (MaskVT true_mask), i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vrvl")
+              $vx, $sy, $vz, $avl)>;
+
+  // Masked, broadcast second.
+  def : Pat<(OpNode
+              DataVT:$vx, (any_broadcast ScalarVT:$sy), DataVT:$vz,
+              MaskVT:$mask, i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vrvml")
+              $vx, $sy, $vz, $mask, $avl)>;
+}
+
+multiclass Ternary_rvv_vrv_vvv<
+    SDPatternOperator OpNode,
+    ValueType ScalarVT, ValueType DataVT,
+    ValueType MaskVT, string OpBaseName> {
+  defm : Ternary_rvv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>;
+  defm : Ternary_vrv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>;
+  defm : Ternary_vvv<OpNode, DataVT, MaskVT, OpBaseName>;
+}
+
+// Expand both 64bit and 32 bit variant (256 elements)
+multiclass Ternary_ShortLong<
+    SDPatternOperator OpNode,
+    ValueType LongScalarVT, ValueType LongDataVT, string LongOpBaseName,
+    ValueType ShortScalarVT, ValueType ShortDataVT, string ShortOpBaseName> {
+  defm : Ternary_rvv_vrv_vvv<OpNode, LongScalarVT, LongDataVT,
+                             v256i1, LongOpBaseName>;
+  defm : Ternary_rvv_vrv_vvv<OpNode, ShortScalarVT, ShortDataVT,
+                             v256i1, ShortOpBaseName>;
+}
+
+defm : Ternary_ShortLong<c_vvp_ffma,
+                         f64, v256f64, "VFMADD", f32, v256f32, "VFMADS">;
+defm : Ternary_rvv_vrv_vvv<c_vvp_ffma,
+                           i64, v512f32, v512i1, "PVFMAD">;
+
 multiclass Merge_mvv<
     SDPatternOperator OpNode,
     ValueType DataVT, ValueType MaskVT,

diff  --git a/llvm/lib/Target/VE/VVPNodes.def b/llvm/lib/Target/VE/VVPNodes.def
index e042410a0e52e..2d8c694eea5f9 100644
--- a/llvm/lib/Target/VE/VVPNodes.def
+++ b/llvm/lib/Target/VE/VVPNodes.def
@@ -33,6 +33,14 @@
             HANDLE_VP_TO_VVP(VPNAME, VVPNAME)
 #endif
 
+/// ADD_TERNARY_VVP_OP(VVPNAME,SDNAME)
+/// \p VVPName is a VVP Ternary operator.
+/// \p SDNAME is the generic SD opcode corresponding to \p VVPName.
+#ifndef ADD_TERNARY_VVP_OP
+#define ADD_TERNARY_VVP_OP(VVPNAME,SDNAME) \
+    ADD_VVP_OP(VVPNAME,SDNAME)
+#endif
+
 #ifndef ADD_BINARY_VVP_OP_COMPACT
 #define ADD_BINARY_VVP_OP_COMPACT(NAME) \
     ADD_BINARY_VVP_OP(VVP_##NAME,VP_##NAME,NAME)
@@ -97,6 +105,8 @@ ADD_BINARY_VVP_OP_COMPACT(FSUB) REGISTER_PACKED(VVP_FSUB)
 ADD_BINARY_VVP_OP_COMPACT(FMUL) REGISTER_PACKED(VVP_FMUL)
 ADD_BINARY_VVP_OP_COMPACT(FDIV)
 
+ADD_TERNARY_VVP_OP(VVP_FFMA,FMA) HANDLE_VP_TO_VVP(VP_FMA, VVP_FFMA) REGISTER_PACKED(VVP_FFMA)
+
 ADD_VVP_OP(VVP_SETCC, SETCC)
 
 // Shuffles.
@@ -106,6 +116,7 @@ HANDLE_VP_TO_VVP(VP_MERGE, VVP_SELECT)
 
 
 #undef ADD_BINARY_VVP_OP
+#undef ADD_TERNARY_VVP_OP
 #undef ADD_BINARY_VVP_OP_COMPACT
 #undef ADD_REDUCE_VVP_OP
 #undef ADD_VVP_OP

diff  --git a/llvm/test/CodeGen/VE/Vector/vec_fma.ll b/llvm/test/CodeGen/VE/Vector/vec_fma.ll
new file mode 100644
index 0000000000000..e0a92f165d99f
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_fma.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x float> @llvm.fma.v256f32(<256 x float>, <256 x float>, <256 x float>)
+
+define fastcc <256 x float> @test_vec_fma_v256f32_vvv(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f32_vvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfmad.s %v0, %v2, %v0, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x float> @llvm.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vec_fma_v256f32_rvv(float %s0, <256 x float> %i1, <256 x float> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f32_rvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %v1, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vec_fma_v256f32_vrv(<256 x float> %i0, float %s1, <256 x float> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f32_vrv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %v1, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vec_fma_v256f32_vvr(<256 x float> %i0, <256 x float> %i1, float %s2) {
+; CHECK-LABEL: test_vec_fma_v256f32_vvr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %s0, %v0, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %zins = insertelement <256 x float> undef, float %s2, i32 0
+  %i2 = shufflevector <256 x float> %zins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2)
+  ret <256 x float> %r0
+}
+
+declare <256 x double> @llvm.fma.v256f64(<256 x double>, <256 x double>, <256 x double>)
+
+define fastcc <256 x double> @test_vec_fma_v256f64_vvv(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f64_vvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfmad.d %v0, %v2, %v0, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x double> @llvm.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vec_fma_v256f64_rvv(double %s0, <256 x double> %i1, <256 x double> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f64_rvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %v1, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s0, i32 0
+  %i0 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vec_fma_v256f64_vrv(<256 x double> %i0, double %s1, <256 x double> %i2) {
+; CHECK-LABEL: test_vec_fma_v256f64_vrv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %v1, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %yins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vec_fma_v256f64_vvr(<256 x double> %i0, <256 x double> %i1, double %s2) {
+; CHECK-LABEL: test_vec_fma_v256f64_vvr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %s0, %v0, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %zins = insertelement <256 x double> undef, double %s2, i32 0
+  %i2 = shufflevector <256 x double> %zins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2)
+  ret <256 x double> %r0
+}

diff  --git a/llvm/test/CodeGen/VE/Vector/vp_fma.ll b/llvm/test/CodeGen/VE/Vector/vp_fma.ll
new file mode 100644
index 0000000000000..6934184427638
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_fma.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x float> @llvm.vp.fma.v256f32(<256 x float>, <256 x float>, <256 x float>, <256 x i1>, i32)
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vvv(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f32_vvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfmad.s %v0, %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_rvv(float %s0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f32_rvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vrv(<256 x float> %i0, float %s1, <256 x float> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f32_vrv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vvr(<256 x float> %i0, <256 x float> %i1, float %s2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f32_vvr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v0, %s0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %zins = insertelement <256 x float> undef, float %s2, i32 0
+  %i2 = shufflevector <256 x float> %zins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+declare <256 x double> @llvm.vp.fma.v256f64(<256 x double>, <256 x double>, <256 x double>, <256 x i1>, i32)
+
+define fastcc <256 x double> @test_vp_fma_v256f64_vvv(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f64_vvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfmad.d %v0, %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x double> @llvm.vp.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fma_v256f64_rvv(double %s0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f64_rvv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s0, i32 0
+  %i0 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fma_v256f64_vrv(<256 x double> %i0, double %s1, <256 x double> %i2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f64_vrv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %yins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fma_v256f64_vvr(<256 x double> %i0, <256 x double> %i1, double %s2, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fma_v256f64_vvr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.d %v0, %s0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %zins = insertelement <256 x double> undef, double %s2, i32 0
+  %i2 = shufflevector <256 x double> %zins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fma.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x double> %i2, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}

diff  --git a/llvm/test/CodeGen/VE/Vector/vp_fma_merge.ll b/llvm/test/CodeGen/VE/Vector/vp_fma_merge.ll
new file mode 100644
index 0000000000000..da2851538c659
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_fma_merge.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x float> @llvm.vp.merge.v256f32(<256 x i1>, <256 x float>, <256 x float>, i32)
+declare <256 x float> @llvm.vp.fma.v256f32(<256 x float>, <256 x float>, <256 x float>, <256 x i1>, i32)
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vvv_merge(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n, <256 x float> %passthru) {
+; CHECK-LABEL: test_vp_fma_v256f32_vvv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfmad.s %v3, %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v3
+; CHECK-NEXT:    b.l.t (, %s10)
+  %vr = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_rvv_merge(float %s0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n, <256 x float> %passthru) {
+; CHECK-LABEL: test_vp_fma_v256f32_rvv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v2, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v2
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vrv_merge(<256 x float> %i0, float %s1, <256 x float> %i2, <256 x i1> %m, i32 %n, <256 x float> %passthru) {
+; CHECK-LABEL: test_vp_fma_v256f32_vrv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v2, %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v2
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fma_v256f32_vvr_merge(<256 x float> %i0, <256 x float> %i1, float %s2, <256 x i1> %m, i32 %n, <256 x float> %passthru) {
+; CHECK-LABEL: test_vp_fma_v256f32_vvr_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmad.s %v2, %s0, %v0, %v1, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v2
+; CHECK-NEXT:    b.l.t (, %s10)
+  %zins = insertelement <256 x float> undef, float %s2, i32 0
+  %i2 = shufflevector <256 x float> %zins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fma.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x float> %i2, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}


        


More information about the llvm-commits mailing list