[llvm] 95bf5ac - [VE] select|vp.merge|vp.select v256 isel and tests

Mon Jan 17 06:59:58 PST 2022

Author: Simon Moll
Date: 2022-01-17T15:58:54+01:00
New Revision: 95bf5ac8a827f55e70cc54e783b12b83ea0f56fd

URL: https://github.com/llvm/llvm-project/commit/95bf5ac8a827f55e70cc54e783b12b83ea0f56fd
DIFF: https://github.com/llvm/llvm-project/commit/95bf5ac8a827f55e70cc54e783b12b83ea0f56fd.diff

LOG: [VE] select|vp.merge|vp.select v256 isel and tests

Use the `VMRG` for all three operations for now. `vp_select` will be
used in passthru patterns.

Reviewed By: kaz7

Differential Revision: https://reviews.llvm.org/D117206

Added: 
    llvm/test/CodeGen/VE/Vector/vec_select.ll
    llvm/test/CodeGen/VE/Vector/vp_merge.ll
    llvm/test/CodeGen/VE/Vector/vp_select.ll

Modified: 
    llvm/lib/Target/VE/VEISelLowering.cpp
    llvm/lib/Target/VE/VVPInstrInfo.td
    llvm/lib/Target/VE/VVPInstrPatternsVec.td
    llvm/lib/Target/VE/VVPNodes.def

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index 4cdc8952218af..601bd5b0481e9 100644

--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -1720,7 +1720,7 @@ SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::EXTRACT_VECTOR_ELT:
     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
 
-#define ADD_BINARY_VVP_OP(VVP_NAME, VP_NAME, ISD_NAME) case ISD::ISD_NAME:
+#define ADD_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME:
 #include "VVPNodes.def"
     return lowerToVVP(Op, DAG);
   }
@@ -2729,6 +2729,11 @@ SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
     assert(LegalVecVT.isSimple());
     return DAG.getNode(VVPOpcode, DL, LegalVecVT, Op->getOperand(0),
                        Op->getOperand(1), Mask, AVL);
+  } else if (VVPOpcode == VEISD::VVP_SELECT) {
+    auto Mask = Op->getOperand(0);
+    auto OnTrue = Op->getOperand(1);
+    auto OnFalse = Op->getOperand(2);
+    return DAG.getNode(VVPOpcode, DL, LegalVecVT, OnTrue, OnFalse, Mask, AVL);
   }
   llvm_unreachable("lowerToVVP called for unexpected SDNode.");
 }

diff  --git a/llvm/lib/Target/VE/VVPInstrInfo.td b/llvm/lib/Target/VE/VVPInstrInfo.td
index 99566e91ec11c..ef9c238066c07 100644
--- a/llvm/lib/Target/VE/VVPInstrInfo.td
+++ b/llvm/lib/Target/VE/VVPInstrInfo.td
@@ -39,6 +39,15 @@ def SDTFPBinOpVVP : SDTypeProfile<1, 4, [      // vvp_fadd, etc.
   IsVLVT<4>
 ]>;
 
+// Select(OnTrue, OnFalse, SelMask, vl)
+def SDTSelectVVP : SDTypeProfile<1, 4, [       // vp_select, vp_merge
+  SDTCisVec<0>,
+  SDTCisSameNumEltsAs<0, 3>,
+  SDTCisSameAs<0, 1>,
+  SDTCisSameAs<1, 2>,
+  IsVLVT<4>
+]>;
+
 // Binary operator commutative pattern.
 class vvp_commutative<SDNode RootOp> :
   PatFrags<
@@ -79,3 +88,5 @@ def c_vvp_fmul  : vvp_commutative<vvp_fmul>;
 def vvp_fdiv    : SDNode<"VEISD::VVP_FDIV",  SDTFPBinOpVVP>;
 
 // } Binary Operators
+
+def vvp_select : SDNode<"VEISD::VVP_SELECT", SDTSelectVVP>;

diff  --git a/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
index 8d5d9d1035479..cb210013f2f67 100644
--- a/llvm/lib/Target/VE/VVPInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
@@ -191,3 +191,35 @@ defm : Binary_rv_vv_ShortLong<vvp_fsub,
 defm : Binary_rv_vr_vv_ShortLong<vvp_fdiv,
                               f64, v256f64, "VFDIVD",
                               f32, v256f32, "VFDIVS">;
+
+multiclass Merge_mvv<
+    SDPatternOperator OpNode,
+    ValueType DataVT, ValueType MaskVT,
+    string OpBaseName> {
+  // Masked.
+  def : Pat<(OpNode
+                DataVT:$vtrue, DataVT:$vfalse,
+                MaskVT:$vm,
+                i32:$avl),
+            (!cast<Instruction>(OpBaseName#"vvml_v")
+                $vfalse, $vtrue, $vm, $avl, $vfalse)>;
+}
+
+multiclass Merge_mvv_ShortLong<
+    SDPatternOperator OpNode,
+    ValueType LongDataVT, ValueType ShortDataVT,
+    string OpBaseName> {
+  defm : Merge_mvv<OpNode,
+                   LongDataVT, v256i1,
+                   OpBaseName>;
+  defm : Merge_mvv<OpNode,
+                   ShortDataVT, v256i1,
+                   OpBaseName>;
+}
+
+defm : Merge_mvv_ShortLong<vvp_select,
+                           v256f64,
+                           v256f32, "VMRG">;
+defm : Merge_mvv_ShortLong<vvp_select,
+                           v256i64,
+                           v256i32, "VMRG">;

diff  --git a/llvm/lib/Target/VE/VVPNodes.def b/llvm/lib/Target/VE/VVPNodes.def
index 8a9231f7d3e67..8000f84c5dbe5 100644
--- a/llvm/lib/Target/VE/VVPNodes.def
+++ b/llvm/lib/Target/VE/VVPNodes.def
@@ -59,6 +59,11 @@ ADD_BINARY_VVP_OP_COMPACT(FSUB)
 ADD_BINARY_VVP_OP_COMPACT(FMUL)
 ADD_BINARY_VVP_OP_COMPACT(FDIV)
 
+// Shuffles.
+ADD_VVP_OP(VVP_SELECT,VSELECT)
+HANDLE_VP_TO_VVP(VP_SELECT, VVP_SELECT)
+HANDLE_VP_TO_VVP(VP_MERGE, VVP_SELECT)
+
 #undef ADD_BINARY_VVP_OP
 #undef ADD_BINARY_VVP_OP_COMPACT
 #undef ADD_VVP_OP

diff  --git a/llvm/test/CodeGen/VE/Vector/vec_select.ll b/llvm/test/CodeGen/VE/Vector/vec_select.ll
new file mode 100644
index 0000000000000..8ccbff7828d3f
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_select.ll
@@ -0,0 +1,135 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x i32> @llvm.vec.select.v256i32(<256 x i1>, <256 x i32>, <256 x i32>, i32)
+
+define fastcc <256 x i32> @test_vec_select_v256i32_vv(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m) {
+; CHECK-LABEL: test_vec_select_v256i32_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = select <256 x i1> %m, <256 x i32> %i0, <256 x i32> %i1
+  ret <256 x i32> %r0
+}
+
+define fastcc <256 x i32> @test_vec_select_v256i32_vr(<256 x i32> %i0, i32 %s1, <256 x i1> %m) {
+; CHECK-LABEL: test_vec_select_v256i32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x i32> undef, i32 %s1, i32 0
+  %i1 = shufflevector <256 x i32> %xins, <256 x i32> undef, <256 x i32> zeroinitializer
+  %r0 = select <256 x i1> %m, <256 x i32> %i0, <256 x i32> %i1
+  ret <256 x i32> %r0
+}
+
+declare <256 x float> @llvm.vec.select.v256f32(<256 x i1>, <256 x float>, <256 x float>, i32)
+
+define fastcc <256 x float> @test_vec_select_v256f32_vv(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m) {
+; CHECK-LABEL: test_vec_select_v256f32_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = select <256 x i1> %m, <256 x float> %i0, <256 x float> %i1
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vec_select_v256f32_vr(<256 x float> %i0, float %s1, <256 x i1> %m) {
+; CHECK-LABEL: test_vec_select_v256f32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = select <256 x i1> %m, <256 x float> %i0, <256 x float> %i1
+  ret <256 x float> %r0
+}
+
+declare <256 x double> @llvm.vec.select.v256f64(<256 x i1>, <256 x double>, <256 x double>, i32)
+
+define fastcc <256 x double> @test_vec_select_v256f64_vv(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m) {
+; CHECK-LABEL: test_vec_select_v256f64_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = select <256 x i1> %m, <256 x double> %i0, <256 x double> %i1
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vec_select_v256f64_vr(<256 x double> %i0, double %s1, <256 x i1> %m) {
+; CHECK-LABEL: test_vec_select_v256f64_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = select <256 x i1> %m, <256 x double> %i0, <256 x double> %i1
+  ret <256 x double> %r0
+}
+
+declare <256 x i64> @llvm.vec.select.v256i64(<256 x i1>, <256 x i64>, <256 x i64>, i32)
+
+define fastcc <256 x i64> @test_vec_select_v256i64_vv(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m) {
+; CHECK-LABEL: test_vec_select_v256i64_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = select <256 x i1> %m, <256 x i64> %i0, <256 x i64> %i1
+  ret <256 x i64> %r0
+}
+
+define fastcc <256 x i64> @test_vec_select_v256i64_vr(<256 x i64> %i0, i64 %s1, <256 x i1> %m) {
+; CHECK-LABEL: test_vec_select_v256i64_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x i64> undef, i64 %s1, i32 0
+  %i1 = shufflevector <256 x i64> %xins, <256 x i64> undef, <256 x i32> zeroinitializer
+  %r0 = select <256 x i1> %m, <256 x i64> %i0, <256 x i64> %i1
+  ret <256 x i64> %r0
+}

diff  --git a/llvm/test/CodeGen/VE/Vector/vp_merge.ll b/llvm/test/CodeGen/VE/Vector/vp_merge.ll
new file mode 100644
index 0000000000000..fdd38099b5b91
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_merge.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x i32> @llvm.vp.merge.v256i32(<256 x i1>, <256 x i32>, <256 x i32>, i32)
+
+define fastcc <256 x i32> @test_vp_merge_v256i32_vv(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %pivot) {
+; CHECK-LABEL: test_vp_merge_v256i32_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x i32> @llvm.vp.merge.v256i32(<256 x i1> %m, <256 x i32> %i0, <256 x i32> %i1, i32 %pivot)
+  ret <256 x i32> %r0
+}
+
+define fastcc <256 x i32> @test_vp_merge_v256i32_vr(<256 x i32> %i0, i32 %s1, <256 x i1> %m, i32 %pivot) {
+; CHECK-LABEL: test_vp_merge_v256i32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x i32> undef, i32 %s1, i32 0
+  %i1 = shufflevector <256 x i32> %xins, <256 x i32> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i32> @llvm.vp.merge.v256i32(<256 x i1> %m, <256 x i32> %i0, <256 x i32> %i1, i32 %pivot)
+  ret <256 x i32> %r0
+}
+
+declare <256 x float> @llvm.vp.merge.v256f32(<256 x i1>, <256 x float>, <256 x float>, i32)
+
+define fastcc <256 x float> @test_vp_merge_v256f32_vv(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %pivot) {
+; CHECK-LABEL: test_vp_merge_v256f32_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %i0, <256 x float> %i1, i32 %pivot)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_merge_v256f32_vr(<256 x float> %i0, float %s1, <256 x i1> %m, i32 %pivot) {
+; CHECK-LABEL: test_vp_merge_v256f32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %i0, <256 x float> %i1, i32 %pivot)
+  ret <256 x float> %r0
+}
+
+declare <256 x double> @llvm.vp.merge.v256f64(<256 x i1>, <256 x double>, <256 x double>, i32)
+
+define fastcc <256 x double> @test_vp_merge_v256f64_vv(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %pivot) {
+; CHECK-LABEL: test_vp_merge_v256f64_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x double> @llvm.vp.merge.v256f64(<256 x i1> %m, <256 x double> %i0, <256 x double> %i1, i32 %pivot)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_merge_v256f64_vr(<256 x double> %i0, double %s1, <256 x i1> %m, i32 %pivot) {
+; CHECK-LABEL: test_vp_merge_v256f64_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.merge.v256f64(<256 x i1> %m, <256 x double> %i0, <256 x double> %i1, i32 %pivot)
+  ret <256 x double> %r0
+}
+
+declare <256 x i64> @llvm.vp.merge.v256i64(<256 x i1>, <256 x i64>, <256 x i64>, i32)
+
+define fastcc <256 x i64> @test_vp_merge_v256i64_vv(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %pivot) {
+; CHECK-LABEL: test_vp_merge_v256i64_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x i64> @llvm.vp.merge.v256i64(<256 x i1> %m, <256 x i64> %i0, <256 x i64> %i1, i32 %pivot)
+  ret <256 x i64> %r0
+}
+
+define fastcc <256 x i64> @test_vp_merge_v256i64_vr(<256 x i64> %i0, i64 %s1, <256 x i1> %m, i32 %pivot) {
+; CHECK-LABEL: test_vp_merge_v256i64_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x i64> undef, i64 %s1, i32 0
+  %i1 = shufflevector <256 x i64> %xins, <256 x i64> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i64> @llvm.vp.merge.v256i64(<256 x i1> %m, <256 x i64> %i0, <256 x i64> %i1, i32 %pivot)
+  ret <256 x i64> %r0
+}

diff  --git a/llvm/test/CodeGen/VE/Vector/vp_select.ll b/llvm/test/CodeGen/VE/Vector/vp_select.ll
new file mode 100644
index 0000000000000..f716efa471942
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_select.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x i32> @llvm.vp.select.v256i32(<256 x i1>, <256 x i32>, <256 x i32>, i32)
+
+define fastcc <256 x i32> @test_vp_select_v256i32_vv(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %pivot) {
+; CHECK-LABEL: test_vp_select_v256i32_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x i32> @llvm.vp.select.v256i32(<256 x i1> %m, <256 x i32> %i0, <256 x i32> %i1, i32 %pivot)
+  ret <256 x i32> %r0
+}
+
+define fastcc <256 x i32> @test_vp_select_v256i32_vr(<256 x i32> %i0, i32 %s1, <256 x i1> %m, i32 %pivot) {
+; CHECK-LABEL: test_vp_select_v256i32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x i32> undef, i32 %s1, i32 0
+  %i1 = shufflevector <256 x i32> %xins, <256 x i32> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i32> @llvm.vp.select.v256i32(<256 x i1> %m, <256 x i32> %i0, <256 x i32> %i1, i32 %pivot)
+  ret <256 x i32> %r0
+}
+
+declare <256 x float> @llvm.vp.select.v256f32(<256 x i1>, <256 x float>, <256 x float>, i32)
+
+define fastcc <256 x float> @test_vp_select_v256f32_vv(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %pivot) {
+; CHECK-LABEL: test_vp_select_v256f32_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x float> @llvm.vp.select.v256f32(<256 x i1> %m, <256 x float> %i0, <256 x float> %i1, i32 %pivot)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_select_v256f32_vr(<256 x float> %i0, float %s1, <256 x i1> %m, i32 %pivot) {
+; CHECK-LABEL: test_vp_select_v256f32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.select.v256f32(<256 x i1> %m, <256 x float> %i0, <256 x float> %i1, i32 %pivot)
+  ret <256 x float> %r0
+}
+
+declare <256 x double> @llvm.vp.select.v256f64(<256 x i1>, <256 x double>, <256 x double>, i32)
+
+define fastcc <256 x double> @test_vp_select_v256f64_vv(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %pivot) {
+; CHECK-LABEL: test_vp_select_v256f64_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x double> @llvm.vp.select.v256f64(<256 x i1> %m, <256 x double> %i0, <256 x double> %i1, i32 %pivot)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_select_v256f64_vr(<256 x double> %i0, double %s1, <256 x i1> %m, i32 %pivot) {
+; CHECK-LABEL: test_vp_select_v256f64_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.select.v256f64(<256 x i1> %m, <256 x double> %i0, <256 x double> %i1, i32 %pivot)
+  ret <256 x double> %r0
+}
+
+declare <256 x i64> @llvm.vp.select.v256i64(<256 x i1>, <256 x i64>, <256 x i64>, i32)
+
+define fastcc <256 x i64> @test_vp_select_v256i64_vv(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %pivot) {
+; CHECK-LABEL: test_vp_select_v256i64_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x i64> @llvm.vp.select.v256i64(<256 x i1> %m, <256 x i64> %i0, <256 x i64> %i1, i32 %pivot)
+  ret <256 x i64> %r0
+}
+
+define fastcc <256 x i64> @test_vp_select_v256i64_vr(<256 x i64> %i0, i64 %s1, <256 x i1> %m, i32 %pivot) {
+; CHECK-LABEL: test_vp_select_v256i64_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vbrd %v1, %s0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vmrg %v1, %v1, %v0, %vm1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x i64> undef, i64 %s1, i32 0
+  %i1 = shufflevector <256 x i64> %xins, <256 x i64> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i64> @llvm.vp.select.v256i64(<256 x i1> %m, <256 x i64> %i0, <256 x i64> %i1, i32 %pivot)
+  ret <256 x i64> %r0
+}