[llvm] 00d93de - [LegalizeVectorOps][X86][RISCV] Expand vector S/USHLSAT instead of unrolling.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 27 09:09:47 PDT 2022


Author: Craig Topper
Date: 2022-10-27T09:09:36-07:00
New Revision: 00d93def778afa7fd117e615e4f6fb7645e25f49

URL: https://github.com/llvm/llvm-project/commit/00d93def778afa7fd117e615e4f6fb7645e25f49
DIFF: https://github.com/llvm/llvm-project/commit/00d93def778afa7fd117e615e4f6fb7645e25f49.diff

LOG: [LegalizeVectorOps][X86][RISCV] Expand vector S/USHLSAT instead of unrolling.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D136478

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll
    llvm/test/CodeGen/RISCV/rvv/ushl_sat_vec.ll
    llvm/test/CodeGen/X86/sshl_sat_vec.ll
    llvm/test/CodeGen/X86/ushl_sat_vec.ll

Removed: 
    


################################################################################
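
For readers skimming the patch: SSHLSAT and USHLSAT are the saturating
shift-left operations behind @llvm.sshl.sat and @llvm.ushl.sat. Overflow is
detected exactly as in the expansion below: the result saturates whenever
LHS != (LHS << RHS) >> RHS. A scalar model in C++ (an illustrative sketch
only, not code from this commit; it assumes the shift amount is less than
the bit width, as the intrinsics require):

  #include <cstdint>
  #include <limits>

  // Model of @llvm.sshl.sat.i32: shift left, shift back with an arithmetic
  // shift, and saturate toward INT32_MIN/INT32_MAX when the round trip
  // changes the value.
  int32_t sshl_sat_i32(int32_t x, unsigned amt) {
    int32_t Shifted = static_cast<int32_t>(static_cast<uint32_t>(x) << amt);
    int32_t RoundTrip = Shifted >> amt; // arithmetic shift for the signed form
    if (x != RoundTrip)                 // shifted-out bits changed the value
      return x < 0 ? std::numeric_limits<int32_t>::min()
                   : std::numeric_limits<int32_t>::max();
    return Shifted;
  }

  // Model of @llvm.ushl.sat.i32: same structure with a logical right shift
  // and a single all-ones saturation value (UINT32_MAX).
  uint32_t ushl_sat_i32(uint32_t x, unsigned amt) {
    uint32_t Shifted = x << amt;
    uint32_t RoundTrip = Shifted >> amt; // logical shift for the unsigned form
    return x != RoundTrip ? std::numeric_limits<uint32_t>::max() : Shifted;
  }
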
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 6f0cde6fbddb3..1683063a35373 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -856,6 +856,13 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
       return;
     }
     break;
+  case ISD::USHLSAT:
+  case ISD::SSHLSAT:
+    if (SDValue Expanded = TLI.expandShlSat(Node, DAG)) {
+      Results.push_back(Expanded);
+      return;
+    }
+    break;
   case ISD::FP_TO_SINT_SAT:
   case ISD::FP_TO_UINT_SAT:
     // Expand the FP_TO_[SU]INT_SAT if it is scalable to prevent it from unrolling below.

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 0d49b4782a411..cc0789423cd44 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9227,9 +9227,13 @@ SDValue TargetLowering::expandShlSat(SDNode *Node, SelectionDAG &DAG) const {
   assert(VT == RHS.getValueType() && "Expected operands to be the same type");
   assert(VT.isInteger() && "Expected operands to be integers");
 
+  if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT))
+    return DAG.UnrollVectorOp(Node);
+
   // If LHS != (LHS << RHS) >> RHS, we have overflow and must saturate.
 
   unsigned BW = VT.getScalarSizeInBits();
+  EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
   SDValue Result = DAG.getNode(ISD::SHL, dl, VT, LHS, RHS);
   SDValue Orig =
       DAG.getNode(IsSigned ? ISD::SRA : ISD::SRL, dl, VT, Result, RHS);
@@ -9238,14 +9242,14 @@ SDValue TargetLowering::expandShlSat(SDNode *Node, SelectionDAG &DAG) const {
   if (IsSigned) {
     SDValue SatMin = DAG.getConstant(APInt::getSignedMinValue(BW), dl, VT);
     SDValue SatMax = DAG.getConstant(APInt::getSignedMaxValue(BW), dl, VT);
-    SatVal = DAG.getSelectCC(dl, LHS, DAG.getConstant(0, dl, VT),
-                             SatMin, SatMax, ISD::SETLT);
+    SDValue Cond =
+        DAG.getSetCC(dl, BoolVT, LHS, DAG.getConstant(0, dl, VT), ISD::SETLT);
+    SatVal = DAG.getSelect(dl, VT, Cond, SatMin, SatMax);
   } else {
     SatVal = DAG.getConstant(APInt::getMaxValue(BW), dl, VT);
   }
-  Result = DAG.getSelectCC(dl, LHS, Orig, SatVal, Result, ISD::SETNE);
-
-  return Result;
+  SDValue Cond = DAG.getSetCC(dl, BoolVT, LHS, Orig, ISD::SETNE);
+  return DAG.getSelect(dl, VT, Cond, SatVal, Result);
 }
 
 SDValue

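After this patch the expansion (for types where VSELECT is legal or custom)
builds whole-vector SETCC and (V)SELECT nodes instead of the scalar-oriented
getSelectCC calls. A simplified paraphrase of the signed path in C++ follows
(not verbatim from the tree: the real code is the TargetLowering member shown
in the hunk above and also handles USHLSAT, which uses ISD::SRL and an
all-ones saturation constant):

  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/CodeGen/TargetLowering.h"
  using namespace llvm;

  static SDValue expandSShlSatSketch(SDNode *Node, SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
    SDLoc dl(Node);
    EVT VT = Node->getValueType(0);
    SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1);

    // Without a legal/custom VSELECT there is no whole-vector select to
    // merge the saturated lanes, so fall back to per-element expansion.
    if (VT.isVector() && !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
      return DAG.UnrollVectorOp(Node);

    unsigned BW = VT.getScalarSizeInBits();
    EVT BoolVT =
        TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

    SDValue Result = DAG.getNode(ISD::SHL, dl, VT, LHS, RHS);
    SDValue Orig = DAG.getNode(ISD::SRA, dl, VT, Result, RHS);

    // Saturation value: SINT_MIN for negative lanes, SINT_MAX otherwise.
    SDValue SatMin = DAG.getConstant(APInt::getSignedMinValue(BW), dl, VT);
    SDValue SatMax = DAG.getConstant(APInt::getSignedMaxValue(BW), dl, VT);
    SDValue IsNeg =
        DAG.getSetCC(dl, BoolVT, LHS, DAG.getConstant(0, dl, VT), ISD::SETLT);
    SDValue SatVal = DAG.getSelect(dl, VT, IsNeg, SatMin, SatMax);

    // Overflow iff LHS != (LHS << RHS) >> RHS; use SatVal for those lanes.
    SDValue Overflow = DAG.getSetCC(dl, BoolVT, LHS, Orig, ISD::SETNE);
    return DAG.getSelect(dl, VT, Overflow, SatVal, Result);
  }
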
diff --git a/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll
index 309be04a5eb29..a734bfa84f55e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll
@@ -9,34 +9,18 @@ declare <16 x i8> @llvm.sshl.sat.v16i8(<16 x i8>, <16 x i8>)
 define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; CHECK-LABEL: vec_v2i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 0, e64, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a2, v8
-; CHECK-NEXT:    li a1, -1
-; CHECK-NEXT:    vmv.x.s a3, v9
-; CHECK-NEXT:    sll a0, a2, a3
-; CHECK-NEXT:    sra a3, a0, a3
-; CHECK-NEXT:    srli a1, a1, 1
-; CHECK-NEXT:    beq a2, a3, .LBB0_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    slti a0, a2, 0
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:  .LBB0_2:
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v9, 1
-; CHECK-NEXT:    vmv.x.s a4, v9
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-NEXT:    vmv.x.s a3, v8
-; CHECK-NEXT:    sll a2, a3, a4
-; CHECK-NEXT:    sra a4, a2, a4
-; CHECK-NEXT:    beq a3, a4, .LBB0_4
-; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    slti a2, a3, 0
-; CHECK-NEXT:    add a2, a2, a1
-; CHECK-NEXT:  .LBB0_4:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a2
-; CHECK-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
+; CHECK-NEXT:    vmslt.vx v0, v8, zero
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    srli a1, a0, 1
+; CHECK-NEXT:    vsll.vv v10, v8, v9
+; CHECK-NEXT:    vsra.vv v9, v10, v9
+; CHECK-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    slli a0, a0, 63
+; CHECK-NEXT:    vmerge.vxm v9, v9, a0, v0
+; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; CHECK-NEXT:    ret
   %tmp = call <2 x i64> @llvm.sshl.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
   ret <2 x i64> %tmp
@@ -45,69 +29,19 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-LABEL: vec_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 0, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    slli a2, a0, 32
-; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vmv.x.s a3, v9
-; CHECK-NEXT:    sll a1, a2, a3
-; CHECK-NEXT:    sra a3, a1, a3
-; CHECK-NEXT:    srli a0, a0, 1
-; CHECK-NEXT:    beq a2, a3, .LBB1_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB1_2:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    srli a1, a1, 32
-; CHECK-NEXT:    sw a1, 0(sp)
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 3
-; CHECK-NEXT:    vmv.x.s a3, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 3
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 32
-; CHECK-NEXT:    sll a1, a2, a3
-; CHECK-NEXT:    sra a3, a1, a3
-; CHECK-NEXT:    beq a2, a3, .LBB1_4
-; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB1_4:
-; CHECK-NEXT:    srli a3, a1, 32
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 2
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 32
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sw a3, 12(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB1_6
-; CHECK-NEXT:  # %bb.5:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB1_6:
-; CHECK-NEXT:    srli a3, a1, 32
-; CHECK-NEXT:    vslidedown.vi v9, v9, 1
-; CHECK-NEXT:    vmv.x.s a4, v9
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-NEXT:    vmv.x.s a1, v8
-; CHECK-NEXT:    slli a2, a1, 32
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sw a3, 8(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB1_8
-; CHECK-NEXT:  # %bb.7:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB1_8:
-; CHECK-NEXT:    srli a0, a1, 32
-; CHECK-NEXT:    sw a0, 4(sp)
-; CHECK-NEXT:    mv a0, sp
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vmslt.vx v0, v8, zero
+; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    addiw a0, a0, -1
+; CHECK-NEXT:    vsll.vv v10, v8, v9
+; CHECK-NEXT:    vsra.vv v9, v10, v9
+; CHECK-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    li a0, 1
+; CHECK-NEXT:    slli a0, a0, 31
+; CHECK-NEXT:    vmerge.vxm v9, v9, a0, v0
+; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; CHECK-NEXT:    ret
   %tmp = call <4 x i32> @llvm.sshl.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %tmp
@@ -116,125 +50,17 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; CHECK-LABEL: vec_v8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 0, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    slli a2, a0, 48
-; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vmv.x.s a3, v9
-; CHECK-NEXT:    sll a1, a2, a3
-; CHECK-NEXT:    sra a3, a1, a3
-; CHECK-NEXT:    srli a0, a0, 1
-; CHECK-NEXT:    beq a2, a3, .LBB2_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB2_2:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    srli a1, a1, 48
-; CHECK-NEXT:    sh a1, 0(sp)
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 7
-; CHECK-NEXT:    vmv.x.s a3, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 7
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 48
-; CHECK-NEXT:    sll a1, a2, a3
-; CHECK-NEXT:    sra a3, a1, a3
-; CHECK-NEXT:    beq a2, a3, .LBB2_4
-; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB2_4:
-; CHECK-NEXT:    srli a3, a1, 48
-; CHECK-NEXT:    vslidedown.vi v10, v9, 6
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 6
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 48
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sh a3, 14(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB2_6
-; CHECK-NEXT:  # %bb.5:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB2_6:
-; CHECK-NEXT:    srli a3, a1, 48
-; CHECK-NEXT:    vslidedown.vi v10, v9, 5
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 5
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 48
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sh a3, 12(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB2_8
-; CHECK-NEXT:  # %bb.7:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB2_8:
-; CHECK-NEXT:    srli a3, a1, 48
-; CHECK-NEXT:    vslidedown.vi v10, v9, 4
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 4
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 48
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sh a3, 10(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB2_10
-; CHECK-NEXT:  # %bb.9:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB2_10:
-; CHECK-NEXT:    srli a3, a1, 48
-; CHECK-NEXT:    vslidedown.vi v10, v9, 3
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 3
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 48
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sh a3, 8(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB2_12
-; CHECK-NEXT:  # %bb.11:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB2_12:
-; CHECK-NEXT:    srli a3, a1, 48
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 2
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 48
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sh a3, 6(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB2_14
-; CHECK-NEXT:  # %bb.13:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB2_14:
-; CHECK-NEXT:    srli a3, a1, 48
-; CHECK-NEXT:    vslidedown.vi v9, v9, 1
-; CHECK-NEXT:    vmv.x.s a4, v9
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-NEXT:    vmv.x.s a1, v8
-; CHECK-NEXT:    slli a2, a1, 48
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sh a3, 4(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB2_16
-; CHECK-NEXT:  # %bb.15:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB2_16:
-; CHECK-NEXT:    srli a0, a1, 48
-; CHECK-NEXT:    sh a0, 2(sp)
-; CHECK-NEXT:    mv a0, sp
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vmslt.vx v0, v8, zero
+; CHECK-NEXT:    lui a0, 8
+; CHECK-NEXT:    addiw a1, a0, -1
+; CHECK-NEXT:    vsll.vv v10, v8, v9
+; CHECK-NEXT:    vsra.vv v9, v10, v9
+; CHECK-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmerge.vxm v9, v9, a0, v0
+; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; CHECK-NEXT:    ret
   %tmp = call <8 x i16> @llvm.sshl.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %tmp
@@ -243,238 +69,102 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; CHECK-LABEL: vec_v16i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 0, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    slli a2, a0, 56
-; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vmv.x.s a3, v9
-; CHECK-NEXT:    sll a1, a2, a3
-; CHECK-NEXT:    sra a3, a1, a3
-; CHECK-NEXT:    srli a0, a0, 1
-; CHECK-NEXT:    beq a2, a3, .LBB3_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB3_2:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    srli a1, a1, 56
-; CHECK-NEXT:    sb a1, 0(sp)
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 15
-; CHECK-NEXT:    vmv.x.s a3, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 15
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 56
-; CHECK-NEXT:    sll a1, a2, a3
-; CHECK-NEXT:    sra a3, a1, a3
-; CHECK-NEXT:    beq a2, a3, .LBB3_4
-; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB3_4:
-; CHECK-NEXT:    srli a3, a1, 56
-; CHECK-NEXT:    vslidedown.vi v10, v9, 14
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 14
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 56
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sb a3, 15(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB3_6
-; CHECK-NEXT:  # %bb.5:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB3_6:
-; CHECK-NEXT:    srli a3, a1, 56
-; CHECK-NEXT:    vslidedown.vi v10, v9, 13
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 13
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 56
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sb a3, 14(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB3_8
-; CHECK-NEXT:  # %bb.7:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB3_8:
-; CHECK-NEXT:    srli a3, a1, 56
-; CHECK-NEXT:    vslidedown.vi v10, v9, 12
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 12
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 56
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sb a3, 13(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB3_10
-; CHECK-NEXT:  # %bb.9:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB3_10:
-; CHECK-NEXT:    srli a3, a1, 56
-; CHECK-NEXT:    vslidedown.vi v10, v9, 11
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 11
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 56
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sb a3, 12(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB3_12
-; CHECK-NEXT:  # %bb.11:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB3_12:
-; CHECK-NEXT:    srli a3, a1, 56
-; CHECK-NEXT:    vslidedown.vi v10, v9, 10
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 10
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 56
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sb a3, 11(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB3_14
-; CHECK-NEXT:  # %bb.13:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB3_14:
-; CHECK-NEXT:    srli a3, a1, 56
-; CHECK-NEXT:    vslidedown.vi v10, v9, 9
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 9
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 56
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sb a3, 10(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB3_16
-; CHECK-NEXT:  # %bb.15:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB3_16:
-; CHECK-NEXT:    srli a3, a1, 56
-; CHECK-NEXT:    vslidedown.vi v10, v9, 8
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 8
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 56
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sb a3, 9(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB3_18
-; CHECK-NEXT:  # %bb.17:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB3_18:
-; CHECK-NEXT:    srli a3, a1, 56
-; CHECK-NEXT:    vslidedown.vi v10, v9, 7
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 7
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 56
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sb a3, 8(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB3_20
-; CHECK-NEXT:  # %bb.19:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB3_20:
-; CHECK-NEXT:    srli a3, a1, 56
-; CHECK-NEXT:    vslidedown.vi v10, v9, 6
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 6
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 56
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sb a3, 7(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB3_22
-; CHECK-NEXT:  # %bb.21:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB3_22:
-; CHECK-NEXT:    srli a3, a1, 56
-; CHECK-NEXT:    vslidedown.vi v10, v9, 5
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 5
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 56
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sb a3, 6(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB3_24
-; CHECK-NEXT:  # %bb.23:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB3_24:
-; CHECK-NEXT:    srli a3, a1, 56
-; CHECK-NEXT:    vslidedown.vi v10, v9, 4
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 4
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 56
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sb a3, 5(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB3_26
-; CHECK-NEXT:  # %bb.25:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB3_26:
-; CHECK-NEXT:    srli a3, a1, 56
-; CHECK-NEXT:    vslidedown.vi v10, v9, 3
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 3
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 56
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sb a3, 4(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB3_28
-; CHECK-NEXT:  # %bb.27:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB3_28:
-; CHECK-NEXT:    srli a3, a1, 56
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 2
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a2, a1, 56
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sb a3, 3(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB3_30
-; CHECK-NEXT:  # %bb.29:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB3_30:
-; CHECK-NEXT:    srli a3, a1, 56
-; CHECK-NEXT:    vslidedown.vi v9, v9, 1
-; CHECK-NEXT:    vmv.x.s a4, v9
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-NEXT:    vmv.x.s a1, v8
-; CHECK-NEXT:    slli a2, a1, 56
-; CHECK-NEXT:    sll a1, a2, a4
-; CHECK-NEXT:    sra a4, a1, a4
-; CHECK-NEXT:    sb a3, 2(sp)
-; CHECK-NEXT:    beq a2, a4, .LBB3_32
-; CHECK-NEXT:  # %bb.31:
-; CHECK-NEXT:    slti a1, a2, 0
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:  .LBB3_32:
-; CHECK-NEXT:    srli a0, a1, 56
-; CHECK-NEXT:    sb a0, 1(sp)
-; CHECK-NEXT:    mv a0, sp
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vmslt.vx v0, v8, zero
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsll.vv v10, v8, v9
+; CHECK-NEXT:    vsra.vv v9, v10, v9
+; CHECK-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    li a0, 128
+; CHECK-NEXT:    vmerge.vxm v9, v9, a0, v0
+; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; CHECK-NEXT:    ret
   %tmp = call <16 x i8> @llvm.sshl.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %tmp
 }
+
+declare <vscale x 2 x i64> @llvm.sshl.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.sshl.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.sshl.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 16 x i8> @llvm.sshl.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+
+define <vscale x 2 x i64> @vec_nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) nounwind {
+; CHECK-LABEL: vec_nxv2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vmslt.vx v0, v8, zero
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    srli a1, a0, 1
+; CHECK-NEXT:    vsll.vv v12, v8, v10
+; CHECK-NEXT:    vsra.vv v14, v12, v10
+; CHECK-NEXT:    vmsne.vv v10, v8, v14
+; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    slli a0, a0, 63
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
+; CHECK-NEXT:    ret
+  %tmp = call <vscale x 2 x i64> @llvm.sshl.sat.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y)
+  ret <vscale x 2 x i64> %tmp
+}
+
+define <vscale x 4 x i32> @vec_nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) nounwind {
+; CHECK-LABEL: vec_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmslt.vx v0, v8, zero
+; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    addiw a0, a0, -1
+; CHECK-NEXT:    vsll.vv v12, v8, v10
+; CHECK-NEXT:    vsra.vv v14, v12, v10
+; CHECK-NEXT:    vmsne.vv v10, v8, v14
+; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    li a0, 1
+; CHECK-NEXT:    slli a0, a0, 31
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
+; CHECK-NEXT:    ret
+  %tmp = call <vscale x 4 x i32> @llvm.sshl.sat.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y)
+  ret <vscale x 4 x i32> %tmp
+}
+
+define <vscale x 8 x i16> @vec_nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) nounwind {
+; CHECK-LABEL: vec_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmslt.vx v0, v8, zero
+; CHECK-NEXT:    lui a0, 8
+; CHECK-NEXT:    addiw a1, a0, -1
+; CHECK-NEXT:    vsll.vv v12, v8, v10
+; CHECK-NEXT:    vsra.vv v14, v12, v10
+; CHECK-NEXT:    vmsne.vv v10, v8, v14
+; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
+; CHECK-NEXT:    ret
+  %tmp = call <vscale x 8 x i16> @llvm.sshl.sat.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y)
+  ret <vscale x 8 x i16> %tmp
+}
+
+define <vscale x 16 x i8> @vec_nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) nounwind {
+; CHECK-LABEL: vec_nxv16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmslt.vx v0, v8, zero
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsll.vv v12, v8, v10
+; CHECK-NEXT:    vsra.vv v14, v12, v10
+; CHECK-NEXT:    vmsne.vv v10, v8, v14
+; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    li a0, 128
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
+; CHECK-NEXT:    ret
+  %tmp = call <vscale x 16 x i8> @llvm.sshl.sat.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y)
+  ret <vscale x 16 x i8> %tmp
+}

diff --git a/llvm/test/CodeGen/RISCV/rvv/ushl_sat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/ushl_sat_vec.ll
index 2dcfe1addb050..ba970e62875a9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ushl_sat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ushl_sat_vec.ll
@@ -9,30 +9,11 @@ declare <16 x i8> @llvm.ushl.sat.v16i8(<16 x i8>, <16 x i8>)
 define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; CHECK-LABEL: vec_v2i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 1
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
-; CHECK-NEXT:    vmv.x.s a0, v9
-; CHECK-NEXT:    vmv.x.s a1, v8
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v10, a0
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vsll.vv v10, v8, v9
+; CHECK-NEXT:    vsrl.vv v9, v10, v9
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
+; CHECK-NEXT:    vmerge.vim v8, v10, -1, v0
 ; CHECK-NEXT:    ret
   %tmp = call <2 x i64> @llvm.ushl.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
   ret <2 x i64> %tmp
@@ -41,63 +22,11 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-LABEL: vec_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    vsetivli zero, 0, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v9
-; CHECK-NEXT:    vmv.x.s a1, v8
-; CHECK-NEXT:    slli a1, a1, 32
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 32
-; CHECK-NEXT:    sw a0, 0(sp)
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 3
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 3
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 32
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 32
-; CHECK-NEXT:    sw a0, 12(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 2
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 32
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 32
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    vslidedown.vi v9, v9, 1
-; CHECK-NEXT:    vmv.x.s a0, v9
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-NEXT:    vmv.x.s a1, v8
-; CHECK-NEXT:    slli a1, a1, 32
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 32
-; CHECK-NEXT:    sw a0, 4(sp)
-; CHECK-NEXT:    mv a0, sp
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vsll.vv v10, v8, v9
+; CHECK-NEXT:    vsrl.vv v9, v10, v9
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
+; CHECK-NEXT:    vmerge.vim v8, v10, -1, v0
 ; CHECK-NEXT:    ret
   %tmp = call <4 x i32> @llvm.ushl.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %tmp
@@ -106,115 +35,11 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; CHECK-LABEL: vec_v8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    vsetivli zero, 0, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v9
-; CHECK-NEXT:    vmv.x.s a1, v8
-; CHECK-NEXT:    slli a1, a1, 48
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 48
-; CHECK-NEXT:    sh a0, 0(sp)
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 7
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 7
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 48
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 48
-; CHECK-NEXT:    sh a0, 14(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 6
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 6
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 48
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 48
-; CHECK-NEXT:    sh a0, 12(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 5
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 5
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 48
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 48
-; CHECK-NEXT:    sh a0, 10(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 4
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 4
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 48
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 48
-; CHECK-NEXT:    sh a0, 8(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 3
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 3
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 48
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 48
-; CHECK-NEXT:    sh a0, 6(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 2
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 48
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 48
-; CHECK-NEXT:    sh a0, 4(sp)
-; CHECK-NEXT:    vslidedown.vi v9, v9, 1
-; CHECK-NEXT:    vmv.x.s a0, v9
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-NEXT:    vmv.x.s a1, v8
-; CHECK-NEXT:    slli a1, a1, 48
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 48
-; CHECK-NEXT:    sh a0, 2(sp)
-; CHECK-NEXT:    mv a0, sp
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vsll.vv v10, v8, v9
+; CHECK-NEXT:    vsrl.vv v9, v10, v9
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
+; CHECK-NEXT:    vmerge.vim v8, v10, -1, v0
 ; CHECK-NEXT:    ret
   %tmp = call <8 x i16> @llvm.ushl.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %tmp
@@ -223,220 +48,69 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; CHECK-LABEL: vec_v16i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    vsetivli zero, 0, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v9
-; CHECK-NEXT:    vmv.x.s a1, v8
-; CHECK-NEXT:    slli a1, a1, 56
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 56
-; CHECK-NEXT:    sb a0, 0(sp)
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 15
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 15
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 56
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 56
-; CHECK-NEXT:    sb a0, 15(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 14
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 14
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 56
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 56
-; CHECK-NEXT:    sb a0, 14(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 13
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 13
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 56
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 56
-; CHECK-NEXT:    sb a0, 13(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 12
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 12
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 56
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 56
-; CHECK-NEXT:    sb a0, 12(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 11
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 11
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 56
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 56
-; CHECK-NEXT:    sb a0, 11(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 10
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 10
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 56
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 56
-; CHECK-NEXT:    sb a0, 10(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 9
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 9
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 56
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 56
-; CHECK-NEXT:    sb a0, 9(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 8
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 8
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 56
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 56
-; CHECK-NEXT:    sb a0, 8(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 7
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 7
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 56
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 56
-; CHECK-NEXT:    sb a0, 7(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 6
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 6
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 56
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 56
-; CHECK-NEXT:    sb a0, 6(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 5
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 5
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 56
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 56
-; CHECK-NEXT:    sb a0, 5(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 4
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 4
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 56
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 56
-; CHECK-NEXT:    sb a0, 4(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 3
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 3
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 56
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 56
-; CHECK-NEXT:    sb a0, 3(sp)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 2
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    slli a1, a1, 56
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 56
-; CHECK-NEXT:    sb a0, 2(sp)
-; CHECK-NEXT:    vslidedown.vi v9, v9, 1
-; CHECK-NEXT:    vmv.x.s a0, v9
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-NEXT:    vmv.x.s a1, v8
-; CHECK-NEXT:    slli a1, a1, 56
-; CHECK-NEXT:    sll a2, a1, a0
-; CHECK-NEXT:    srl a0, a2, a0
-; CHECK-NEXT:    xor a0, a1, a0
-; CHECK-NEXT:    seqz a0, a0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    or a0, a0, a2
-; CHECK-NEXT:    srli a0, a0, 56
-; CHECK-NEXT:    sb a0, 1(sp)
-; CHECK-NEXT:    mv a0, sp
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vsll.vv v10, v8, v9
+; CHECK-NEXT:    vsrl.vv v9, v10, v9
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
+; CHECK-NEXT:    vmerge.vim v8, v10, -1, v0
 ; CHECK-NEXT:    ret
   %tmp = call <16 x i8> @llvm.ushl.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %tmp
 }
+
+declare <vscale x 2 x i64> @llvm.ushl.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.ushl.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.ushl.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 16 x i8> @llvm.ushl.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+
+define <vscale x 2 x i64> @vec_nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) nounwind {
+; CHECK-LABEL: vec_nxv2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vsll.vv v12, v8, v10
+; CHECK-NEXT:    vsrl.vv v10, v12, v10
+; CHECK-NEXT:    vmsne.vv v0, v8, v10
+; CHECK-NEXT:    vmerge.vim v8, v12, -1, v0
+; CHECK-NEXT:    ret
+  %tmp = call <vscale x 2 x i64> @llvm.ushl.sat.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y)
+  ret <vscale x 2 x i64> %tmp
+}
+
+define <vscale x 4 x i32> @vec_nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) nounwind {
+; CHECK-LABEL: vec_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vsll.vv v12, v8, v10
+; CHECK-NEXT:    vsrl.vv v10, v12, v10
+; CHECK-NEXT:    vmsne.vv v0, v8, v10
+; CHECK-NEXT:    vmerge.vim v8, v12, -1, v0
+; CHECK-NEXT:    ret
+  %tmp = call <vscale x 4 x i32> @llvm.ushl.sat.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y)
+  ret <vscale x 4 x i32> %tmp
+}
+
+define <vscale x 8 x i16> @vec_nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) nounwind {
+; CHECK-LABEL: vec_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsll.vv v12, v8, v10
+; CHECK-NEXT:    vsrl.vv v10, v12, v10
+; CHECK-NEXT:    vmsne.vv v0, v8, v10
+; CHECK-NEXT:    vmerge.vim v8, v12, -1, v0
+; CHECK-NEXT:    ret
+  %tmp = call <vscale x 8 x i16> @llvm.ushl.sat.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y)
+  ret <vscale x 8 x i16> %tmp
+}
+
+define <vscale x 16 x i8> @vec_nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) nounwind {
+; CHECK-LABEL: vec_nxv16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vsll.vv v12, v8, v10
+; CHECK-NEXT:    vsrl.vv v10, v12, v10
+; CHECK-NEXT:    vmsne.vv v0, v8, v10
+; CHECK-NEXT:    vmerge.vim v8, v12, -1, v0
+; CHECK-NEXT:    ret
+  %tmp = call <vscale x 16 x i8> @llvm.ushl.sat.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y)
+  ret <vscale x 16 x i8> %tmp
+}

diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
index bdae47e3970ee..bd9ee00d32e70 100644
--- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
@@ -11,73 +11,53 @@ declare <16 x i8> @llvm.sshl.sat.v16i8(<16 x i8>, <16 x i8>)
 define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; X64-LABEL: vec_v2i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %xmm0, %rax
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    testq %rax, %rax
-; X64-NEXT:    sets %dl
-; X64-NEXT:    movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT:    addq %rsi, %rdx
-; X64-NEXT:    movq %xmm1, %rcx
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    shlq %cl, %rdi
-; X64-NEXT:    movq %rdi, %r8
-; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NEXT:    sarq %cl, %r8
-; X64-NEXT:    cmpq %r8, %rax
-; X64-NEXT:    cmovneq %rdx, %rdi
-; X64-NEXT:    movq %rdi, %xmm2
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NEXT:    movq %xmm0, %rax
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    testq %rax, %rax
-; X64-NEXT:    sets %dl
-; X64-NEXT:    addq %rsi, %rdx
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm0, %rcx
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    shlq %cl, %rsi
-; X64-NEXT:    movq %rsi, %rdi
-; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NEXT:    sarq %cl, %rdi
-; X64-NEXT:    cmpq %rdi, %rax
-; X64-NEXT:    cmovneq %rdx, %rsi
-; X64-NEXT:    movq %rsi, %xmm0
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; X64-NEXT:    movdqa %xmm2, %xmm0
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-NEXT:    movdqa %xmm2, %xmm3
+; X64-NEXT:    psrlq %xmm1, %xmm3
+; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; X64-NEXT:    movdqa %xmm2, %xmm5
+; X64-NEXT:    psrlq %xmm4, %xmm5
+; X64-NEXT:    movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
+; X64-NEXT:    movdqa %xmm0, %xmm6
+; X64-NEXT:    psllq %xmm1, %xmm6
+; X64-NEXT:    movdqa %xmm0, %xmm3
+; X64-NEXT:    psllq %xmm4, %xmm3
+; X64-NEXT:    movdqa %xmm3, %xmm7
+; X64-NEXT:    movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1]
+; X64-NEXT:    psrlq %xmm1, %xmm6
+; X64-NEXT:    psrlq %xmm4, %xmm7
+; X64-NEXT:    movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1]
+; X64-NEXT:    xorpd %xmm5, %xmm7
+; X64-NEXT:    psubq %xmm5, %xmm7
+; X64-NEXT:    pcmpeqd %xmm0, %xmm7
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[1,0,3,2]
+; X64-NEXT:    pand %xmm7, %xmm1
+; X64-NEXT:    andpd %xmm1, %xmm3
+; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; X64-NEXT:    pand %xmm2, %xmm0
+; X64-NEXT:    pxor %xmm5, %xmm5
+; X64-NEXT:    pcmpgtd %xmm4, %xmm5
+; X64-NEXT:    pcmpeqd %xmm4, %xmm4
+; X64-NEXT:    pxor %xmm5, %xmm4
+; X64-NEXT:    pandn %xmm4, %xmm2
+; X64-NEXT:    por %xmm0, %xmm2
+; X64-NEXT:    pandn %xmm2, %xmm1
+; X64-NEXT:    por %xmm3, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X64-AVX2-LABEL: vec_v2i64:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    testq %rax, %rax
-; X64-AVX2-NEXT:    sets %dl
-; X64-AVX2-NEXT:    movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF
-; X64-AVX2-NEXT:    addq %rsi, %rdx
-; X64-AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
-; X64-AVX2-NEXT:    movq %rax, %rdi
-; X64-AVX2-NEXT:    shlq %cl, %rdi
-; X64-AVX2-NEXT:    movq %rdi, %r8
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-AVX2-NEXT:    sarq %cl, %r8
-; X64-AVX2-NEXT:    cmpq %r8, %rax
-; X64-AVX2-NEXT:    cmovneq %rdx, %rdi
-; X64-AVX2-NEXT:    vmovq %rdi, %xmm2
-; X64-AVX2-NEXT:    vmovq %xmm0, %rax
-; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    testq %rax, %rax
-; X64-AVX2-NEXT:    sets %dl
-; X64-AVX2-NEXT:    addq %rsi, %rdx
-; X64-AVX2-NEXT:    vmovq %xmm1, %rcx
-; X64-AVX2-NEXT:    movq %rax, %rsi
-; X64-AVX2-NEXT:    shlq %cl, %rsi
-; X64-AVX2-NEXT:    movq %rsi, %rdi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-AVX2-NEXT:    sarq %cl, %rdi
-; X64-AVX2-NEXT:    cmpq %rdi, %rax
-; X64-AVX2-NEXT:    cmovneq %rdx, %rsi
-; X64-AVX2-NEXT:    vmovq %rsi, %xmm0
-; X64-AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X64-AVX2-NEXT:    vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT:    vmovapd {{.*#+}} xmm3 = [9223372036854775807,9223372036854775807]
+; X64-AVX2-NEXT:    vblendvpd %xmm0, %xmm2, %xmm3, %xmm3
+; X64-AVX2-NEXT:    vpsrlvq %xmm1, %xmm2, %xmm2
+; X64-AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm4
+; X64-AVX2-NEXT:    vpsrlvq %xmm1, %xmm4, %xmm1
+; X64-AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vblendvpd %xmm0, %xmm4, %xmm3, %xmm0
 ; X64-AVX2-NEXT:    retq
 ;
 ; X86-LABEL: vec_v2i64:
@@ -175,132 +155,53 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-LABEL: vec_v4i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; X64-NEXT:    movd %xmm2, %eax
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X64-NEXT:    movd %xmm2, %ecx
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    sarl %cl, %esi
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X64-NEXT:    cmpl %esi, %eax
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm2
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X64-NEXT:    movd %xmm3, %eax
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movd %xmm3, %ecx
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    sarl %cl, %esi
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X64-NEXT:    cmpl %esi, %eax
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm3
-; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X64-NEXT:    movd %xmm0, %eax
-; X64-NEXT:    movd %xmm1, %ecx
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    sarl %cl, %esi
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X64-NEXT:    cmpl %esi, %eax
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm2
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X64-NEXT:    movd %xmm0, %eax
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X64-NEXT:    movd %xmm0, %ecx
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    sarl %cl, %esi
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X64-NEXT:    cmpl %esi, %eax
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm0
-; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X64-NEXT:    movdqa %xmm2, %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; X64-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; X64-NEXT:    pslld $23, %xmm1
+; X64-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-NEXT:    cvttps2dq %xmm1, %xmm5
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    pmuludq %xmm5, %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; X64-NEXT:    pmuludq %xmm7, %xmm5
+; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; X64-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; X64-NEXT:    pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
+; X64-NEXT:    movdqa %xmm6, %xmm7
+; X64-NEXT:    psrad %xmm5, %xmm7
+; X64-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; X64-NEXT:    movdqa %xmm1, %xmm5
+; X64-NEXT:    psrad %xmm2, %xmm5
+; X64-NEXT:    punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm7[1]
+; X64-NEXT:    movdqa %xmm6, %xmm2
+; X64-NEXT:    psrad %xmm3, %xmm2
+; X64-NEXT:    psrad %xmm4, %xmm1
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[0,3]
+; X64-NEXT:    pcmpeqd %xmm0, %xmm1
+; X64-NEXT:    pand %xmm1, %xmm6
+; X64-NEXT:    pxor %xmm2, %xmm2
+; X64-NEXT:    pcmpgtd %xmm0, %xmm2
+; X64-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    por %xmm2, %xmm0
+; X64-NEXT:    pandn %xmm0, %xmm1
+; X64-NEXT:    por %xmm6, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X64-AVX2-LABEL: vec_v4i32:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpextrd $1, %xmm0, %eax
-; X64-AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shll %cl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarl %cl, %esi
-; X64-AVX2-NEXT:    xorl %edi, %edi
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    sets %dil
-; X64-AVX2-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
-; X64-AVX2-NEXT:    cmpl %esi, %eax
-; X64-AVX2-NEXT:    cmovel %edx, %edi
-; X64-AVX2-NEXT:    vmovd %xmm0, %eax
-; X64-AVX2-NEXT:    vmovd %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shll %cl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarl %cl, %esi
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X64-AVX2-NEXT:    cmpl %esi, %eax
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX2-NEXT:    vpinsrd $1, %edi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrd $2, %xmm0, %eax
-; X64-AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shll %cl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarl %cl, %esi
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X64-AVX2-NEXT:    cmpl %esi, %eax
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrd $2, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrd $3, %xmm0, %eax
-; X64-AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shll %cl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarl %cl, %esi
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X64-AVX2-NEXT:    cmpl %esi, %eax
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrd $3, %ecx, %xmm2, %xmm0
+; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
+; X64-AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm3, %xmm2
+; X64-AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm3
+; X64-AVX2-NEXT:    vpsravd %xmm1, %xmm3, %xmm1
+; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
 ; X64-AVX2-NEXT:    retq
 ;
 ; X86-LABEL: vec_v4i32:
@@ -376,241 +277,84 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; X64-LABEL: vec_v8i16:
 ; X64:       # %bb.0:
-; X64-NEXT:    pextrw $7, %xmm0, %eax
-; X64-NEXT:    pextrw $7, %xmm1, %ecx
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movswl %dx, %esi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    sarl %cl, %esi
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testw %ax, %ax
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X64-NEXT:    cmpw %si, %ax
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm2
-; X64-NEXT:    pextrw $6, %xmm0, %eax
-; X64-NEXT:    pextrw $6, %xmm1, %ecx
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movswl %dx, %esi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    sarl %cl, %esi
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testw %ax, %ax
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X64-NEXT:    cmpw %si, %ax
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm3
-; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X64-NEXT:    pextrw $5, %xmm0, %eax
-; X64-NEXT:    pextrw $5, %xmm1, %ecx
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movswl %dx, %esi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    sarl %cl, %esi
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testw %ax, %ax
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X64-NEXT:    cmpw %si, %ax
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm4
-; X64-NEXT:    pextrw $4, %xmm0, %eax
-; X64-NEXT:    pextrw $4, %xmm1, %ecx
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movswl %dx, %esi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    sarl %cl, %esi
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testw %ax, %ax
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X64-NEXT:    cmpw %si, %ax
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm2
-; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X64-NEXT:    pextrw $3, %xmm0, %eax
-; X64-NEXT:    pextrw $3, %xmm1, %ecx
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movswl %dx, %esi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    sarl %cl, %esi
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testw %ax, %ax
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X64-NEXT:    cmpw %si, %ax
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm4
-; X64-NEXT:    pextrw $2, %xmm0, %eax
-; X64-NEXT:    pextrw $2, %xmm1, %ecx
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movswl %dx, %esi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    sarl %cl, %esi
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testw %ax, %ax
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X64-NEXT:    cmpw %si, %ax
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm3
-; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X64-NEXT:    pextrw $1, %xmm0, %eax
-; X64-NEXT:    pextrw $1, %xmm1, %ecx
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movswl %dx, %esi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    sarl %cl, %esi
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testw %ax, %ax
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X64-NEXT:    cmpw %si, %ax
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm4
-; X64-NEXT:    movd %xmm0, %eax
-; X64-NEXT:    movd %xmm1, %ecx
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movswl %dx, %esi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    sarl %cl, %esi
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testw %ax, %ax
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X64-NEXT:    cmpw %si, %ax
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm0
-; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X64-NEXT:    movdqa %xmm1, %xmm2
+; X64-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; X64-NEXT:    pslld $23, %xmm2
+; X64-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X64-NEXT:    paddd %xmm3, %xmm2
+; X64-NEXT:    cvttps2dq %xmm2, %xmm2
+; X64-NEXT:    pslld $16, %xmm2
+; X64-NEXT:    psrad $16, %xmm2
+; X64-NEXT:    movdqa %xmm1, %xmm4
+; X64-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
+; X64-NEXT:    pslld $23, %xmm4
+; X64-NEXT:    paddd %xmm3, %xmm4
+; X64-NEXT:    cvttps2dq %xmm4, %xmm3
+; X64-NEXT:    pslld $16, %xmm3
+; X64-NEXT:    psrad $16, %xmm3
+; X64-NEXT:    packssdw %xmm2, %xmm3
+; X64-NEXT:    pmullw %xmm0, %xmm3
+; X64-NEXT:    psllw $12, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm2
+; X64-NEXT:    psraw $15, %xmm2
+; X64-NEXT:    movdqa %xmm3, %xmm4
+; X64-NEXT:    psraw $8, %xmm4
+; X64-NEXT:    pand %xmm2, %xmm4
+; X64-NEXT:    pandn %xmm3, %xmm2
+; X64-NEXT:    por %xmm4, %xmm2
+; X64-NEXT:    paddw %xmm1, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm4
+; X64-NEXT:    psraw $15, %xmm4
+; X64-NEXT:    movdqa %xmm4, %xmm5
+; X64-NEXT:    pandn %xmm2, %xmm5
+; X64-NEXT:    psraw $4, %xmm2
+; X64-NEXT:    pand %xmm4, %xmm2
+; X64-NEXT:    por %xmm5, %xmm2
+; X64-NEXT:    paddw %xmm1, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm4
+; X64-NEXT:    psraw $15, %xmm4
+; X64-NEXT:    movdqa %xmm4, %xmm5
+; X64-NEXT:    pandn %xmm2, %xmm5
+; X64-NEXT:    psraw $2, %xmm2
+; X64-NEXT:    pand %xmm4, %xmm2
+; X64-NEXT:    por %xmm5, %xmm2
+; X64-NEXT:    paddw %xmm1, %xmm1
+; X64-NEXT:    psraw $15, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm4
+; X64-NEXT:    pandn %xmm2, %xmm4
+; X64-NEXT:    psraw $1, %xmm2
+; X64-NEXT:    pand %xmm1, %xmm2
+; X64-NEXT:    por %xmm4, %xmm2
+; X64-NEXT:    pcmpeqw %xmm0, %xmm2
+; X64-NEXT:    pand %xmm2, %xmm3
+; X64-NEXT:    pxor %xmm1, %xmm1
+; X64-NEXT:    pcmpgtw %xmm0, %xmm1
+; X64-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    por %xmm1, %xmm0
+; X64-NEXT:    pandn %xmm0, %xmm2
+; X64-NEXT:    por %xmm3, %xmm2
+; X64-NEXT:    movdqa %xmm2, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X64-AVX2-LABEL: vec_v8i16:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpextrw $1, %xmm0, %edx
-; X64-AVX2-NEXT:    vpextrw $1, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shll %cl, %esi
-; X64-AVX2-NEXT:    movswl %si, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarl %cl, %edi
-; X64-AVX2-NEXT:    xorl %eax, %eax
-; X64-AVX2-NEXT:    testw %dx, %dx
-; X64-AVX2-NEXT:    sets %al
-; X64-AVX2-NEXT:    addl $32767, %eax # imm = 0x7FFF
-; X64-AVX2-NEXT:    cmpw %di, %dx
-; X64-AVX2-NEXT:    cmovel %esi, %eax
-; X64-AVX2-NEXT:    vmovd %xmm0, %edx
-; X64-AVX2-NEXT:    vmovd %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shll %cl, %esi
-; X64-AVX2-NEXT:    movswl %si, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarl %cl, %edi
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testw %dx, %dx
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X64-AVX2-NEXT:    cmpw %di, %dx
-; X64-AVX2-NEXT:    cmovel %esi, %ecx
-; X64-AVX2-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX2-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrw $2, %xmm0, %eax
-; X64-AVX2-NEXT:    vpextrw $2, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shll %cl, %edx
-; X64-AVX2-NEXT:    movswl %dx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarl %cl, %esi
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testw %ax, %ax
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X64-AVX2-NEXT:    cmpw %si, %ax
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrw $2, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrw $3, %xmm0, %eax
-; X64-AVX2-NEXT:    vpextrw $3, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shll %cl, %edx
-; X64-AVX2-NEXT:    movswl %dx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarl %cl, %esi
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testw %ax, %ax
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X64-AVX2-NEXT:    cmpw %si, %ax
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrw $4, %xmm0, %eax
-; X64-AVX2-NEXT:    vpextrw $4, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shll %cl, %edx
-; X64-AVX2-NEXT:    movswl %dx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarl %cl, %esi
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testw %ax, %ax
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X64-AVX2-NEXT:    cmpw %si, %ax
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrw $4, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrw $5, %xmm0, %eax
-; X64-AVX2-NEXT:    vpextrw $5, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shll %cl, %edx
-; X64-AVX2-NEXT:    movswl %dx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarl %cl, %esi
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testw %ax, %ax
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X64-AVX2-NEXT:    cmpw %si, %ax
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrw $6, %xmm0, %eax
-; X64-AVX2-NEXT:    vpextrw $6, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shll %cl, %edx
-; X64-AVX2-NEXT:    movswl %dx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarl %cl, %esi
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testw %ax, %ax
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X64-AVX2-NEXT:    cmpw %si, %ax
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrw $7, %xmm0, %eax
-; X64-AVX2-NEXT:    vpextrw $7, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shll %cl, %edx
-; X64-AVX2-NEXT:    movswl %dx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarl %cl, %esi
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testw %ax, %ax
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X64-AVX2-NEXT:    cmpw %si, %ax
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrw $7, %ecx, %xmm2, %xmm0
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-AVX2-NEXT:    vpsllvd %ymm1, %ymm2, %ymm2
+; X64-AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; X64-AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; X64-AVX2-NEXT:    vpmovsxwd %xmm2, %ymm3
+; X64-AVX2-NEXT:    vpsravd %ymm1, %ymm3, %ymm1
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; X64-AVX2-NEXT:    vpackssdw %xmm3, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; X64-AVX2-NEXT:    vpcmpgtw %xmm0, %xmm3, %xmm0
+; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX2-NEXT:    vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm0
+; X64-AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
 ;
 ; X86-LABEL: vec_v8i16:
@@ -748,492 +492,137 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X64-LABEL: vec_v16i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shlb %cl, %dl
-; X64-NEXT:    movzbl %dl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    sarb %cl, %sil
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $127, %ecx
-; X64-NEXT:    cmpb %sil, %al
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm0
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shlb %cl, %dl
-; X64-NEXT:    movzbl %dl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    sarb %cl, %sil
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $127, %ecx
-; X64-NEXT:    cmpb %sil, %al
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm1
-; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shlb %cl, %dl
-; X64-NEXT:    movzbl %dl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    sarb %cl, %sil
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $127, %ecx
-; X64-NEXT:    cmpb %sil, %al
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm2
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shlb %cl, %dl
-; X64-NEXT:    movzbl %dl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    sarb %cl, %sil
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $127, %ecx
-; X64-NEXT:    cmpb %sil, %al
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm0
-; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shlb %cl, %dl
-; X64-NEXT:    movzbl %dl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    sarb %cl, %sil
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $127, %ecx
-; X64-NEXT:    cmpb %sil, %al
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm1
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shlb %cl, %dl
-; X64-NEXT:    movzbl %dl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    sarb %cl, %sil
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $127, %ecx
-; X64-NEXT:    cmpb %sil, %al
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm2
-; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shlb %cl, %dl
-; X64-NEXT:    movzbl %dl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    sarb %cl, %sil
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $127, %ecx
-; X64-NEXT:    cmpb %sil, %al
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm3
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shlb %cl, %dl
-; X64-NEXT:    movzbl %dl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    sarb %cl, %sil
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $127, %ecx
-; X64-NEXT:    cmpb %sil, %al
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm1
-; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shlb %cl, %dl
-; X64-NEXT:    movzbl %dl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    sarb %cl, %sil
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $127, %ecx
-; X64-NEXT:    cmpb %sil, %al
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm2
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shlb %cl, %dl
-; X64-NEXT:    movzbl %dl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    sarb %cl, %sil
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $127, %ecx
-; X64-NEXT:    cmpb %sil, %al
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm0
-; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shlb %cl, %dl
-; X64-NEXT:    movzbl %dl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    sarb %cl, %sil
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $127, %ecx
-; X64-NEXT:    cmpb %sil, %al
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm3
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shlb %cl, %dl
-; X64-NEXT:    movzbl %dl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    sarb %cl, %sil
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $127, %ecx
-; X64-NEXT:    cmpb %sil, %al
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm2
-; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shlb %cl, %dl
-; X64-NEXT:    movzbl %dl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    sarb %cl, %sil
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $127, %ecx
-; X64-NEXT:    cmpb %sil, %al
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm0
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shlb %cl, %dl
-; X64-NEXT:    movzbl %dl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    sarb %cl, %sil
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $127, %ecx
-; X64-NEXT:    cmpb %sil, %al
-; X64-NEXT:    cmovel %edx, %ecx
-; X64-NEXT:    movd %ecx, %xmm3
-; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    shlb %cl, %dil
-; X64-NEXT:    movzbl %dil, %edi
-; X64-NEXT:    movl %edi, %r8d
-; X64-NEXT:    sarb %cl, %r8b
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testb %sil, %sil
-; X64-NEXT:    sets %cl
-; X64-NEXT:    addl $127, %ecx
-; X64-NEXT:    cmpb %r8b, %sil
-; X64-NEXT:    cmovel %edi, %ecx
-; X64-NEXT:    movd %ecx, %xmm4
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    shlb %cl, %sil
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    sarb %cl, %dil
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    testb %dl, %dl
-; X64-NEXT:    sets %al
-; X64-NEXT:    addl $127, %eax
-; X64-NEXT:    cmpb %dil, %dl
-; X64-NEXT:    cmovel %esi, %eax
-; X64-NEXT:    movd %eax, %xmm0
-; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT:    psllw $5, %xmm1
+; X64-NEXT:    pxor %xmm3, %xmm3
+; X64-NEXT:    pxor %xmm4, %xmm4
+; X64-NEXT:    pcmpgtb %xmm1, %xmm4
+; X64-NEXT:    movdqa %xmm0, %xmm2
+; X64-NEXT:    psllw $4, %xmm2
+; X64-NEXT:    pand %xmm4, %xmm2
+; X64-NEXT:    pandn %xmm0, %xmm4
+; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; X64-NEXT:    por %xmm4, %xmm2
+; X64-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; X64-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; X64-NEXT:    paddb %xmm1, %xmm1
+; X64-NEXT:    pxor %xmm6, %xmm6
+; X64-NEXT:    pcmpgtb %xmm1, %xmm6
+; X64-NEXT:    movdqa %xmm6, %xmm7
+; X64-NEXT:    pandn %xmm2, %xmm7
+; X64-NEXT:    psllw $2, %xmm2
+; X64-NEXT:    pand %xmm6, %xmm2
+; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; X64-NEXT:    por %xmm7, %xmm2
+; X64-NEXT:    paddb %xmm1, %xmm1
+; X64-NEXT:    pxor %xmm6, %xmm6
+; X64-NEXT:    pcmpgtb %xmm1, %xmm6
+; X64-NEXT:    movdqa %xmm6, %xmm1
+; X64-NEXT:    pandn %xmm2, %xmm1
+; X64-NEXT:    paddb %xmm2, %xmm2
+; X64-NEXT:    pand %xmm6, %xmm2
+; X64-NEXT:    por %xmm1, %xmm2
+; X64-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
+; X64-NEXT:    pxor %xmm1, %xmm1
+; X64-NEXT:    pcmpgtw %xmm4, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm7
+; X64-NEXT:    pandn %xmm6, %xmm7
+; X64-NEXT:    psraw $4, %xmm6
+; X64-NEXT:    pand %xmm1, %xmm6
+; X64-NEXT:    por %xmm7, %xmm6
+; X64-NEXT:    paddw %xmm4, %xmm4
+; X64-NEXT:    pxor %xmm1, %xmm1
+; X64-NEXT:    pcmpgtw %xmm4, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm7
+; X64-NEXT:    pandn %xmm6, %xmm7
+; X64-NEXT:    psraw $2, %xmm6
+; X64-NEXT:    pand %xmm1, %xmm6
+; X64-NEXT:    por %xmm7, %xmm6
+; X64-NEXT:    paddw %xmm4, %xmm4
+; X64-NEXT:    pxor %xmm1, %xmm1
+; X64-NEXT:    pcmpgtw %xmm4, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm4
+; X64-NEXT:    pandn %xmm6, %xmm4
+; X64-NEXT:    psraw $1, %xmm6
+; X64-NEXT:    pand %xmm1, %xmm6
+; X64-NEXT:    por %xmm4, %xmm6
+; X64-NEXT:    psrlw $8, %xmm6
+; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-NEXT:    pxor %xmm4, %xmm4
+; X64-NEXT:    pcmpgtw %xmm5, %xmm4
+; X64-NEXT:    movdqa %xmm4, %xmm7
+; X64-NEXT:    pandn %xmm1, %xmm7
+; X64-NEXT:    psraw $4, %xmm1
+; X64-NEXT:    pand %xmm4, %xmm1
+; X64-NEXT:    por %xmm7, %xmm1
+; X64-NEXT:    paddw %xmm5, %xmm5
+; X64-NEXT:    pxor %xmm4, %xmm4
+; X64-NEXT:    pcmpgtw %xmm5, %xmm4
+; X64-NEXT:    movdqa %xmm4, %xmm7
+; X64-NEXT:    pandn %xmm1, %xmm7
+; X64-NEXT:    psraw $2, %xmm1
+; X64-NEXT:    pand %xmm4, %xmm1
+; X64-NEXT:    por %xmm7, %xmm1
+; X64-NEXT:    paddw %xmm5, %xmm5
+; X64-NEXT:    pxor %xmm4, %xmm4
+; X64-NEXT:    pcmpgtw %xmm5, %xmm4
+; X64-NEXT:    movdqa %xmm4, %xmm5
+; X64-NEXT:    pandn %xmm1, %xmm5
+; X64-NEXT:    psraw $1, %xmm1
+; X64-NEXT:    pand %xmm4, %xmm1
+; X64-NEXT:    por %xmm5, %xmm1
+; X64-NEXT:    psrlw $8, %xmm1
+; X64-NEXT:    packuswb %xmm6, %xmm1
+; X64-NEXT:    pcmpeqb %xmm0, %xmm1
+; X64-NEXT:    pand %xmm1, %xmm2
+; X64-NEXT:    pcmpgtb %xmm0, %xmm3
+; X64-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    por %xmm3, %xmm0
+; X64-NEXT:    pandn %xmm0, %xmm1
+; X64-NEXT:    por %xmm2, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X64-AVX2-LABEL: vec_v16i8:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpextrb $1, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $1, %xmm0, %edx
-; X64-AVX2-NEXT:    movl %edx, %eax
-; X64-AVX2-NEXT:    shlb %cl, %al
-; X64-AVX2-NEXT:    movzbl %al, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarb %cl, %dil
-; X64-AVX2-NEXT:    xorl %eax, %eax
-; X64-AVX2-NEXT:    testb %dl, %dl
-; X64-AVX2-NEXT:    sets %al
-; X64-AVX2-NEXT:    addl $127, %eax
-; X64-AVX2-NEXT:    cmpb %dil, %dl
-; X64-AVX2-NEXT:    cmovel %esi, %eax
-; X64-AVX2-NEXT:    vmovd %xmm1, %ecx
-; X64-AVX2-NEXT:    vmovd %xmm0, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shlb %cl, %sil
-; X64-AVX2-NEXT:    movzbl %sil, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarb %cl, %dil
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testb %dl, %dl
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $127, %ecx
-; X64-AVX2-NEXT:    cmpb %dil, %dl
-; X64-AVX2-NEXT:    cmovel %esi, %ecx
-; X64-AVX2-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX2-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $2, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $2, %xmm0, %eax
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shlb %cl, %dl
-; X64-AVX2-NEXT:    movzbl %dl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarb %cl, %sil
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testb %al, %al
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $127, %ecx
-; X64-AVX2-NEXT:    cmpb %sil, %al
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $3, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $3, %xmm0, %eax
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shlb %cl, %dl
-; X64-AVX2-NEXT:    movzbl %dl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarb %cl, %sil
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testb %al, %al
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $127, %ecx
-; X64-AVX2-NEXT:    cmpb %sil, %al
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $4, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $4, %xmm0, %eax
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shlb %cl, %dl
-; X64-AVX2-NEXT:    movzbl %dl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarb %cl, %sil
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testb %al, %al
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $127, %ecx
-; X64-AVX2-NEXT:    cmpb %sil, %al
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $5, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $5, %xmm0, %eax
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shlb %cl, %dl
-; X64-AVX2-NEXT:    movzbl %dl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarb %cl, %sil
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testb %al, %al
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $127, %ecx
-; X64-AVX2-NEXT:    cmpb %sil, %al
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $6, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $6, %xmm0, %eax
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shlb %cl, %dl
-; X64-AVX2-NEXT:    movzbl %dl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarb %cl, %sil
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testb %al, %al
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $127, %ecx
-; X64-AVX2-NEXT:    cmpb %sil, %al
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $7, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $7, %xmm0, %eax
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shlb %cl, %dl
-; X64-AVX2-NEXT:    movzbl %dl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarb %cl, %sil
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testb %al, %al
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $127, %ecx
-; X64-AVX2-NEXT:    cmpb %sil, %al
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrb $7, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $8, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $8, %xmm0, %eax
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shlb %cl, %dl
-; X64-AVX2-NEXT:    movzbl %dl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarb %cl, %sil
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testb %al, %al
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $127, %ecx
-; X64-AVX2-NEXT:    cmpb %sil, %al
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrb $8, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $9, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $9, %xmm0, %eax
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shlb %cl, %dl
-; X64-AVX2-NEXT:    movzbl %dl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarb %cl, %sil
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testb %al, %al
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $127, %ecx
-; X64-AVX2-NEXT:    cmpb %sil, %al
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrb $9, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $10, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $10, %xmm0, %eax
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shlb %cl, %dl
-; X64-AVX2-NEXT:    movzbl %dl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarb %cl, %sil
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testb %al, %al
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $127, %ecx
-; X64-AVX2-NEXT:    cmpb %sil, %al
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrb $10, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $11, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $11, %xmm0, %eax
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shlb %cl, %dl
-; X64-AVX2-NEXT:    movzbl %dl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarb %cl, %sil
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testb %al, %al
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $127, %ecx
-; X64-AVX2-NEXT:    cmpb %sil, %al
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $12, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $12, %xmm0, %eax
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shlb %cl, %dl
-; X64-AVX2-NEXT:    movzbl %dl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarb %cl, %sil
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testb %al, %al
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $127, %ecx
-; X64-AVX2-NEXT:    cmpb %sil, %al
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrb $12, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $13, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $13, %xmm0, %eax
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shlb %cl, %dl
-; X64-AVX2-NEXT:    movzbl %dl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarb %cl, %sil
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testb %al, %al
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $127, %ecx
-; X64-AVX2-NEXT:    cmpb %sil, %al
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrb $13, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $14, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $14, %xmm0, %eax
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shlb %cl, %dl
-; X64-AVX2-NEXT:    movzbl %dl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarb %cl, %sil
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testb %al, %al
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $127, %ecx
-; X64-AVX2-NEXT:    cmpb %sil, %al
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $15, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $15, %xmm0, %eax
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shlb %cl, %dl
-; X64-AVX2-NEXT:    movzbl %dl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    sarb %cl, %sil
-; X64-AVX2-NEXT:    xorl %ecx, %ecx
-; X64-AVX2-NEXT:    testb %al, %al
-; X64-AVX2-NEXT:    sets %cl
-; X64-AVX2-NEXT:    addl $127, %ecx
-; X64-AVX2-NEXT:    cmpb %sil, %al
-; X64-AVX2-NEXT:    cmovel %edx, %ecx
-; X64-AVX2-NEXT:    vpinsrb $15, %ecx, %xmm2, %xmm0
+; X64-AVX2-NEXT:    vpsllw $5, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpsllw $4, %xmm0, %xmm2
+; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; X64-AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm2
+; X64-AVX2-NEXT:    vpsllw $2, %xmm2, %xmm3
+; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; X64-AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm4
+; X64-AVX2-NEXT:    vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; X64-AVX2-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
+; X64-AVX2-NEXT:    vpaddb %xmm4, %xmm4, %xmm4
+; X64-AVX2-NEXT:    vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; X64-AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X64-AVX2-NEXT:    vpsraw $4, %xmm3, %xmm4
+; X64-AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X64-AVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
+; X64-AVX2-NEXT:    vpsraw $2, %xmm3, %xmm4
+; X64-AVX2-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
+; X64-AVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
+; X64-AVX2-NEXT:    vpsraw $1, %xmm3, %xmm4
+; X64-AVX2-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
+; X64-AVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; X64-AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-AVX2-NEXT:    vpsraw $4, %xmm4, %xmm5
+; X64-AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-AVX2-NEXT:    vpblendvb %xmm1, %xmm5, %xmm4, %xmm4
+; X64-AVX2-NEXT:    vpsraw $2, %xmm4, %xmm5
+; X64-AVX2-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpblendvb %xmm1, %xmm5, %xmm4, %xmm4
+; X64-AVX2-NEXT:    vpsraw $1, %xmm4, %xmm5
+; X64-AVX2-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpblendvb %xmm1, %xmm5, %xmm4, %xmm1
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX2-NEXT:    vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm0
+; X64-AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    retq
 ;
 ; X86-LABEL: vec_v16i8:

diff  --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
index ad9fc85eaeb18..5246c8b14b80c 100644
--- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
@@ -11,57 +11,31 @@ declare <16 x i8> @llvm.ushl.sat.v16i8(<16 x i8>, <16 x i8>)
 define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; X64-LABEL: vec_v2i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %xmm0, %rax
-; X64-NEXT:    movq %xmm1, %rcx
-; X64-NEXT:    movq %rax, %rdx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NEXT:    shrq %cl, %rsi
-; X64-NEXT:    cmpq %rsi, %rax
-; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    cmovneq %rax, %rdx
-; X64-NEXT:    movq %rdx, %xmm2
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NEXT:    movq %xmm0, %rdx
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm0, %rcx
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    shlq %cl, %rsi
-; X64-NEXT:    movq %rsi, %rdi
-; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NEXT:    shrq %cl, %rdi
-; X64-NEXT:    cmpq %rdi, %rdx
-; X64-NEXT:    cmovneq %rax, %rsi
-; X64-NEXT:    movq %rsi, %xmm0
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; X64-NEXT:    movdqa %xmm2, %xmm0
+; X64-NEXT:    movdqa %xmm0, %xmm2
+; X64-NEXT:    psllq %xmm1, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; X64-NEXT:    movdqa %xmm0, %xmm4
+; X64-NEXT:    psllq %xmm3, %xmm4
+; X64-NEXT:    movdqa %xmm4, %xmm5
+; X64-NEXT:    movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1]
+; X64-NEXT:    psrlq %xmm1, %xmm2
+; X64-NEXT:    psrlq %xmm3, %xmm5
+; X64-NEXT:    movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1]
+; X64-NEXT:    pcmpeqd %xmm5, %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X64-NEXT:    pand %xmm1, %xmm0
+; X64-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    por %xmm4, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X64-AVX2-LABEL: vec_v2i64:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; X64-AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
-; X64-AVX2-NEXT:    movq %rax, %rdx
-; X64-AVX2-NEXT:    shlq %cl, %rdx
-; X64-AVX2-NEXT:    movq %rdx, %rsi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-AVX2-NEXT:    shrq %cl, %rsi
-; X64-AVX2-NEXT:    cmpq %rsi, %rax
-; X64-AVX2-NEXT:    movq $-1, %rax
-; X64-AVX2-NEXT:    cmovneq %rax, %rdx
-; X64-AVX2-NEXT:    vmovq %rdx, %xmm2
-; X64-AVX2-NEXT:    vmovq %xmm0, %rdx
-; X64-AVX2-NEXT:    vmovq %xmm1, %rcx
-; X64-AVX2-NEXT:    movq %rdx, %rsi
-; X64-AVX2-NEXT:    shlq %cl, %rsi
-; X64-AVX2-NEXT:    movq %rsi, %rdi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-AVX2-NEXT:    shrq %cl, %rdi
-; X64-AVX2-NEXT:    cmpq %rdi, %rdx
-; X64-AVX2-NEXT:    cmovneq %rax, %rsi
-; X64-AVX2-NEXT:    vmovq %rsi, %xmm0
-; X64-AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X64-AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm2
+; X64-AVX2-NEXT:    vpsrlvq %xmm1, %xmm2, %xmm1
+; X64-AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
 ; X64-AVX2-NEXT:    retq
 ;
 ; X86-LABEL: vec_v2i64:
@@ -147,102 +121,45 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-LABEL: vec_v4i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; X64-NEXT:    movd %xmm2, %eax
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X64-NEXT:    movd %xmm2, %ecx
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrl %cl, %esi
-; X64-NEXT:    cmpl %esi, %eax
-; X64-NEXT:    movl $-1, %eax
-; X64-NEXT:    cmovnel %eax, %edx
-; X64-NEXT:    movd %edx, %xmm2
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X64-NEXT:    movd %xmm3, %edx
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movd %xmm3, %ecx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrl %cl, %edi
-; X64-NEXT:    cmpl %edi, %edx
-; X64-NEXT:    cmovnel %eax, %esi
-; X64-NEXT:    movd %esi, %xmm3
-; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X64-NEXT:    movd %xmm0, %edx
-; X64-NEXT:    movd %xmm1, %ecx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrl %cl, %edi
-; X64-NEXT:    cmpl %edi, %edx
-; X64-NEXT:    cmovnel %eax, %esi
-; X64-NEXT:    movd %esi, %xmm2
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X64-NEXT:    movd %xmm0, %edx
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X64-NEXT:    movd %xmm0, %ecx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrl %cl, %edi
-; X64-NEXT:    cmpl %edi, %edx
-; X64-NEXT:    cmovnel %eax, %esi
-; X64-NEXT:    movd %esi, %xmm0
-; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X64-NEXT:    movdqa %xmm2, %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; X64-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; X64-NEXT:    pslld $23, %xmm1
+; X64-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-NEXT:    cvttps2dq %xmm1, %xmm1
+; X64-NEXT:    movdqa %xmm0, %xmm5
+; X64-NEXT:    pmuludq %xmm1, %xmm5
+; X64-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X64-NEXT:    pmuludq %xmm7, %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X64-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
+; X64-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
+; X64-NEXT:    movdqa %xmm6, %xmm7
+; X64-NEXT:    psrld %xmm1, %xmm7
+; X64-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; X64-NEXT:    movdqa %xmm5, %xmm2
+; X64-NEXT:    psrld %xmm1, %xmm2
+; X64-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1]
+; X64-NEXT:    movdqa %xmm6, %xmm1
+; X64-NEXT:    psrld %xmm3, %xmm1
+; X64-NEXT:    psrld %xmm4, %xmm5
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0]
+; X64-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[0,3]
+; X64-NEXT:    pcmpeqd %xmm5, %xmm0
+; X64-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    por %xmm6, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X64-AVX2-LABEL: vec_v4i32:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpextrd $1, %xmm0, %eax
-; X64-AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shll %cl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrl %cl, %esi
-; X64-AVX2-NEXT:    cmpl %esi, %eax
-; X64-AVX2-NEXT:    movl $-1, %eax
-; X64-AVX2-NEXT:    cmovnel %eax, %edx
-; X64-AVX2-NEXT:    vmovd %xmm0, %esi
-; X64-AVX2-NEXT:    vmovd %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    shll %cl, %edi
-; X64-AVX2-NEXT:    movl %edi, %r8d
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrl %cl, %r8d
-; X64-AVX2-NEXT:    cmpl %r8d, %esi
-; X64-AVX2-NEXT:    cmovnel %eax, %edi
-; X64-AVX2-NEXT:    vmovd %edi, %xmm2
-; X64-AVX2-NEXT:    vpinsrd $1, %edx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrd $2, %xmm0, %edx
-; X64-AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shll %cl, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrl %cl, %edi
-; X64-AVX2-NEXT:    cmpl %edi, %edx
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrd $2, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrd $3, %xmm0, %edx
-; X64-AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shll %cl, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrl %cl, %edi
-; X64-AVX2-NEXT:    cmpl %edi, %edx
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrd $3, %esi, %xmm2, %xmm0
+; X64-AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
+; X64-AVX2-NEXT:    vpsrlvd %xmm1, %xmm2, %xmm1
+; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; X64-AVX2-NEXT:    retq
 ;
 ; X86-LABEL: vec_v4i32:
@@ -303,195 +220,76 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; X64-LABEL: vec_v8i16:
 ; X64:       # %bb.0:
-; X64-NEXT:    pextrw $7, %xmm0, %eax
-; X64-NEXT:    pextrw $7, %xmm1, %ecx
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movzwl %dx, %edx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrl %cl, %esi
-; X64-NEXT:    cmpw %si, %ax
-; X64-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; X64-NEXT:    cmovnel %eax, %edx
-; X64-NEXT:    movd %edx, %xmm2
-; X64-NEXT:    pextrw $6, %xmm0, %edx
-; X64-NEXT:    pextrw $6, %xmm1, %ecx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movzwl %si, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrl %cl, %edi
-; X64-NEXT:    cmpw %di, %dx
-; X64-NEXT:    cmovnel %eax, %esi
-; X64-NEXT:    movd %esi, %xmm3
-; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X64-NEXT:    pextrw $5, %xmm0, %edx
-; X64-NEXT:    pextrw $5, %xmm1, %ecx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movzwl %si, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrl %cl, %edi
-; X64-NEXT:    cmpw %di, %dx
-; X64-NEXT:    cmovnel %eax, %esi
-; X64-NEXT:    movd %esi, %xmm4
-; X64-NEXT:    pextrw $4, %xmm0, %edx
-; X64-NEXT:    pextrw $4, %xmm1, %ecx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movzwl %si, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrl %cl, %edi
-; X64-NEXT:    cmpw %di, %dx
-; X64-NEXT:    cmovnel %eax, %esi
-; X64-NEXT:    movd %esi, %xmm2
-; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X64-NEXT:    pextrw $3, %xmm0, %edx
-; X64-NEXT:    pextrw $3, %xmm1, %ecx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movzwl %si, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrl %cl, %edi
-; X64-NEXT:    cmpw %di, %dx
-; X64-NEXT:    cmovnel %eax, %esi
-; X64-NEXT:    movd %esi, %xmm4
-; X64-NEXT:    pextrw $2, %xmm0, %edx
-; X64-NEXT:    pextrw $2, %xmm1, %ecx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movzwl %si, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrl %cl, %edi
-; X64-NEXT:    cmpw %di, %dx
-; X64-NEXT:    cmovnel %eax, %esi
-; X64-NEXT:    movd %esi, %xmm3
-; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X64-NEXT:    pextrw $1, %xmm0, %edx
-; X64-NEXT:    pextrw $1, %xmm1, %ecx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movzwl %si, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrl %cl, %edi
-; X64-NEXT:    cmpw %di, %dx
-; X64-NEXT:    cmovnel %eax, %esi
-; X64-NEXT:    movd %esi, %xmm4
-; X64-NEXT:    movd %xmm0, %edx
-; X64-NEXT:    movd %xmm1, %ecx
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movzwl %si, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrl %cl, %edi
-; X64-NEXT:    cmpw %di, %dx
-; X64-NEXT:    cmovnel %eax, %esi
-; X64-NEXT:    movd %esi, %xmm0
-; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X64-NEXT:    movdqa %xmm1, %xmm2
+; X64-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; X64-NEXT:    pslld $23, %xmm2
+; X64-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X64-NEXT:    paddd %xmm3, %xmm2
+; X64-NEXT:    cvttps2dq %xmm2, %xmm4
+; X64-NEXT:    pslld $16, %xmm4
+; X64-NEXT:    psrad $16, %xmm4
+; X64-NEXT:    movdqa %xmm1, %xmm2
+; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; X64-NEXT:    pslld $23, %xmm2
+; X64-NEXT:    paddd %xmm3, %xmm2
+; X64-NEXT:    cvttps2dq %xmm2, %xmm2
+; X64-NEXT:    pslld $16, %xmm2
+; X64-NEXT:    psrad $16, %xmm2
+; X64-NEXT:    packssdw %xmm4, %xmm2
+; X64-NEXT:    pmullw %xmm0, %xmm2
+; X64-NEXT:    psllw $12, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm3
+; X64-NEXT:    psraw $15, %xmm3
+; X64-NEXT:    movdqa %xmm2, %xmm4
+; X64-NEXT:    psrlw $8, %xmm4
+; X64-NEXT:    pand %xmm3, %xmm4
+; X64-NEXT:    pandn %xmm2, %xmm3
+; X64-NEXT:    por %xmm4, %xmm3
+; X64-NEXT:    paddw %xmm1, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm4
+; X64-NEXT:    psraw $15, %xmm4
+; X64-NEXT:    movdqa %xmm4, %xmm5
+; X64-NEXT:    pandn %xmm3, %xmm5
+; X64-NEXT:    psrlw $4, %xmm3
+; X64-NEXT:    pand %xmm4, %xmm3
+; X64-NEXT:    por %xmm5, %xmm3
+; X64-NEXT:    paddw %xmm1, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm4
+; X64-NEXT:    psraw $15, %xmm4
+; X64-NEXT:    movdqa %xmm4, %xmm5
+; X64-NEXT:    pandn %xmm3, %xmm5
+; X64-NEXT:    psrlw $2, %xmm3
+; X64-NEXT:    pand %xmm4, %xmm3
+; X64-NEXT:    por %xmm5, %xmm3
+; X64-NEXT:    paddw %xmm1, %xmm1
+; X64-NEXT:    psraw $15, %xmm1
+; X64-NEXT:    movdqa %xmm1, %xmm4
+; X64-NEXT:    pandn %xmm3, %xmm4
+; X64-NEXT:    psrlw $1, %xmm3
+; X64-NEXT:    pand %xmm1, %xmm3
+; X64-NEXT:    por %xmm4, %xmm3
+; X64-NEXT:    pcmpeqw %xmm3, %xmm0
+; X64-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    por %xmm2, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X64-AVX2-LABEL: vec_v8i16:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpextrw $1, %xmm0, %eax
-; X64-AVX2-NEXT:    vpextrw $1, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shll %cl, %edx
-; X64-AVX2-NEXT:    movzwl %dx, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrl %cl, %esi
-; X64-AVX2-NEXT:    cmpw %si, %ax
-; X64-AVX2-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; X64-AVX2-NEXT:    cmovnel %eax, %edx
-; X64-AVX2-NEXT:    vmovd %xmm0, %esi
-; X64-AVX2-NEXT:    vmovd %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    shll %cl, %edi
-; X64-AVX2-NEXT:    movzwl %di, %edi
-; X64-AVX2-NEXT:    movl %edi, %r8d
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrl %cl, %r8d
-; X64-AVX2-NEXT:    cmpw %r8w, %si
-; X64-AVX2-NEXT:    cmovnel %eax, %edi
-; X64-AVX2-NEXT:    vmovd %edi, %xmm2
-; X64-AVX2-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrw $2, %xmm0, %edx
-; X64-AVX2-NEXT:    vpextrw $2, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shll %cl, %esi
-; X64-AVX2-NEXT:    movzwl %si, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrl %cl, %edi
-; X64-AVX2-NEXT:    cmpw %di, %dx
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrw $2, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrw $3, %xmm0, %edx
-; X64-AVX2-NEXT:    vpextrw $3, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shll %cl, %esi
-; X64-AVX2-NEXT:    movzwl %si, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrl %cl, %edi
-; X64-AVX2-NEXT:    cmpw %di, %dx
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrw $3, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrw $4, %xmm0, %edx
-; X64-AVX2-NEXT:    vpextrw $4, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shll %cl, %esi
-; X64-AVX2-NEXT:    movzwl %si, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrl %cl, %edi
-; X64-AVX2-NEXT:    cmpw %di, %dx
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrw $4, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrw $5, %xmm0, %edx
-; X64-AVX2-NEXT:    vpextrw $5, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shll %cl, %esi
-; X64-AVX2-NEXT:    movzwl %si, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrl %cl, %edi
-; X64-AVX2-NEXT:    cmpw %di, %dx
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrw $5, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrw $6, %xmm0, %edx
-; X64-AVX2-NEXT:    vpextrw $6, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shll %cl, %esi
-; X64-AVX2-NEXT:    movzwl %si, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrl %cl, %edi
-; X64-AVX2-NEXT:    cmpw %di, %dx
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrw $6, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrw $7, %xmm0, %edx
-; X64-AVX2-NEXT:    vpextrw $7, %xmm1, %ecx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shll %cl, %esi
-; X64-AVX2-NEXT:    movzwl %si, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrl %cl, %edi
-; X64-AVX2-NEXT:    cmpw %di, %dx
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrw $7, %esi, %xmm2, %xmm0
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-AVX2-NEXT:    vpsllvd %ymm1, %ymm2, %ymm2
+; X64-AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; X64-AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; X64-AVX2-NEXT:    vpsrlvd %ymm1, %ymm3, %ymm1
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; X64-AVX2-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
 ;
 ; X86-LABEL: vec_v8i16:
@@ -610,366 +408,82 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X64-LABEL: vec_v16i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shlb %cl, %dl
-; X64-NEXT:    movzbl %dl, %esi
-; X64-NEXT:    movl %esi, %edx
-; X64-NEXT:    shrb %cl, %dl
-; X64-NEXT:    cmpb %dl, %al
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    cmovnel %edx, %esi
-; X64-NEXT:    movd %esi, %xmm0
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    shlb %cl, %sil
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    cmpb %dil, %al
-; X64-NEXT:    cmovnel %edx, %esi
-; X64-NEXT:    movd %esi, %xmm1
-; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    shlb %cl, %sil
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    cmpb %dil, %al
-; X64-NEXT:    cmovnel %edx, %esi
-; X64-NEXT:    movd %esi, %xmm2
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    shlb %cl, %sil
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    cmpb %dil, %al
-; X64-NEXT:    cmovnel %edx, %esi
-; X64-NEXT:    movd %esi, %xmm0
-; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    shlb %cl, %sil
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    cmpb %dil, %al
-; X64-NEXT:    cmovnel %edx, %esi
-; X64-NEXT:    movd %esi, %xmm1
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    shlb %cl, %sil
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    cmpb %dil, %al
-; X64-NEXT:    cmovnel %edx, %esi
-; X64-NEXT:    movd %esi, %xmm2
-; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    shlb %cl, %sil
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    cmpb %dil, %al
-; X64-NEXT:    cmovnel %edx, %esi
-; X64-NEXT:    movd %esi, %xmm3
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    shlb %cl, %sil
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    cmpb %dil, %al
-; X64-NEXT:    cmovnel %edx, %esi
-; X64-NEXT:    movd %esi, %xmm1
-; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    shlb %cl, %sil
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    cmpb %dil, %al
-; X64-NEXT:    cmovnel %edx, %esi
-; X64-NEXT:    movd %esi, %xmm2
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    shlb %cl, %sil
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    cmpb %dil, %al
-; X64-NEXT:    cmovnel %edx, %esi
-; X64-NEXT:    movd %esi, %xmm0
-; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    shlb %cl, %sil
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    cmpb %dil, %al
-; X64-NEXT:    cmovnel %edx, %esi
-; X64-NEXT:    movd %esi, %xmm3
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    shlb %cl, %sil
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    cmpb %dil, %al
-; X64-NEXT:    cmovnel %edx, %esi
-; X64-NEXT:    movd %esi, %xmm2
-; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    shlb %cl, %sil
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    cmpb %dil, %al
-; X64-NEXT:    cmovnel %edx, %esi
-; X64-NEXT:    movd %esi, %xmm0
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    shlb %cl, %sil
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    cmpb %dil, %al
-; X64-NEXT:    cmovnel %edx, %esi
-; X64-NEXT:    movd %esi, %xmm3
-; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
-; X64-NEXT:    movl %edi, %r8d
-; X64-NEXT:    shlb %cl, %r8b
-; X64-NEXT:    movzbl %r8b, %r8d
-; X64-NEXT:    movl %r8d, %r9d
-; X64-NEXT:    shrb %cl, %r9b
-; X64-NEXT:    cmpb %r9b, %dil
-; X64-NEXT:    cmovnel %edx, %r8d
-; X64-NEXT:    movd %r8d, %xmm4
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    shlb %cl, %dil
-; X64-NEXT:    movzbl %dil, %edi
-; X64-NEXT:    movl %edi, %r8d
-; X64-NEXT:    shrb %cl, %r8b
-; X64-NEXT:    cmpb %r8b, %sil
-; X64-NEXT:    cmovnel %edx, %edi
-; X64-NEXT:    movd %edi, %xmm0
-; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT:    psllw $5, %xmm1
+; X64-NEXT:    pxor %xmm3, %xmm3
+; X64-NEXT:    pxor %xmm4, %xmm4
+; X64-NEXT:    pcmpgtb %xmm1, %xmm4
+; X64-NEXT:    movdqa %xmm4, %xmm5
+; X64-NEXT:    pandn %xmm0, %xmm5
+; X64-NEXT:    movdqa %xmm0, %xmm2
+; X64-NEXT:    psllw $4, %xmm2
+; X64-NEXT:    pand %xmm4, %xmm2
+; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; X64-NEXT:    por %xmm5, %xmm2
+; X64-NEXT:    paddb %xmm1, %xmm1
+; X64-NEXT:    pxor %xmm5, %xmm5
+; X64-NEXT:    pcmpgtb %xmm1, %xmm5
+; X64-NEXT:    movdqa %xmm5, %xmm6
+; X64-NEXT:    pandn %xmm2, %xmm6
+; X64-NEXT:    psllw $2, %xmm2
+; X64-NEXT:    pand %xmm5, %xmm2
+; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; X64-NEXT:    por %xmm6, %xmm2
+; X64-NEXT:    paddb %xmm1, %xmm1
+; X64-NEXT:    pcmpgtb %xmm1, %xmm3
+; X64-NEXT:    movdqa %xmm3, %xmm1
+; X64-NEXT:    pandn %xmm2, %xmm1
+; X64-NEXT:    paddb %xmm2, %xmm2
+; X64-NEXT:    pand %xmm3, %xmm2
+; X64-NEXT:    por %xmm1, %xmm2
+; X64-NEXT:    movdqa %xmm2, %xmm1
+; X64-NEXT:    psrlw $4, %xmm1
+; X64-NEXT:    pand %xmm4, %xmm1
+; X64-NEXT:    pandn %xmm2, %xmm4
+; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-NEXT:    por %xmm4, %xmm1
+; X64-NEXT:    movdqa %xmm5, %xmm4
+; X64-NEXT:    pandn %xmm1, %xmm4
+; X64-NEXT:    psrlw $2, %xmm1
+; X64-NEXT:    pand %xmm5, %xmm1
+; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-NEXT:    por %xmm4, %xmm1
+; X64-NEXT:    movdqa %xmm3, %xmm4
+; X64-NEXT:    pandn %xmm1, %xmm4
+; X64-NEXT:    psrlw $1, %xmm1
+; X64-NEXT:    pand %xmm3, %xmm1
+; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-NEXT:    por %xmm4, %xmm1
+; X64-NEXT:    pcmpeqb %xmm1, %xmm0
+; X64-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    por %xmm2, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X64-AVX2-LABEL: vec_v16i8:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpextrb $1, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $1, %xmm0, %eax
-; X64-AVX2-NEXT:    movl %eax, %edx
-; X64-AVX2-NEXT:    shlb %cl, %dl
-; X64-AVX2-NEXT:    movzbl %dl, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrb %cl, %sil
-; X64-AVX2-NEXT:    cmpb %sil, %al
-; X64-AVX2-NEXT:    movl $255, %eax
-; X64-AVX2-NEXT:    cmovnel %eax, %edx
-; X64-AVX2-NEXT:    vmovd %xmm1, %ecx
-; X64-AVX2-NEXT:    vmovd %xmm0, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    shlb %cl, %dil
-; X64-AVX2-NEXT:    movzbl %dil, %edi
-; X64-AVX2-NEXT:    movl %edi, %r8d
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrb %cl, %r8b
-; X64-AVX2-NEXT:    cmpb %r8b, %sil
-; X64-AVX2-NEXT:    cmovnel %eax, %edi
-; X64-AVX2-NEXT:    vmovd %edi, %xmm2
-; X64-AVX2-NEXT:    vpinsrb $1, %edx, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $2, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $2, %xmm0, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shlb %cl, %sil
-; X64-AVX2-NEXT:    movzbl %sil, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrb %cl, %dil
-; X64-AVX2-NEXT:    cmpb %dil, %dl
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrb $2, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $3, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $3, %xmm0, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shlb %cl, %sil
-; X64-AVX2-NEXT:    movzbl %sil, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrb %cl, %dil
-; X64-AVX2-NEXT:    cmpb %dil, %dl
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrb $3, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $4, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $4, %xmm0, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shlb %cl, %sil
-; X64-AVX2-NEXT:    movzbl %sil, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrb %cl, %dil
-; X64-AVX2-NEXT:    cmpb %dil, %dl
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrb $4, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $5, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $5, %xmm0, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shlb %cl, %sil
-; X64-AVX2-NEXT:    movzbl %sil, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrb %cl, %dil
-; X64-AVX2-NEXT:    cmpb %dil, %dl
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrb $5, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $6, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $6, %xmm0, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shlb %cl, %sil
-; X64-AVX2-NEXT:    movzbl %sil, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrb %cl, %dil
-; X64-AVX2-NEXT:    cmpb %dil, %dl
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrb $6, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $7, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $7, %xmm0, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shlb %cl, %sil
-; X64-AVX2-NEXT:    movzbl %sil, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrb %cl, %dil
-; X64-AVX2-NEXT:    cmpb %dil, %dl
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrb $7, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $8, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $8, %xmm0, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shlb %cl, %sil
-; X64-AVX2-NEXT:    movzbl %sil, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrb %cl, %dil
-; X64-AVX2-NEXT:    cmpb %dil, %dl
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrb $8, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $9, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $9, %xmm0, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shlb %cl, %sil
-; X64-AVX2-NEXT:    movzbl %sil, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrb %cl, %dil
-; X64-AVX2-NEXT:    cmpb %dil, %dl
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrb $9, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $10, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $10, %xmm0, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shlb %cl, %sil
-; X64-AVX2-NEXT:    movzbl %sil, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrb %cl, %dil
-; X64-AVX2-NEXT:    cmpb %dil, %dl
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrb $10, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $11, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $11, %xmm0, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shlb %cl, %sil
-; X64-AVX2-NEXT:    movzbl %sil, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrb %cl, %dil
-; X64-AVX2-NEXT:    cmpb %dil, %dl
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrb $11, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $12, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $12, %xmm0, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shlb %cl, %sil
-; X64-AVX2-NEXT:    movzbl %sil, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrb %cl, %dil
-; X64-AVX2-NEXT:    cmpb %dil, %dl
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrb $12, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $13, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $13, %xmm0, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shlb %cl, %sil
-; X64-AVX2-NEXT:    movzbl %sil, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrb %cl, %dil
-; X64-AVX2-NEXT:    cmpb %dil, %dl
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrb $13, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $14, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $14, %xmm0, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shlb %cl, %sil
-; X64-AVX2-NEXT:    movzbl %sil, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrb %cl, %dil
-; X64-AVX2-NEXT:    cmpb %dil, %dl
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrb $14, %esi, %xmm2, %xmm2
-; X64-AVX2-NEXT:    vpextrb $15, %xmm1, %ecx
-; X64-AVX2-NEXT:    vpextrb $15, %xmm0, %edx
-; X64-AVX2-NEXT:    movl %edx, %esi
-; X64-AVX2-NEXT:    shlb %cl, %sil
-; X64-AVX2-NEXT:    movzbl %sil, %esi
-; X64-AVX2-NEXT:    movl %esi, %edi
-; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT:    shrb %cl, %dil
-; X64-AVX2-NEXT:    cmpb %dil, %dl
-; X64-AVX2-NEXT:    cmovnel %eax, %esi
-; X64-AVX2-NEXT:    vpinsrb $15, %esi, %xmm2, %xmm0
+; X64-AVX2-NEXT:    vpsllw $5, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpsllw $4, %xmm0, %xmm2
+; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; X64-AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm2
+; X64-AVX2-NEXT:    vpsllw $2, %xmm2, %xmm3
+; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; X64-AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm4
+; X64-AVX2-NEXT:    vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; X64-AVX2-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
+; X64-AVX2-NEXT:    vpaddb %xmm4, %xmm4, %xmm5
+; X64-AVX2-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
+; X64-AVX2-NEXT:    vpsrlw $4, %xmm2, %xmm3
+; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; X64-AVX2-NEXT:    vpblendvb %xmm1, %xmm3, %xmm2, %xmm1
+; X64-AVX2-NEXT:    vpsrlw $2, %xmm1, %xmm3
+; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; X64-AVX2-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpsrlw $1, %xmm1, %xmm3
+; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; X64-AVX2-NEXT:    vpblendvb %xmm5, %xmm3, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpblendvb %xmm0, %xmm2, %xmm1, %xmm0
 ; X64-AVX2-NEXT:    retq
 ;
 ; X86-LABEL: vec_v16i8:


        

