[llvm] r271996 - [InstCombine][AVX2] Add support for simplifying AVX2 per-element shifts to native shifts

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 7 03:27:15 PDT 2016


Author: rksimon
Date: Tue Jun  7 05:27:15 2016
New Revision: 271996

URL: http://llvm.org/viewvc/llvm-project?rev=271996&view=rev
Log:
[InstCombine][AVX2] Add support for simplifying AVX2 per-element shifts to native shifts

Unlike native IR shifts, which give an undefined result when the shift amount is >= the element bit width, the AVX2 per-element shift instructions VPSRAV/VPSRLV/VPSLLV have defined behaviour for out-of-range shift amounts (logical shifts set the result to zero, arithmetic shifts splat the sign bit).

If the shift amounts are constant we can sometimes convert these intrinsics to native shifts:

1 - if all shift amounts are in range then the conversion is trivial.
2 - out-of-range arithmetic shift amounts can be clamped to (bitwidth - 1) (a legal shift amount) before conversion.
3 - logical shifts simply return zero if all elements have out-of-range shift amounts.

In addition, UNDEF shift amounts are handled - either as an UNDEF shift amount in the native shift, or as an UNDEF element in the 'all out of range' zero/undef constant special case for logical shifts.
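
For illustration, two of the cases, taken from the updated x86-vector-shifts.ll tests below: an arithmetic shift with an out-of-range amount is clamped to (bitwidth - 1), while a logical shift whose amounts are all out of range (or UNDEF) folds to a zero/undef constant:

  %1 = tail call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32> <i32 0, i32 8, i32 16, i32 64>)
    --> %1 = ashr <4 x i32> %v, <i32 0, i32 8, i32 16, i32 31>

  %1 = tail call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> <i32 32, i32 100, i32 -255, i32 undef>)
    --> <4 x i32> <i32 0, i32 0, i32 0, i32 undef>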

Differential Revision: http://reviews.llvm.org/D19675

Modified:
    llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
    llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll

Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp?rev=271996&r1=271995&r2=271996&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp Tue Jun  7 05:27:15 2016
@@ -325,6 +325,117 @@ static Value *simplifyX86immShift(const
   return Builder.CreateAShr(Vec, ShiftVec);
 }
 
+// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
+// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
+// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
+static Value *simplifyX86varShift(const IntrinsicInst &II,
+                                  InstCombiner::BuilderTy &Builder) {
+  bool LogicalShift = false;
+  bool ShiftLeft = false;
+
+  switch (II.getIntrinsicID()) {
+  default:
+    return nullptr;
+  case Intrinsic::x86_avx2_psrav_d:
+  case Intrinsic::x86_avx2_psrav_d_256:
+    LogicalShift = false;
+    ShiftLeft = false;
+    break;
+  case Intrinsic::x86_avx2_psrlv_d:
+  case Intrinsic::x86_avx2_psrlv_d_256:
+  case Intrinsic::x86_avx2_psrlv_q:
+  case Intrinsic::x86_avx2_psrlv_q_256:
+    LogicalShift = true;
+    ShiftLeft = false;
+    break;
+  case Intrinsic::x86_avx2_psllv_d:
+  case Intrinsic::x86_avx2_psllv_d_256:
+  case Intrinsic::x86_avx2_psllv_q:
+  case Intrinsic::x86_avx2_psllv_q_256:
+    LogicalShift = true;
+    ShiftLeft = true;
+    break;
+  }
+  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
+
+  // Simplify if all shift amounts are constant/undef.
+  auto *CShift = dyn_cast<Constant>(II.getArgOperand(1));
+  if (!CShift)
+    return nullptr;
+
+  auto Vec = II.getArgOperand(0);
+  auto VT = cast<VectorType>(II.getType());
+  auto SVT = VT->getVectorElementType();
+  int NumElts = VT->getNumElements();
+  int BitWidth = SVT->getIntegerBitWidth();
+
+  // Collect each element's shift amount.
+  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
+  bool AnyOutOfRange = false;
+  SmallVector<int, 8> ShiftAmts;
+  for (int I = 0; I < NumElts; ++I) {
+    auto *CElt = CShift->getAggregateElement(I);
+    if (CElt && isa<UndefValue>(CElt)) {
+      ShiftAmts.push_back(-1);
+      continue;
+    }
+
+    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
+    if (!COp)
+      return nullptr;
+
+    // Handle out of range shifts.
+    // If LogicalShift - set to BitWidth (special case).
+    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
+    APInt ShiftVal = COp->getValue();
+    if (ShiftVal.uge(BitWidth)) {
+      AnyOutOfRange = LogicalShift;
+      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
+      continue;
+    }
+
+    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
+  }
+
+  // If all elements out of range or UNDEF, return vector of zeros/undefs.
+  // ArithmeticShift should only hit this if they are all UNDEF.
+  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
+  if (llvm::all_of(ShiftAmts, OutOfRange)) {
+    SmallVector<Constant *, 8> ConstantVec;
+    for (int Idx : ShiftAmts) {
+      if (Idx < 0) {
+        ConstantVec.push_back(UndefValue::get(SVT));
+      } else {
+        assert(LogicalShift && "Logical shift expected");
+        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
+      }
+    }
+    return ConstantVector::get(ConstantVec);
+  }
+
+  // We can't handle only some out of range values with generic logical shifts.
+  if (AnyOutOfRange)
+    return nullptr;
+
+  // Build the shift amount constant vector.
+  SmallVector<Constant *, 8> ShiftVecAmts;
+  for (int Idx : ShiftAmts) {
+    if (Idx < 0)
+      ShiftVecAmts.push_back(UndefValue::get(SVT));
+    else
+      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
+  }
+  auto ShiftVec = ConstantVector::get(ShiftVecAmts);
+
+  if (ShiftLeft)
+    return Builder.CreateShl(Vec, ShiftVec);
+
+  if (LogicalShift)
+    return Builder.CreateLShr(Vec, ShiftVec);
+
+  return Builder.CreateAShr(Vec, ShiftVec);
+}
+
 static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                 InstCombiner::BuilderTy &Builder) {
   Value *Arg = II.getArgOperand(0);
@@ -1656,6 +1767,20 @@ Instruction *InstCombiner::visitCallInst
     break;
   }
 
+  case Intrinsic::x86_avx2_psllv_d:
+  case Intrinsic::x86_avx2_psllv_d_256:
+  case Intrinsic::x86_avx2_psllv_q:
+  case Intrinsic::x86_avx2_psllv_q_256:
+  case Intrinsic::x86_avx2_psrav_d:
+  case Intrinsic::x86_avx2_psrav_d_256:
+  case Intrinsic::x86_avx2_psrlv_d:
+  case Intrinsic::x86_avx2_psrlv_d_256:
+  case Intrinsic::x86_avx2_psrlv_q:
+  case Intrinsic::x86_avx2_psrlv_q_256:
+    if (Value *V = simplifyX86varShift(*II, *Builder))
+      return replaceInstUsesWith(*II, V);
+    break;
+
   case Intrinsic::x86_sse41_insertps:
     if (Value *V = simplifyX86insertps(*II, *Builder))
       return replaceInstUsesWith(*II, V);

Modified: llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll?rev=271996&r1=271995&r2=271996&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll (original)
+++ llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll Tue Jun  7 05:27:15 2016
@@ -940,8 +940,7 @@ define <4 x i64> @avx2_psll_q_64(<4 x i6
 
 define <4 x i32> @avx2_psrav_d_128_0(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrav_d_128_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> %v
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32> zeroinitializer)
   ret <4 x i32> %1
@@ -949,8 +948,7 @@ define <4 x i32> @avx2_psrav_d_128_0(<4
 
 define <8 x i32> @avx2_psrav_d_256_0(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrav_d_256_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %v, <8 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i32> %v
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %v, <8 x i32> zeroinitializer)
   ret <8 x i32> %1
@@ -958,7 +956,7 @@ define <8 x i32> @avx2_psrav_d_256_0(<8
 
 define <4 x i32> @avx2_psrav_d_128_var(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrav_d_128_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32> <i32 0, i32 8, i32 16, i32 64>)
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <4 x i32> %v, <i32 0, i32 8, i32 16, i32 31>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32> <i32 0, i32 8, i32 16, i32 64>)
@@ -967,7 +965,7 @@ define <4 x i32> @avx2_psrav_d_128_var(<
 
 define <8 x i32> @avx2_psrav_d_256_var(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrav_d_256_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 24, i32 8, i32 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> %v, <i32 0, i32 8, i32 16, i32 24, i32 31, i32 24, i32 8, i32 0>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 24, i32 8, i32 0>)
@@ -976,7 +974,7 @@ define <8 x i32> @avx2_psrav_d_256_var(<
 
 define <4 x i32> @avx2_psrav_d_128_allbig(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrav_d_128_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32> <i32 32, i32 100, i32 -255, i32 undef>)
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <4 x i32> %v, <i32 31, i32 31, i32 31, i32 undef>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32> <i32 32, i32 100, i32 -255, i32 undef>)
@@ -985,7 +983,7 @@ define <4 x i32> @avx2_psrav_d_128_allbi
 
 define <8 x i32> @avx2_psrav_d_256_allbig(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrav_d_256_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %v, <8 x i32> <i32 undef, i32 100, i32 255, i32 55555, i32 -32, i32 -100, i32 -255, i32 -55555>)
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> %v, <i32 undef, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %v, <8 x i32> <i32 undef, i32 100, i32 255, i32 55555, i32 -32, i32 -100, i32 -255, i32 -55555>)
@@ -994,7 +992,7 @@ define <8 x i32> @avx2_psrav_d_256_allbi
 
 define <4 x i32> @avx2_psrav_d_128_undef(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrav_d_128_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32> <i32 undef, i32 8, i32 16, i32 64>)
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <4 x i32> %v, <i32 undef, i32 8, i32 16, i32 31>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %1 = insertelement <4 x i32> <i32 0, i32 8, i32 16, i32 64>, i32 undef, i32 0
@@ -1004,7 +1002,7 @@ define <4 x i32> @avx2_psrav_d_128_undef
 
 define <8 x i32> @avx2_psrav_d_256_undef(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrav_d_256_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 undef, i32 16, i32 24, i32 32, i32 24, i32 8, i32 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> %v, <i32 0, i32 undef, i32 16, i32 24, i32 31, i32 24, i32 8, i32 0>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %1 = insertelement <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 24, i32 8, i32 0>, i32 undef, i32 1
@@ -1018,8 +1016,7 @@ define <8 x i32> @avx2_psrav_d_256_undef
 
 define <4 x i32> @avx2_psrlv_d_128_0(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrlv_d_128_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> %v
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> zeroinitializer)
   ret <4 x i32> %1
@@ -1027,8 +1024,7 @@ define <4 x i32> @avx2_psrlv_d_128_0(<4
 
 define <8 x i32> @avx2_psrlv_d_256_0(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrlv_d_256_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %v, <8 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i32> %v
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %v, <8 x i32> zeroinitializer)
   ret <8 x i32> %1
@@ -1036,7 +1032,7 @@ define <8 x i32> @avx2_psrlv_d_256_0(<8
 
 define <4 x i32> @avx2_psrlv_d_128_var(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrlv_d_128_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> <i32 0, i32 8, i32 16, i32 31>)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> %v, <i32 0, i32 8, i32 16, i32 31>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> <i32 0, i32 8, i32 16, i32 31>)
@@ -1045,7 +1041,7 @@ define <4 x i32> @avx2_psrlv_d_128_var(<
 
 define <8 x i32> @avx2_psrlv_d_256_var(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrlv_d_256_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 31, i32 24, i32 8, i32 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> %v, <i32 0, i32 8, i32 16, i32 24, i32 31, i32 24, i32 8, i32 0>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 31, i32 24, i32 8, i32 0>)
@@ -1072,8 +1068,7 @@ define <8 x i32> @avx2_psrlv_d_256_big(<
 
 define <4 x i32> @avx2_psrlv_d_128_allbig(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrlv_d_128_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> <i32 32, i32 100, i32 -255, i32 undef>)
-; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> <i32 0, i32 0, i32 0, i32 undef>
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> <i32 32, i32 100, i32 -255, i32 undef>)
   ret <4 x i32> %1
@@ -1081,8 +1076,7 @@ define <4 x i32> @avx2_psrlv_d_128_allbi
 
 define <8 x i32> @avx2_psrlv_d_256_allbig(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrlv_d_256_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %v, <8 x i32> <i32 undef, i32 100, i32 255, i32 55555, i32 -32, i32 -100, i32 -255, i32 -55555>)
-; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %v, <8 x i32> <i32 undef, i32 100, i32 255, i32 55555, i32 -32, i32 -100, i32 -255, i32 -55555>)
   ret <8 x i32> %1
@@ -1090,7 +1084,7 @@ define <8 x i32> @avx2_psrlv_d_256_allbi
 
 define <4 x i32> @avx2_psrlv_d_128_undef(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrlv_d_128_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> <i32 undef, i32 8, i32 16, i32 31>)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> %v, <i32 undef, i32 8, i32 16, i32 31>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %1 = insertelement <4 x i32> <i32 0, i32 8, i32 16, i32 31>, i32 undef, i32 0
@@ -1100,7 +1094,7 @@ define <4 x i32> @avx2_psrlv_d_128_undef
 
 define <8 x i32> @avx2_psrlv_d_256_undef(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrlv_d_256_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 undef, i32 16, i32 31, i32 31, i32 24, i32 8, i32 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> %v, <i32 0, i32 undef, i32 16, i32 31, i32 31, i32 24, i32 8, i32 0>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %1 = insertelement <8 x i32> <i32 0, i32 8, i32 16, i32 31, i32 31, i32 24, i32 8, i32 0>, i32 undef, i32 1
@@ -1110,8 +1104,7 @@ define <8 x i32> @avx2_psrlv_d_256_undef
 
 define <2 x i64> @avx2_psrlv_q_128_0(<2 x i64> %v) {
 ; CHECK-LABEL: @avx2_psrlv_q_128_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %v, <2 x i64> zeroinitializer)
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> %v
 ;
   %1 = tail call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %v, <2 x i64> zeroinitializer)
   ret <2 x i64> %1
@@ -1119,8 +1112,7 @@ define <2 x i64> @avx2_psrlv_q_128_0(<2
 
 define <4 x i64> @avx2_psrlv_q_256_0(<4 x i64> %v) {
 ; CHECK-LABEL: @avx2_psrlv_q_256_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %v, <4 x i64> zeroinitializer)
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i64> %v
 ;
   %1 = tail call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %v, <4 x i64> zeroinitializer)
   ret <4 x i64> %1
@@ -1128,7 +1120,7 @@ define <4 x i64> @avx2_psrlv_q_256_0(<4
 
 define <2 x i64> @avx2_psrlv_q_128_var(<2 x i64> %v) {
 ; CHECK-LABEL: @avx2_psrlv_q_128_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %v, <2 x i64> <i64 0, i64 8>)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <2 x i64> %v, <i64 0, i64 8>
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
 ;
   %1 = tail call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %v, <2 x i64> <i64 0, i64 8>)
@@ -1137,7 +1129,7 @@ define <2 x i64> @avx2_psrlv_q_128_var(<
 
 define <4 x i64> @avx2_psrlv_q_256_var(<4 x i64> %v) {
 ; CHECK-LABEL: @avx2_psrlv_q_256_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %v, <4 x i64> <i64 0, i64 8, i64 16, i64 31>)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i64> %v, <i64 0, i64 8, i64 16, i64 31>
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
 ;
   %1 = tail call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %v, <4 x i64> <i64 0, i64 8, i64 16, i64 31>)
@@ -1164,8 +1156,7 @@ define <4 x i64> @avx2_psrlv_q_256_big(<
 
 define <2 x i64> @avx2_psrlv_q_128_allbig(<2 x i64> %v) {
 ; CHECK-LABEL: @avx2_psrlv_q_128_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %v, <2 x i64> <i64 128, i64 -64>)
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
 ;
   %1 = tail call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %v, <2 x i64> <i64 128, i64 -64>)
   ret <2 x i64> %1
@@ -1173,8 +1164,7 @@ define <2 x i64> @avx2_psrlv_q_128_allbi
 
 define <4 x i64> @avx2_psrlv_q_256_allbig(<4 x i64> %v) {
 ; CHECK-LABEL: @avx2_psrlv_q_256_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %v, <4 x i64> <i64 64, i64 undef, i64 -128, i64 -60>)
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i64> <i64 0, i64 undef, i64 0, i64 0>
 ;
   %1 = tail call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %v, <4 x i64> <i64 64, i64 undef, i64 -128, i64 -60>)
   ret <4 x i64> %1
@@ -1182,7 +1172,7 @@ define <4 x i64> @avx2_psrlv_q_256_allbi
 
 define <2 x i64> @avx2_psrlv_q_128_undef(<2 x i64> %v) {
 ; CHECK-LABEL: @avx2_psrlv_q_128_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %v, <2 x i64> <i64 0, i64 undef>)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <2 x i64> %v, <i64 0, i64 undef>
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
 ;
   %1 = insertelement <2 x i64> <i64 0, i64 8>, i64 undef, i64 1
@@ -1192,7 +1182,7 @@ define <2 x i64> @avx2_psrlv_q_128_undef
 
 define <4 x i64> @avx2_psrlv_q_256_undef(<4 x i64> %v) {
 ; CHECK-LABEL: @avx2_psrlv_q_256_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %v, <4 x i64> <i64 undef, i64 8, i64 16, i64 31>)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i64> %v, <i64 undef, i64 8, i64 16, i64 31>
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
 ;
   %1 = insertelement <4 x i64> <i64 0, i64 8, i64 16, i64 31>, i64 undef, i64 0
@@ -1206,8 +1196,7 @@ define <4 x i64> @avx2_psrlv_q_256_undef
 
 define <4 x i32> @avx2_psllv_d_128_0(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psllv_d_128_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %v, <4 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> %v
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %v, <4 x i32> zeroinitializer)
   ret <4 x i32> %1
@@ -1215,8 +1204,7 @@ define <4 x i32> @avx2_psllv_d_128_0(<4
 
 define <8 x i32> @avx2_psllv_d_256_0(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psllv_d_256_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %v, <8 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i32> %v
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %v, <8 x i32> zeroinitializer)
   ret <8 x i32> %1
@@ -1224,7 +1212,7 @@ define <8 x i32> @avx2_psllv_d_256_0(<8
 
 define <4 x i32> @avx2_psllv_d_128_var(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psllv_d_128_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %v, <4 x i32> <i32 0, i32 8, i32 16, i32 31>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i32> %v, <i32 0, i32 8, i32 16, i32 31>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %v, <4 x i32> <i32 0, i32 8, i32 16, i32 31>)
@@ -1233,7 +1221,7 @@ define <4 x i32> @avx2_psllv_d_128_var(<
 
 define <8 x i32> @avx2_psllv_d_256_var(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psllv_d_256_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 31, i32 24, i32 8, i32 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i32> %v, <i32 0, i32 8, i32 16, i32 24, i32 31, i32 24, i32 8, i32 0>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 31, i32 24, i32 8, i32 0>)
@@ -1260,8 +1248,7 @@ define <8 x i32> @avx2_psllv_d_256_big(<
 
 define <4 x i32> @avx2_psllv_d_128_allbig(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psllv_d_128_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %v, <4 x i32> <i32 32, i32 100, i32 -255, i32 undef>)
-; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> <i32 0, i32 0, i32 0, i32 undef>
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %v, <4 x i32> <i32 32, i32 100, i32 -255, i32 undef>)
   ret <4 x i32> %1
@@ -1269,8 +1256,7 @@ define <4 x i32> @avx2_psllv_d_128_allbi
 
 define <8 x i32> @avx2_psllv_d_256_allbig(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psllv_d_256_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %v, <8 x i32> <i32 undef, i32 100, i32 255, i32 55555, i32 -32, i32 -100, i32 -255, i32 -55555>)
-; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %v, <8 x i32> <i32 undef, i32 100, i32 255, i32 55555, i32 -32, i32 -100, i32 -255, i32 -55555>)
   ret <8 x i32> %1
@@ -1278,7 +1264,7 @@ define <8 x i32> @avx2_psllv_d_256_allbi
 
 define <4 x i32> @avx2_psllv_d_128_undef(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psllv_d_128_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %v, <4 x i32> <i32 undef, i32 8, i32 16, i32 31>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i32> %v, <i32 undef, i32 8, i32 16, i32 31>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %1 = insertelement <4 x i32> <i32 0, i32 8, i32 16, i32 31>, i32 undef, i32 0
@@ -1288,7 +1274,7 @@ define <4 x i32> @avx2_psllv_d_128_undef
 
 define <8 x i32> @avx2_psllv_d_256_undef(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psllv_d_256_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 undef, i32 16, i32 31, i32 31, i32 24, i32 8, i32 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i32> %v, <i32 0, i32 undef, i32 16, i32 31, i32 31, i32 24, i32 8, i32 0>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %1 = insertelement <8 x i32> <i32 0, i32 8, i32 16, i32 31, i32 31, i32 24, i32 8, i32 0>, i32 undef, i32 1
@@ -1298,8 +1284,7 @@ define <8 x i32> @avx2_psllv_d_256_undef
 
 define <2 x i64> @avx2_psllv_q_128_0(<2 x i64> %v) {
 ; CHECK-LABEL: @avx2_psllv_q_128_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %v, <2 x i64> zeroinitializer)
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> %v
 ;
   %1 = tail call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %v, <2 x i64> zeroinitializer)
   ret <2 x i64> %1
@@ -1307,8 +1292,7 @@ define <2 x i64> @avx2_psllv_q_128_0(<2
 
 define <4 x i64> @avx2_psllv_q_256_0(<4 x i64> %v) {
 ; CHECK-LABEL: @avx2_psllv_q_256_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %v, <4 x i64> zeroinitializer)
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i64> %v
 ;
   %1 = tail call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %v, <4 x i64> zeroinitializer)
   ret <4 x i64> %1
@@ -1316,7 +1300,7 @@ define <4 x i64> @avx2_psllv_q_256_0(<4
 
 define <2 x i64> @avx2_psllv_q_128_var(<2 x i64> %v) {
 ; CHECK-LABEL: @avx2_psllv_q_128_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %v, <2 x i64> <i64 0, i64 8>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i64> %v, <i64 0, i64 8>
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
 ;
   %1 = tail call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %v, <2 x i64> <i64 0, i64 8>)
@@ -1325,7 +1309,7 @@ define <2 x i64> @avx2_psllv_q_128_var(<
 
 define <4 x i64> @avx2_psllv_q_256_var(<4 x i64> %v) {
 ; CHECK-LABEL: @avx2_psllv_q_256_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %v, <4 x i64> <i64 0, i64 8, i64 16, i64 31>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i64> %v, <i64 0, i64 8, i64 16, i64 31>
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
 ;
   %1 = tail call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %v, <4 x i64> <i64 0, i64 8, i64 16, i64 31>)
@@ -1352,8 +1336,7 @@ define <4 x i64> @avx2_psllv_q_256_big(<
 
 define <2 x i64> @avx2_psllv_q_128_allbig(<2 x i64> %v) {
 ; CHECK-LABEL: @avx2_psllv_q_128_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %v, <2 x i64> <i64 128, i64 -64>)
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
 ;
   %1 = tail call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %v, <2 x i64> <i64 128, i64 -64>)
   ret <2 x i64> %1
@@ -1361,8 +1344,7 @@ define <2 x i64> @avx2_psllv_q_128_allbi
 
 define <4 x i64> @avx2_psllv_q_256_allbig(<4 x i64> %v) {
 ; CHECK-LABEL: @avx2_psllv_q_256_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %v, <4 x i64> <i64 64, i64 undef, i64 -128, i64 -60>)
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i64> <i64 0, i64 undef, i64 0, i64 0>
 ;
   %1 = tail call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %v, <4 x i64> <i64 64, i64 undef, i64 -128, i64 -60>)
   ret <4 x i64> %1
@@ -1370,7 +1352,7 @@ define <4 x i64> @avx2_psllv_q_256_allbi
 
 define <2 x i64> @avx2_psllv_q_128_undef(<2 x i64> %v) {
 ; CHECK-LABEL: @avx2_psllv_q_128_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %v, <2 x i64> <i64 0, i64 undef>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i64> %v, <i64 0, i64 undef>
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
 ;
   %1 = insertelement <2 x i64> <i64 0, i64 8>, i64 undef, i64 1
@@ -1380,7 +1362,7 @@ define <2 x i64> @avx2_psllv_q_128_undef
 
 define <4 x i64> @avx2_psllv_q_256_undef(<4 x i64> %v) {
 ; CHECK-LABEL: @avx2_psllv_q_256_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %v, <4 x i64> <i64 undef, i64 8, i64 16, i64 31>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i64> %v, <i64 undef, i64 8, i64 16, i64 31>
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
 ;
   %1 = insertelement <4 x i64> <i64 0, i64 8, i64 16, i64 31>, i64 undef, i64 0
