[PATCH][InstCombine][X86] Improve the folding of calls to X86 packed shifts intrinsics.

Andrea Di Biagio andrea.dibiagio at gmail.com
Wed May 7 08:42:20 PDT 2014


Hi,

This patch teaches InstCombine how to fold a packed SSE2/AVX2 shift
intrinsic into its first operand if the shift count is a zero vector
(i.e. a 'ConstantAggregateZero').
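
For example (a minimal sketch; the value %v and the choice of psra.d are
only illustrative):

  %r = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> zeroinitializer)

Since the shift count is zeroinitializer, all uses of %r are simply
replaced by %v and the intrinsic call goes away.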

Also, this patch teaches InstCombine how to lower packed arithmetic
shift intrinsics to an 'ashr' instruction when the shift count is a
constant known to be smaller than the vector element size in bits.
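
For example (again a sketch; %v and the count 3 are illustrative):

  %r = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 3)

becomes

  %r = ashr <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>

i.e. an 'ashr' by a vector splat of the constant shift count.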

Please let me know if it is OK to submit.

Thanks,
Andrea Di Biagio
SN Systems - Sony Computer Entertainment Group
-------------- next part --------------
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp	(revision 208218)
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp	(working copy)
@@ -582,9 +582,21 @@
   case Intrinsic::x86_avx2_psrl_w:
   case Intrinsic::x86_avx2_psrli_d:
   case Intrinsic::x86_avx2_psrli_q:
-  case Intrinsic::x86_avx2_psrli_w: {
-    // Simplify if count is constant. To 0 if >= BitWidth,
-    // otherwise to shl/lshr.
+  case Intrinsic::x86_avx2_psrli_w:
+  case Intrinsic::x86_sse2_psra_w:
+  case Intrinsic::x86_sse2_psra_d:
+  case Intrinsic::x86_avx2_psra_w:
+  case Intrinsic::x86_avx2_psra_d:
+  case Intrinsic::x86_sse2_psrai_w:
+  case Intrinsic::x86_sse2_psrai_d:
+  case Intrinsic::x86_avx2_psrai_w:
+  case Intrinsic::x86_avx2_psrai_d: {
+    if (isa<ConstantAggregateZero>(II->getArgOperand(1)))
+      // Fold this vector shift to its first operand.
+      return ReplaceInstUsesWith(CI, II->getArgOperand(0));
+
+    // Simplify if count is constant. To 0 if >= BitWidth and
+    // this is a logical shift; otherwise to shl/lshr/ashr.
     auto CDV = dyn_cast<ConstantDataVector>(II->getArgOperand(1));
     auto CInt = dyn_cast<ConstantInt>(II->getArgOperand(1));
     if (!CDV && !CInt)
@@ -595,14 +607,8 @@
     else
       Count = CInt;
 
-    auto Vec = II->getArgOperand(0);
-    auto VT = cast<VectorType>(Vec->getType());
-    if (Count->getZExtValue() >
-        VT->getElementType()->getPrimitiveSizeInBits() - 1)
-      return ReplaceInstUsesWith(
-          CI, ConstantAggregateZero::get(Vec->getType()));
-
-    bool isPackedShiftLeft = true;
+    bool isPackedLogicalShiftRight = false;
+    bool isPackedArithmeticShift = false;
     switch (II->getIntrinsicID()) {
     default : break;
     case Intrinsic::x86_sse2_psrl_d:
@@ -616,17 +622,36 @@
     case Intrinsic::x86_avx2_psrl_w:
     case Intrinsic::x86_avx2_psrli_d:
     case Intrinsic::x86_avx2_psrli_q:
-    case Intrinsic::x86_avx2_psrli_w: isPackedShiftLeft = false; break;
+    case Intrinsic::x86_avx2_psrli_w: isPackedLogicalShiftRight = true; break;
+    case Intrinsic::x86_sse2_psra_w:
+    case Intrinsic::x86_sse2_psra_d:
+    case Intrinsic::x86_avx2_psra_w:
+    case Intrinsic::x86_avx2_psra_d:
+    case Intrinsic::x86_sse2_psrai_w:
+    case Intrinsic::x86_sse2_psrai_d:
+    case Intrinsic::x86_avx2_psrai_w:
+    case Intrinsic::x86_avx2_psrai_d: isPackedArithmeticShift = true; break;
     }
 
+    auto Vec = II->getArgOperand(0);
+    auto VT = cast<VectorType>(Vec->getType());
+    if (!isPackedArithmeticShift && (Count->getZExtValue() >
+        VT->getElementType()->getPrimitiveSizeInBits() - 1))
+      return ReplaceInstUsesWith(
+          CI, ConstantAggregateZero::get(Vec->getType()));
+
     unsigned VWidth = VT->getNumElements();
     // Get a constant vector of the same type as the first operand.
     auto VTCI = ConstantInt::get(VT->getElementType(), Count->getZExtValue());
-    if (isPackedShiftLeft)
-      return BinaryOperator::CreateShl(Vec,
+    if (isPackedLogicalShiftRight)
+      return BinaryOperator::CreateLShr(Vec,
           Builder->CreateVectorSplat(VWidth, VTCI));
 
-    return BinaryOperator::CreateLShr(Vec,
+    if (isPackedArithmeticShift)
+      return BinaryOperator::CreateAShr(Vec,
+          Builder->CreateVectorSplat(VWidth, VTCI));
+
+    return BinaryOperator::CreateShl(Vec,
         Builder->CreateVectorSplat(VWidth, VTCI));
   }
 
Index: test/Transforms/InstCombine/vec_demanded_elts.ll
===================================================================
--- test/Transforms/InstCombine/vec_demanded_elts.ll	(revision 208218)
+++ test/Transforms/InstCombine/vec_demanded_elts.ll	(working copy)
@@ -458,6 +458,7 @@
 ; CHECK: test_avx2_0
 ; CHECK: ret <4 x i64> zeroinitializer
 }
+
 define <2 x i64> @test_sse2_psrl_1() nounwind readnone uwtable {
   %S = bitcast i32 1 to i32
   %1 = zext i32 %S to i64
@@ -550,6 +551,223 @@
 ; CHECK: ret <4 x i64> zeroinitializer
 }
 
+define <8 x i16> @test_sse2_psra_1() nounwind readnone uwtable {
+  %S = bitcast i32 1 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> <i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048>, <8 x i16> %4)
+  %6 = bitcast <8 x i16> %5 to <4 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <4 x i32> %8 to <8 x i16>
+  %10 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %9, i32 %S)
+  %11 = bitcast <8 x i16> %10 to <4 x i32>
+  %12 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %11, i32 %S)
+  %13 = bitcast <4 x i32> %12 to <8 x i16>
+  ret <8 x i16> %13
+; CHECK: test_sse2_psra_1
+; CHECK: ret <8 x i16> <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>
+}
+
+define <4 x i64> @test_avx2_psra_1() nounwind readnone uwtable {
+  %S = bitcast i32 1 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
+  %6 = bitcast <16 x i16> %5 to <8 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <8 x i32> %8 to <16 x i16>
+  %10 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %9, i32 %S)
+  %11 = bitcast <16 x i16> %10 to <8 x i32>
+  %12 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %11, i32 %S)
+  %13 = bitcast <8 x i32> %12 to <4 x i64>
+  ret <4 x i64> %13
+; CHECK: test_avx2_psra_1
+; CHECK: ret <4 x i64> <i64 64, i64 128, i64 256, i64 512>
+}
+
+define <2 x i64> @test_sse2_psra_0() nounwind readnone uwtable {
+  %S = bitcast i32 128 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> <i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096>, <8 x i16> %4)
+  %6 = bitcast <8 x i16> %5 to <4 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <4 x i32> %8 to <8 x i16>
+  %10 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %9, i32 %S)
+  %11 = bitcast <8 x i16> %10 to <4 x i32>
+  %12 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %11, i32 %S)
+  %13 = bitcast <4 x i32> %12 to <2 x i64>
+  ret <2 x i64> %13
+; CHECK: test_sse2_psra_0
+; CHECK: ret <2 x i64> <i64 -1, i64 -1>
+}
+
+define <4 x i64> @test_avx2_psra_0() nounwind readnone uwtable {
+  %S = bitcast i32 128 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
+  %6 = bitcast <16 x i16> %5 to <8 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <8 x i32> %8 to <16 x i16>
+  %10 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %9, i32 %S)
+  %11 = bitcast <16 x i16> %10 to <8 x i32>
+  %12 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %11, i32 %S)
+  %13 = bitcast <8 x i32> %12 to <4 x i64>
+  ret <4 x i64> %13
+; CHECK: test_avx2_psra_0
+; CHECK: ret <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>
+}
+
+; Test that we correctly fold shifts by zero.
+
+define <8 x i16> @test_sse2_2() nounwind readnone uwtable {
+  %S = bitcast i32 0 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
+  %6 = bitcast <8 x i16> %5 to <4 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <4 x i32> %8 to <2 x i64>
+  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
+  %11 = bitcast <2 x i64> %10 to <8 x i16>
+  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
+  %13 = bitcast <8 x i16> %12 to <4 x i32>
+  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
+  %15 = bitcast <4 x i32> %14 to <2 x i64>
+  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
+  %17 = bitcast <2 x i64> %16 to <8 x i16>
+  ret <8 x i16> %17
+; CHECK: test_sse2_2
+; CHECK: ret <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
+}
+
+define <4 x i64> @test_avx2_2() nounwind readnone uwtable {
+  %S = bitcast i32 0 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
+  %6 = bitcast <16 x i16> %5 to <8 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <8 x i32> %8 to <4 x i64>
+  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
+  %11 = bitcast <4 x i64> %10 to <16 x i16>
+  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
+  %13 = bitcast <16 x i16> %12 to <8 x i32>
+  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
+  %15 = bitcast <8 x i32> %14 to <4 x i64>
+  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
+  ret <4 x i64> %16
+; CHECK: test_avx2_2
+; CHECK: ret <4 x i64> <i64 1, i64 2, i64 3, i64 4>
+}
+
+define <8 x i16> @test_sse2_psrl_2() nounwind readnone uwtable {
+  %S = bitcast i32 0 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048>, <8 x i16> %4)
+  %6 = bitcast <8 x i16> %5 to <4 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <4 x i32> %8 to <2 x i64>
+  %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
+  %11 = bitcast <2 x i64> %10 to <8 x i16>
+  %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
+  %13 = bitcast <8 x i16> %12 to <4 x i32>
+  %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
+  %15 = bitcast <4 x i32> %14 to <2 x i64>
+  %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
+  %17 = bitcast <2 x i64> %16 to <8 x i16>
+  ret <8 x i16> %17
+; CHECK: test_sse2_psrl_2
+; CHECK: ret <8 x i16> <i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048>
+}
+
+define <4 x i64> @test_avx2_psrl_2() nounwind readnone uwtable {
+  %S = bitcast i32 0 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
+  %6 = bitcast <16 x i16> %5 to <8 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <8 x i32> %8 to <4 x i64>
+  %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
+  %11 = bitcast <4 x i64> %10 to <16 x i16>
+  %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
+  %13 = bitcast <16 x i16> %12 to <8 x i32>
+  %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
+  %15 = bitcast <8 x i32> %14 to <4 x i64>
+  %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
+  ret <4 x i64> %16
+; CHECK: test_avx2_psrl_2
+; CHECK: ret <4 x i64> <i64 1024, i64 2048, i64 4096, i64 8192>
+}
+
+define <8 x i16> @test_sse2_psra_2() nounwind readnone uwtable {
+  %S = bitcast i32 0 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> <i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048>, <8 x i16> %4)
+  %6 = bitcast <8 x i16> %5 to <4 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <4 x i32> %8 to <8 x i16>
+  %10 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %9, i32 %S)
+  %11 = bitcast <8 x i16> %10 to <4 x i32>
+  %12 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %11, i32 %S)
+  %13 = bitcast <4 x i32> %12 to <8 x i16>
+  ret <8 x i16> %13
+; CHECK: test_sse2_psra_2
+; CHECK: ret <8 x i16> <i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048>
+}
+
+define <4 x i64> @test_avx2_psra_2() nounwind readnone uwtable {
+  %S = bitcast i32 0 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
+  %6 = bitcast <16 x i16> %5 to <8 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <8 x i32> %8 to <16 x i16>
+  %10 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %9, i32 %S)
+  %11 = bitcast <16 x i16> %10 to <8 x i32>
+  %12 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %11, i32 %S)
+  %13 = bitcast <8 x i32> %12 to <4 x i64>
+  ret <4 x i64> %13
+; CHECK: test_avx2_psra_2
+; CHECK: ret <4 x i64> <i64 1024, i64 2048, i64 4096, i64 8192>
+}
+
+
 declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1
 declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1
 declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1
@@ -574,5 +792,13 @@
 declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1
 declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1
 declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1
+declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) #1
+declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) #1
+declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) #1
+declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) #1
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) #1
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) #1
+declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) #1
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) #1
 
 attributes #1 = { nounwind readnone }

