[llvm] r292793 - [InstCombine][X86] Add MULDQ/MULUDQ constant folding support

Mon Jan 23 07:22:59 PST 2017

Author: rksimon
Date: Mon Jan 23 09:22:59 2017
New Revision: 292793

URL: http://llvm.org/viewvc/llvm-project?rev=292793&view=rev
Log:
[InstCombine][X86] Add MULDQ/MULUDQ constant folding support

Modified:
    llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
    llvm/trunk/test/Transforms/InstCombine/x86-muldq.ll

Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp?rev=292793&r1=292792&r2=292793&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp Mon Jan 23 09:22:59 2017
@@ -510,16 +510,53 @@ static Value *simplifyX86varShift(const
   return Builder.CreateAShr(Vec, ShiftVec);
 }
 
-static Value *simplifyX86muldq(const IntrinsicInst &II) {
+static Value *simplifyX86muldq(const IntrinsicInst &II,
+                               InstCombiner::BuilderTy &Builder) {
   Value *Arg0 = II.getArgOperand(0);
   Value *Arg1 = II.getArgOperand(1);
   Type *ResTy = II.getType();
+  assert(Arg0->getType()->getScalarSizeInBits() == 32 &&
+         Arg1->getType()->getScalarSizeInBits() == 32 &&
+         ResTy->getScalarSizeInBits() == 64 && "Unexpected muldq/muludq types");
 
   // muldq/muludq(undef, undef) -> zero (matches generic mul behavior)
   if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
     return ConstantAggregateZero::get(ResTy);
 
-  return nullptr;
+  // Constant folding.
+  // PMULDQ  = (mul(vXi64 sext(shuffle<0,2,..>(Arg0)),
+  //                vXi64 sext(shuffle<0,2,..>(Arg1))))
+  // PMULUDQ = (mul(vXi64 zext(shuffle<0,2,..>(Arg0)),
+  //                vXi64 zext(shuffle<0,2,..>(Arg1))))
+  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
+    return nullptr;
+
+  unsigned NumElts = ResTy->getVectorNumElements();
+  assert(Arg0->getType()->getVectorNumElements() == (2 * NumElts) &&
+         Arg1->getType()->getVectorNumElements() == (2 * NumElts) &&
+         "Unexpected muldq/muludq types");
+
+  unsigned IntrinsicID = II.getIntrinsicID();
+  bool IsSigned = (Intrinsic::x86_sse41_pmuldq == IntrinsicID ||
+                   Intrinsic::x86_avx2_pmul_dq == IntrinsicID ||
+                   Intrinsic::x86_avx512_pmul_dq_512 == IntrinsicID);
+
+  SmallVector<unsigned, 16> ShuffleMask;
+  for (unsigned i = 0; i != NumElts; ++i)
+    ShuffleMask.push_back(i * 2);
+
+  auto *LHS = Builder.CreateShuffleVector(Arg0, Arg0, ShuffleMask);
+  auto *RHS = Builder.CreateShuffleVector(Arg1, Arg1, ShuffleMask);
+
+  if (IsSigned) {
+    LHS = Builder.CreateSExt(LHS, ResTy);
+    RHS = Builder.CreateSExt(RHS, ResTy);
+  } else {
+    LHS = Builder.CreateZExt(LHS, ResTy);
+    RHS = Builder.CreateZExt(RHS, ResTy);
+  }
+
+  return Builder.CreateMul(LHS, RHS);
 }
 
 static Value *simplifyX86movmsk(const IntrinsicInst &II,
@@ -2154,7 +2191,7 @@ Instruction *InstCombiner::visitCallInst
   case Intrinsic::x86_avx2_pmulu_dq:
   case Intrinsic::x86_avx512_pmul_dq_512:
   case Intrinsic::x86_avx512_pmulu_dq_512: {
-    if (Value *V = simplifyX86muldq(*II))
+    if (Value *V = simplifyX86muldq(*II, *Builder))
       return replaceInstUsesWith(*II, V);
 
     unsigned VWidth = II->getType()->getVectorNumElements();

Modified: llvm/trunk/test/Transforms/InstCombine/x86-muldq.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/x86-muldq.ll?rev=292793&r1=292792&r2=292793&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/x86-muldq.ll (original)
+++ llvm/trunk/test/Transforms/InstCombine/x86-muldq.ll Mon Jan 23 09:22:59 2017
@@ -55,8 +55,7 @@ define <8 x i64> @undef_pmuldq_512(<16 x
 
 define <2 x i64> @undef_zero_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @undef_zero_pmuludq_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 undef>)
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
 ;
   %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> zeroinitializer)
   ret <2 x i64> %1
@@ -64,8 +63,7 @@ define <2 x i64> @undef_zero_pmuludq_128
 
 define <4 x i64> @undef_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @undef_zero_pmuludq_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>, <8 x i32> undef)
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
 ;
   %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> undef)
   ret <4 x i64> %1
@@ -73,8 +71,7 @@ define <4 x i64> @undef_zero_pmuludq_256
 
 define <8 x i64> @undef_zero_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: @undef_zero_pmuludq_512(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>)
-; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
 ;
   %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> zeroinitializer)
   ret <8 x i64> %1
@@ -82,8 +79,7 @@ define <8 x i64> @undef_zero_pmuludq_512
 
 define <2 x i64> @undef_zero_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @undef_zero_pmuldq_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> <i32 0, i32 undef, i32 0, i32 undef>, <4 x i32> undef)
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
 ;
   %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> zeroinitializer, <4 x i32> undef)
   ret <2 x i64> %1
@@ -91,8 +87,7 @@ define <2 x i64> @undef_zero_pmuldq_128(
 
 define <4 x i64> @undef_zero_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @undef_zero_pmuldq_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>)
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
 ;
   %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> zeroinitializer)
   ret <4 x i64> %1
@@ -100,8 +95,7 @@ define <4 x i64> @undef_zero_pmuldq_256(
 
 define <8 x i64> @undef_zero_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: @undef_zero_pmuldq_512(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>, <16 x i32> undef)
-; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
 ;
   %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> undef)
   ret <8 x i64> %1
@@ -113,8 +107,7 @@ define <8 x i64> @undef_zero_pmuldq_512(
 
 define <2 x i64> @fold_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @fold_pmuludq_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> <i32 -1, i32 undef, i32 -1, i32 undef>, <4 x i32> <i32 2147483647, i32 undef, i32 1, i32 undef>)
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> <i64 9223372030412324865, i64 4294967295>
 ;
   %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2147483647, i32 1, i32 1, i32 3>)
   ret <2 x i64> %1
@@ -122,8 +115,7 @@ define <2 x i64> @fold_pmuludq_128(<4 x
 
 define <4 x i64> @fold_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @fold_pmuludq_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>, <8 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>)
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
 ;
   %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer)
   ret <4 x i64> %1
@@ -131,8 +123,7 @@ define <4 x i64> @fold_pmuludq_256(<8 x
 
 define <8 x i64> @fold_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: @fold_pmuludq_512(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 -1, i32 undef, i32 65536, i32 undef, i32 -65536, i32 undef>, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 255, i32 undef, i32 65535, i32 undef, i32 0, i32 undef, i32 -65535, i32 undef, i32 2147483647, i32 undef, i32 65536, i32 undef>)
-; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i64> <i64 0, i64 0, i64 255, i64 131070, i64 0, i64 -281474976645121, i64 140737488289792, i64 281470681743360>
 ;
   %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> <i32 0, i32 0, i32 undef, i32 0, i32 1, i32 1, i32 2, i32 2, i32 undef, i32 undef, i32 -1, i32 -1, i32 65536, i32 -1, i32 -65536, i32 undef>, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 255, i32 -256, i32 65535, i32 -65536, i32 0, i32 -1, i32 -65535, i32 -65535, i32 2147483647, i32 2147483648, i32 65536, i32 -65535>)
   ret <8 x i64> %1
@@ -140,8 +131,7 @@ define <8 x i64> @fold_pmuludq_512(<16 x
 
 define <2 x i64> @fold_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @fold_pmuldq_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> <i32 undef, i32 undef, i32 -1, i32 undef>, <4 x i32> <i32 undef, i32 undef, i32 -2, i32 undef>)
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> <i64 0, i64 2>
 ;
   %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> <i32 undef, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 undef, i32 1, i32 -2, i32 3>)
   ret <2 x i64> %1
@@ -149,8 +139,7 @@ define <2 x i64> @fold_pmuldq_128(<4 x i
 
 define <4 x i64> @fold_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @fold_pmuldq_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> <i32 undef, i32 undef, i32 -65535, i32 undef, i32 65536, i32 undef, i32 -2147483648, i32 undef>, <8 x i32> <i32 0, i32 undef, i32 -65535, i32 undef, i32 2147483647, i32 undef, i32 65536, i32 undef>)
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i64> <i64 0, i64 4294836225, i64 140737488289792, i64 -140737488355328>
 ;
   %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> <i32 undef, i32 1, i32 -65535, i32 128, i32 65536, i32 2147483647, i32 -2147483648, i32 65536>, <8 x i32> <i32 0, i32 -1, i32 -65535, i32 -65535, i32 2147483647, i32 2147483648, i32 65536, i32 -65535>)
   ret <4 x i64> %1
@@ -158,8 +147,7 @@ define <4 x i64> @fold_pmuldq_256(<8 x i
 
 define <8 x i64> @fold_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: @fold_pmuldq_512(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>, <16 x i32> <i32 undef, i32 undef, i32 -3, i32 undef, i32 8, i32 undef, i32 -256, i32 undef, i32 undef, i32 undef, i32 -65535, i32 undef, i32 65536, i32 undef, i32 -2147483648, i32 undef>)
-; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
 ;
   %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> <i32 undef, i32 -1, i32 -3, i32 -1, i32 8, i32 10, i32 -256, i32 65536, i32 undef, i32 1, i32 -65535, i32 128, i32 65536, i32 2147483647, i32 -2147483648, i32 65536>)
   ret <8 x i64> %1