[llvm] r233127 - optimize the AVX2 (integer) version of vperm2 into a shuffle

Tue Mar 24 15:39:30 PDT 2015

Author: spatel
Date: Tue Mar 24 17:39:29 2015
New Revision: 233127

URL: http://llvm.org/viewvc/llvm-project?rev=233127&view=rev
Log:
optimize the AVX2 (integer) version of vperm2 into a shuffle

...because this is what happens when an instruction
set puts its underwear on after its pants.

This is an extension of r232852, r233100, and r233110:
http://llvm.org/viewvc/llvm-project?view=revision&revision=232852
http://llvm.org/viewvc/llvm-project?view=revision&revision=233100
http://llvm.org/viewvc/llvm-project?view=revision&revision=233110


Modified:
    llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
    llvm/trunk/test/Transforms/InstCombine/x86-vperm2.ll

Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp?rev=233127&r1=233126&r2=233127&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp Tue Mar 24 17:39:29 2015
@@ -972,7 +972,7 @@ Instruction *InstCombiner::visitCallInst
   case Intrinsic::x86_avx_vperm2f128_pd_256:
   case Intrinsic::x86_avx_vperm2f128_ps_256:
   case Intrinsic::x86_avx_vperm2f128_si_256:
-    // TODO: Add the AVX2 version of this instruction.
+  case Intrinsic::x86_avx2_vperm2i128:
     if (Value *V = SimplifyX86vperm2(*II, *Builder))
       return ReplaceInstUsesWith(*II, V);
     break;

Modified: llvm/trunk/test/Transforms/InstCombine/x86-vperm2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/x86-vperm2.ll?rev=233127&r1=233126&r2=233127&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/x86-vperm2.ll (original)
+++ llvm/trunk/test/Transforms/InstCombine/x86-vperm2.ll Tue Mar 24 17:39:29 2015
@@ -12,7 +12,7 @@ define <4 x double> @perm2pd_non_const_i
 }
 
 
-; In the following 3 tests, both zero mask bits of the immediate are set.
+; In the following 4 tests, both zero mask bits of the immediate are set.
 
 define <4 x double> @perm2pd_0x88(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 136) 
@@ -38,6 +38,14 @@ define <8 x i32> @perm2si_0x88(<8 x i32>
 ; CHECK-NEXT:  ret <8 x i32> zeroinitializer
 }
 
+define <4 x i64> @perm2i_0x88(<4 x i64> %a0, <4 x i64> %a1) {
+  %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 136) 
+  ret <4 x i64> %res
+
+; CHECK-LABEL: @perm2i_0x88
+; CHECK-NEXT:  ret <4 x i64> zeroinitializer
+}
+
 
 ; The other control bits are ignored when zero mask bits of the immediate are set.
 
@@ -207,6 +215,18 @@ define <8 x float> @perm2ps_0x31(<8 x fl
 }
 
 
+; Confirm that the AVX2 version works the same.
+
+define <4 x i64> @perm2i_0x33(<4 x i64> %a0, <4 x i64> %a1) {
+  %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 51)
+  ret <4 x i64> %res
+
+; CHECK-LABEL: @perm2i_0x33
+; CHECK-NEXT:  %1 = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT:  ret <4 x i64> %1
+}
+
+
 ; Confirm that when a single zero mask bit is set, we replace a source vector with zeros.
 
 define <4 x double> @perm2pd_0x81(<4 x double> %a0, <4 x double> %a1) {
@@ -245,7 +265,19 @@ define <4 x double> @perm2pd_0x08(<4 x d
 ; CHECK-NEXT:  ret <4 x double>
 }
 
+; Check one more with the AVX2 version.
+
+define <4 x i64> @perm2i_0x28(<4 x i64> %a0, <4 x i64> %a1) {
+  %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 40)
+  ret <4 x i64> %res
+
+; CHECK-LABEL: @perm2i_0x28
+; CHECK-NEXT:  shufflevector <4 x i64> <i64 0{{.*}}, <4 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:  ret <4 x i64>
+}
+
 declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
 declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
 declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
+declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readnone