[llvm] r291122 - [CostModel][X86] Add support for broadcast shuffle costs

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 5 07:56:09 PST 2017


Author: rksimon
Date: Thu Jan  5 09:56:08 2017
New Revision: 291122

URL: http://llvm.org/viewvc/llvm-project?rev=291122&view=rev
Log:
[CostModel][X86] Add support for broadcast shuffle costs

Currently this only covers broadcasts whose input and output vectors have the same width.

Differential Revision: https://reviews.llvm.org/D27811

Modified:
    llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/trunk/test/Analysis/CostModel/X86/shuffle-broadcast.ll

Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=291122&r1=291121&r2=291122&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Thu Jan  5 09:56:08 2017
@@ -605,7 +605,14 @@ int X86TTIImpl::getShuffleCost(TTI::Shuf
   // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
 
-  if (Kind == TTI::SK_Reverse || Kind == TTI::SK_Alternate) {
+  if (Kind == TTI::SK_Reverse || Kind == TTI::SK_Alternate ||
+      Kind == TTI::SK_Broadcast) {
+    // For Broadcasts we are splatting the first element from the first input
+    // register, so we only need to reference that input, and all the output
+    // registers are the same.
+    if (Kind == TTI::SK_Broadcast)
+      LT.first = 1;
+
     static const CostTblEntry AVX512VBMIShuffleTbl[] = {
       { TTI::SK_Reverse, MVT::v64i8,  1 }, // vpermb
       { TTI::SK_Reverse, MVT::v32i8,  1 }  // vpermb
@@ -617,10 +624,13 @@ int X86TTIImpl::getShuffleCost(TTI::Shuf
         return LT.first * Entry->Cost;
 
     static const CostTblEntry AVX512BWShuffleTbl[] = {
-      { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
-      { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
-      { TTI::SK_Reverse, MVT::v64i8,  6 }  // vextracti64x4 + 2*vperm2i128
-                                           // + 2*pshufb + vinserti64x4
+      { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw
+      { TTI::SK_Broadcast, MVT::v64i8,  1 }, // vpbroadcastb
+
+      { TTI::SK_Reverse,   MVT::v32i16, 1 }, // vpermw
+      { TTI::SK_Reverse,   MVT::v16i16, 1 }, // vpermw
+      { TTI::SK_Reverse,   MVT::v64i8,  6 }  // vextracti64x4 + 2*vperm2i128
+                                             // + 2*pshufb + vinserti64x4
     };
 
     if (ST->hasBWI())
@@ -629,10 +639,15 @@ int X86TTIImpl::getShuffleCost(TTI::Shuf
         return LT.first * Entry->Cost;
 
     static const CostTblEntry AVX512ShuffleTbl[] = {
-      { TTI::SK_Reverse, MVT::v8f64,  1 }, // vpermpd
-      { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
-      { TTI::SK_Reverse, MVT::v8i64,  1 }, // vpermq
-      { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd
+      { TTI::SK_Broadcast, MVT::v8f64,  1 }, // vbroadcastsd
+      { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastss
+      { TTI::SK_Broadcast, MVT::v8i64,  1 }, // vpbroadcastq
+      { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd
+
+      { TTI::SK_Reverse,   MVT::v8f64,  1 }, // vpermpd
+      { TTI::SK_Reverse,   MVT::v16f32, 1 }, // vpermps
+      { TTI::SK_Reverse,   MVT::v8i64,  1 }, // vpermq
+      { TTI::SK_Reverse,   MVT::v16i32, 1 }  // vpermd
     };
 
     if (ST->hasAVX512())
@@ -641,6 +656,13 @@ int X86TTIImpl::getShuffleCost(TTI::Shuf
         return LT.first * Entry->Cost;
 
     static const CostTblEntry AVX2ShuffleTbl[] = {
+      { TTI::SK_Broadcast, MVT::v4f64,  1 }, // vbroadcastsd
+      { TTI::SK_Broadcast, MVT::v8f32,  1 }, // vbroadcastss
+      { TTI::SK_Broadcast, MVT::v4i64,  1 }, // vpbroadcastq
+      { TTI::SK_Broadcast, MVT::v8i32,  1 }, // vpbroadcastd
+      { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
+      { TTI::SK_Broadcast, MVT::v32i8,  1 }, // vpbroadcastb
+
       { TTI::SK_Reverse,   MVT::v4f64,  1 }, // vpermpd
       { TTI::SK_Reverse,   MVT::v8f32,  1 }, // vpermps
       { TTI::SK_Reverse,   MVT::v4i64,  1 }, // vpermq
@@ -657,6 +679,13 @@ int X86TTIImpl::getShuffleCost(TTI::Shuf
         return LT.first * Entry->Cost;
 
     static const CostTblEntry AVX1ShuffleTbl[] = {
+      { TTI::SK_Broadcast, MVT::v4f64,  2 }, // vperm2f128 + vpermilpd
+      { TTI::SK_Broadcast, MVT::v8f32,  2 }, // vperm2f128 + vpermilps
+      { TTI::SK_Broadcast, MVT::v4i64,  2 }, // vperm2f128 + vpermilpd
+      { TTI::SK_Broadcast, MVT::v8i32,  2 }, // vperm2f128 + vpermilps
+      { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
+      { TTI::SK_Broadcast, MVT::v32i8,  2 }, // vpshufb + vinsertf128
+
       { TTI::SK_Reverse,   MVT::v4f64,  2 }, // vperm2f128 + vpermilpd
       { TTI::SK_Reverse,   MVT::v8f32,  2 }, // vperm2f128 + vpermilps
       { TTI::SK_Reverse,   MVT::v4i64,  2 }, // vperm2f128 + vpermilpd
@@ -692,6 +721,9 @@ int X86TTIImpl::getShuffleCost(TTI::Shuf
         return LT.first * Entry->Cost;
 
     static const CostTblEntry SSSE3ShuffleTbl[] = {
+      { TTI::SK_Broadcast, MVT::v8i16,  1 }, // pshufb
+      { TTI::SK_Broadcast, MVT::v16i8,  1 }, // pshufb
+
       { TTI::SK_Reverse,   MVT::v8i16,  1 }, // pshufb
       { TTI::SK_Reverse,   MVT::v16i8,  1 }, // pshufb
 
@@ -704,6 +736,12 @@ int X86TTIImpl::getShuffleCost(TTI::Shuf
         return LT.first * Entry->Cost;
 
     static const CostTblEntry SSE2ShuffleTbl[] = {
+      { TTI::SK_Broadcast, MVT::v2f64,  1 }, // shufpd
+      { TTI::SK_Broadcast, MVT::v2i64,  1 }, // pshufd
+      { TTI::SK_Broadcast, MVT::v4i32,  1 }, // pshufd
+      { TTI::SK_Broadcast, MVT::v8i16,  2 }, // pshuflw  + pshufd
+      { TTI::SK_Broadcast, MVT::v16i8,  3 }, // unpck + pshuflw + pshufd
+
       { TTI::SK_Reverse,   MVT::v2f64,  1 }, // shufpd
       { TTI::SK_Reverse,   MVT::v2i64,  1 }, // pshufd
       { TTI::SK_Reverse,   MVT::v4i32,  1 }, // pshufd
@@ -723,6 +761,7 @@ int X86TTIImpl::getShuffleCost(TTI::Shuf
         return LT.first * Entry->Cost;
 
     static const CostTblEntry SSE1ShuffleTbl[] = {
+      { TTI::SK_Broadcast, MVT::v4f32,  1 }, // shufps
       { TTI::SK_Reverse,   MVT::v4f32,  1 }, // shufps
       { TTI::SK_Alternate, MVT::v4f32,  2 }  // 2*shufps
     };

Modified: llvm/trunk/test/Analysis/CostModel/X86/shuffle-broadcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/shuffle-broadcast.ll?rev=291122&r1=291121&r2=291122&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/shuffle-broadcast.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/shuffle-broadcast.ll Thu Jan  5 09:56:08 2017
@@ -18,14 +18,150 @@ define void @test_vXf64(<2 x double> %sr
   %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer
 
   ; SSE: cost of 1 {{.*}} %V256 = shufflevector
-  ; AVX: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
   ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
   %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer
 
   ; SSE: cost of 1 {{.*}} %V512 = shufflevector
-  ; AVX: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
   ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
   %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer
 
   ret void
 }
+
+; CHECK-LABEL: 'test_vXi64'
+define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXf32'
+define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
+  ; SSE: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V64 = shufflevector
+  %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXi32'
+define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512) {
+  ; SSE: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V64 = shufflevector
+  %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXi16'
+define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) {
+  ; SSE2: cost of 2 {{.*}} %V128 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
+
+  ; SSE2: cost of 2 {{.*}} %V256 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 3 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
+
+  ; SSE2: cost of 2 {{.*}} %V512 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 3 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXi8'
+define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
+  ; SSE2: cost of 3 {{.*}} %V128 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
+
+  ; SSE2: cost of 3 {{.*}} %V256 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
+
+  ; SSE2: cost of 3 {{.*}} %V512 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
+
+  ret void
+}




More information about the llvm-commits mailing list