[llvm] r306974 - [X86][CM] update add/sub costs of 64-bit vectors on X86/SLM arch

Mohammed Agabaria via llvm-commits llvm-commits at lists.llvm.org
Sun Jul 2 05:16:15 PDT 2017


Author: magabari
Date: Sun Jul  2 05:16:15 2017
New Revision: 306974

URL: http://llvm.org/viewvc/llvm-project?rev=306974&view=rev
Log:
[X86][CM] update add/sub costs of 64-bit vectors on X86/SLM arch

This patch updates the cost of addq/subq (add/subtract of vectors of
64-bit elements), based on performance measurements on the SLM arch.
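
For reference, the new v2i64 multiply cost follows directly from the
custom lowering and the measured throughputs quoted in the updated code
comment below: 3 multiplies x 2 (muldq) + 3 shifts x 1 + 2 adds x 4
(addq) = 17. The per-operation costs can be checked with the cost-model
analysis, e.g. (exact flags are an assumption, modeled on the RUN lines
of the tests touched here):

    opt < test/Analysis/CostModel/X86/slm-arith-costs.ll \
        -cost-model -analyze -mcpu=slm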

Differential Revision: https://reviews.llvm.org/D33983


Added:
    llvm/trunk/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
Modified:
    llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/trunk/test/Analysis/CostModel/X86/slm-arith-costs.ll

Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=306974&r1=306973&r2=306974&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Sun Jul  2 05:16:15 2017
@@ -142,10 +142,15 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::FDIV, MVT::v2f64, 69 }, // divpd
     { ISD::FADD, MVT::v2f64, 2  }, // addpd
     { ISD::FSUB, MVT::v2f64, 2  }, // subpd
-    // v2i64/v4i64 mul is custom lowered as a series of long
-    // multiplies(3), shifts(3) and adds(2).
-    // slm muldq version throughput is 2
-    { ISD::MUL,  MVT::v2i64, 11 },
+    // v2i64/v4i64 mul is custom lowered as a series of long
+    // multiplies(3), shifts(3) and adds(2).
+    // slm muldq throughput is 2 and addq throughput is 4, thus:
+    // 3X2 (muldq throughput) + 3X1 (shift throughput) +
+    // 2X4 (addq throughput) = 17
+    { ISD::MUL,  MVT::v2i64, 17 },
+    // slm addq/subq throughput is 4
+    { ISD::ADD,  MVT::v2i64, 4  },
+    { ISD::SUB,  MVT::v2i64, 4  },
   };
 
   if (ST->isSLM()) {
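
For context, the entries above live in the SLM-specific cost table that
the "if (ST->isSLM())" block consults. A minimal sketch of that lookup
(assuming the table name SLMCostTable and the type-legalization pair LT
used by the surrounding code; CostTableLookup is LLVM's generic
cost-table helper):

    // LT = getTypeLegalizationCost(DL, Ty), where:
    //   LT.first  = number of legal-type ops the IR op is split into,
    //   LT.second = the legalized MVT (e.g. MVT::v2i64).
    if (ST->isSLM())
      if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
        return LT.first * Entry->Cost; // e.g. <4 x i64> mul: 2 * 17 = 34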

Modified: llvm/trunk/test/Analysis/CostModel/X86/slm-arith-costs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/slm-arith-costs.ll?rev=306974&r1=306973&r2=306974&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/slm-arith-costs.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/slm-arith-costs.ll Sun Jul  2 05:16:15 2017
@@ -3,6 +3,20 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
+define <2 x i64> @slm-costs_64_vector_add(<2 x i64> %a, <2 x i64> %b) {
+entry:
+; SLM:  cost of 4 {{.*}} add <2 x i64>
+  %res = add <2 x i64> %a, %b
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @slm-costs_64_vector_sub(<2 x i64> %a, <2 x i64> %b) {
+entry:
+; SLM:  cost of 4 {{.*}} sub <2 x i64>
+  %res = sub <2 x i64> %a, %b
+  ret <2 x i64> %res
+}
+
 ; 8bit mul
 define i8 @slm-costs_8_scalar_mul(i8 %a, i8 %b)  {
 entry:
@@ -13,7 +27,7 @@ entry:
 
 define <2 x i8> @slm-costs_8_v2_mul(<2 x i8> %a, <2 x i8> %b)  {
 entry:
-; SLM:  cost of 11 {{.*}} mul nsw <2 x i8>
+; SLM:  cost of 17 {{.*}} mul nsw <2 x i8>
   %res = mul nsw <2 x i8> %a, %b
   ret <2 x i8> %res
 }
@@ -97,7 +111,7 @@ entry:
 
 define <2 x i16> @slm-costs_16_v2_mul(<2 x i16> %a, <2 x i16> %b)  {
 entry:
-; SLM:  cost of 11 {{.*}} mul nsw <2 x i16>
+; SLM:  cost of 17 {{.*}} mul nsw <2 x i16>
   %res = mul nsw <2 x i16> %a, %b
   ret <2 x i16> %res
 }
@@ -181,7 +195,7 @@ entry:
 
 define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b)  {
 entry:
-; SLM:  cost of 11 {{.*}} mul nsw <2 x i32>
+; SLM:  cost of 17 {{.*}} mul nsw <2 x i32>
   %res = mul nsw <2 x i32> %a, %b
   ret <2 x i32> %res
 }
@@ -217,28 +231,28 @@ entry:
 
 define <2 x i64> @slm-costs_64_v2_mul(<2 x i64> %a, <2 x i64> %b)  {
 entry:
-; SLM:  cost of 11 {{.*}} mul nsw <2 x i64>
+; SLM:  cost of 17 {{.*}} mul nsw <2 x i64>
   %res = mul nsw <2 x i64> %a, %b
   ret <2 x i64> %res
 }
 
 define <4 x i64> @slm-costs_64_v4_mul(<4 x i64> %a, <4 x i64> %b)  {
 entry:
-; SLM:  cost of 22 {{.*}} mul nsw <4 x i64>
+; SLM:  cost of 34 {{.*}} mul nsw <4 x i64>
   %res = mul nsw <4 x i64> %a, %b
   ret <4 x i64> %res
 }
 
 define <8 x i64> @slm-costs_64_v8_mul(<8 x i64> %a, <8 x i64> %b)  {
 entry:
-; SLM:  cost of 44 {{.*}} mul nsw <8 x i64>
+; SLM:  cost of 68 {{.*}} mul nsw <8 x i64>
   %res = mul nsw <8 x i64> %a, %b
   ret <8 x i64> %res
 }
 
 define <16 x i64> @slm-costs_64_v16_mul(<16 x i64> %a, <16 x i64> %b)  {
 entry:
-; SLM:  cost of 88 {{.*}} mul nsw <16 x i64>
+; SLM:  cost of 136 {{.*}} mul nsw <16 x i64>
   %res = mul nsw <16 x i64> %a, %b
   ret <16 x i64> %res
 }
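
The widened cases above follow from type legalization: <4 x i64>,
<8 x i64> and <16 x i64> are split into 2, 4 and 8 <2 x i64> multiplies,
so the expected costs are 2 x 17 = 34, 4 x 17 = 68 and 8 x 17 = 136,
matching the updated CHECK lines.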

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll?rev=306974&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll Sun Jul  2 05:16:15 2017
@@ -0,0 +1,48 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -mcpu=slm -debug 2>&1 | FileCheck -check-prefix=MSG %s
+; This test should not be vectorized on the X86/SLM arch.
+; Vectorizing the 64-bit multiply here is wrong, since it could be
+; done at a narrower bit width (note that the sources are 16-bit).
+; Also, addq/subq (quad-word add/sub) have a high cost on the SLM arch.
+; If vectorized on SLM, this test regresses by about 70% in performance.
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @no_vec(i32 %LastIndex, i16* nocapture readonly %InputData, i16 signext %lag, i16 signext %Scale) {
+entry:
+; MSG: LV: Selecting VF: 1. 
+  %cmp17 = icmp sgt i32 %LastIndex, 0
+  br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv5 = sext i16 %Scale to i64
+  %sh_prom = and i64 %conv5, 4294967295
+  %0 = sext i16 %lag to i64
+  %wide.trip.count = zext i32 %LastIndex to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %conv8 = trunc i64 %add7 to i32
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %Accumulator.0.lcssa = phi i32 [ 0, %entry ], [ %conv8, %for.cond.cleanup.loopexit ]
+  ret i32 %Accumulator.0.lcssa
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %Accumulator.018 = phi i64 [ 0, %for.body.lr.ph ], [ %add7, %for.body ]
+  %arrayidx = getelementptr inbounds i16, i16* %InputData, i64 %indvars.iv
+  %1 = load i16, i16* %arrayidx, align 2
+  %conv = sext i16 %1 to i64
+  %2 = add nsw i64 %indvars.iv, %0
+  %arrayidx3 = getelementptr inbounds i16, i16* %InputData, i64 %2
+  %3 = load i16, i16* %arrayidx3, align 2 
+  %conv4 = sext i16 %3 to i64
+  %mul = mul nsw i64 %conv4, %conv
+  %shr = ashr i64 %mul, %sh_prom
+  %add7 = add i64 %shr, %Accumulator.018 
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
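
For reference, the IR above corresponds roughly to the following C++
(a hand-written reconstruction from the IR, not the test's original
source):

    #include <cstdint>

    // Hypothetical reconstruction of the loop behind no_vec(): a 16-bit
    // kernel whose products and accumulator are 64-bit, which is what
    // tempts the vectorizer into the expensive v2i64 mul/add operations.
    int32_t no_vec(int32_t LastIndex, const int16_t *InputData,
                   int16_t lag, int16_t Scale) {
      int64_t Accumulator = 0;
      for (int32_t i = 0; i < LastIndex; ++i)
        Accumulator +=
            ((int64_t)InputData[i] * InputData[i + lag]) >> Scale;
      return (int32_t)Accumulator;
    }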



