[llvm] r205159 - [X86] Adjust cost of FP_TO_UINT v8f32->v8i32

Sun Mar 30 11:07:13 PDT 2014

Author: anemet
Date: Sun Mar 30 13:07:13 2014
New Revision: 205159

URL: http://llvm.org/viewvc/llvm-project?rev=205159&view=rev
Log:
[X86] Adjust cost of FP_TO_UINT v8f32->v8i32

There is no direct AVX instruction to convert to unsigned.  I have some ideas
how we may be able to do this with three vector instructions but the current
backend just bails on this to get it scalarized.

See the comment why we need to adjust the cost returned by BasicTTI.

The test is a bit roundabout (and checks assembly rather than bit code) because
I'd like it to work even if at some point we could vectorize this conversion.

Fixes <rdar://problem/16371920>

Added:
    llvm/trunk/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll
Modified:
    llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp

Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=205159&r1=205158&r2=205159&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Sun Mar 30 13:07:13 2014
@@ -522,6 +522,12 @@ unsigned X86TTI::getCastInstrCost(unsign
 
     { ISD::FP_TO_SINT,  MVT::v8i8,  MVT::v8f32, 7 },
     { ISD::FP_TO_SINT,  MVT::v4i8,  MVT::v4f32, 1 },
+    // This node is expanded into scalarized operations but BasicTTI is overly
+    // optimistic estimating its cost.  It computes 3 per element (one
+    // vector-extract, one scalar conversion and one vector-insert).  The
+    // problem is that the inserts form a read-modify-write chain so latency
+    // should be factored in too.  Inflating the cost per element by 1.
+    { ISD::FP_TO_UINT,  MVT::v8i32, MVT::v8f32, 8*4 },
   };
 
   if (ST->hasAVX2()) {

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll?rev=205159&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll Sun Mar 30 13:07:13 2014
@@ -0,0 +1,39 @@
+; RUN: opt < %s -mcpu=core-avx2 -loop-vectorize -S | llc -mcpu=core-avx2 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+ at float_array = common global [10000 x float] zeroinitializer, align 16
+ at unsigned_array = common global [10000 x i32] zeroinitializer, align 16
+
+; If we need to scalarize the fptoui and then use inserts to build up the
+; vector again, then there is certainly no value in going 256-bit wide.
+; CHECK-NOT: vinserti128
+
+define void @convert(i32 %N) {
+entry:
+  %0 = icmp eq i32 %N, 0
+  br i1 %0, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds [10000 x float]* @float_array, i64 0, i64 %indvars.iv
+  %1 = load float* %arrayidx, align 4
+  %conv = fptoui float %1 to i32
+  %arrayidx2 = getelementptr inbounds [10000 x i32]* @unsigned_array, i64 0, i64 %indvars.iv
+  store i32 %conv, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+