[llvm] r270729 - [x86] avoid code explosion from LoopVectorizer for gather loop (PR27826)
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Wed May 25 10:27:54 PDT 2016
Author: spatel
Date: Wed May 25 12:27:54 2016
New Revision: 270729
URL: http://llvm.org/viewvc/llvm-project?rev=270729&view=rev
Log:
[x86] avoid code explosion from LoopVectorizer for gather loop (PR27826)
By making pointer extraction from a vector more expensive in the cost model,
we avoid the vectorization of a loop that is very likely to be memory-bound:
https://llvm.org/bugs/show_bug.cgi?id=27826
There are still open bugs related to this, so we may need a more general solution
to avoid vectorizing obviously memory-bound loops when hardware gather support
is not available.
Differential Revision: http://reviews.llvm.org/D20601
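For reference, the loop from the bug report corresponds roughly to the C code
below. This is an illustration only, not part of the commit; the function and
parameter names simply mirror the new IR test added further down. With an
element stride of 8, each vector lane needs its own address, so without
hardware gather support the strided loads get scalarized and the loop stays
memory-bound:

    float PR27826(const float *a, const float *b, int n) {
      float s = 0.0f;
      // Stride-8 accesses: vectorizing this needs a gather (or full
      // scalarization of the loads), so it should not be profitable.
      for (long long i = 0; i < n; i += 8)
        s += a[i] + b[i];
      return s;
    }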
Modified:
llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/trunk/test/Transforms/LoopVectorize/X86/cost-model.ll
Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=270729&r1=270728&r2=270729&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Wed May 25 12:27:54 2016
@@ -963,6 +963,8 @@ int X86TTIImpl::getIntrinsicInstrCost(In
int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
assert(Val->isVectorTy() && "This must be a vector type");
+ Type *ScalarType = Val->getScalarType();
+
if (Index != -1U) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
@@ -976,11 +978,17 @@ int X86TTIImpl::getVectorInstrCost(unsig
Index = Index % Width;
// Floating point scalars are already located in index #0.
- if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
+ if (ScalarType->isFloatingPointTy() && Index == 0)
return 0;
}
- return BaseT::getVectorInstrCost(Opcode, Val, Index);
+ // Add to the base cost if we know that the extracted element of a vector is
+ // destined to be moved to and used in the integer register file.
+ int RegisterFileMoveCost = 0;
+ if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
+ RegisterFileMoveCost = 1;
+
+ return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}
int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
Modified: llvm/trunk/test/Transforms/LoopVectorize/X86/cost-model.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/cost-model.ll?rev=270729&r1=270728&r2=270729&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/cost-model.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/cost-model.ll Wed May 25 12:27:54 2016
@@ -39,3 +39,44 @@ for.body:
for.end: ; preds = %for.body
ret void
}
+
+; This function uses a stride that is generally too big to benefit from vectorization without
+; really good support for a gather load. We were not computing an accurate cost for the
+; vectorization and subsequent scalarization of the pointer induction variables.
+
+define float @PR27826(float* nocapture readonly %a, float* nocapture readonly %b, i32 %n) {
+; CHECK-LABEL: @PR27826(
+; CHECK-NOT: <4 x float>
+; CHECK-NOT: <8 x float>
+; CHECK: ret float %s.0.lcssa
+
+entry:
+ %cmp = icmp sgt i32 %n, 0
+ br i1 %cmp, label %preheader, label %for.end
+
+preheader:
+ %t0 = sext i32 %n to i64
+ br label %for
+
+for:
+ %indvars.iv = phi i64 [ 0, %preheader ], [ %indvars.iv.next, %for ]
+ %s.02 = phi float [ 0.0, %preheader ], [ %add4, %for ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ %t1 = load float, float* %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+ %t2 = load float, float* %arrayidx3, align 4
+ %add = fadd fast float %t1, %s.02
+ %add4 = fadd fast float %add, %t2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
+ %cmp1 = icmp slt i64 %indvars.iv.next, %t0
+ br i1 %cmp1, label %for, label %loopexit
+
+loopexit:
+ %add4.lcssa = phi float [ %add4, %for ]
+ br label %for.end
+
+for.end:
+ %s.0.lcssa = phi float [ 0.0, %entry ], [ %add4.lcssa, %loopexit ]
+ ret float %s.0.lcssa
+}
+
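To exercise the new CHECK lines locally, something along these lines should
work (the exact triple and CPU flags here are my assumption; the authoritative
RUN line sits at the top of cost-model.ll and is not shown in this hunk):

    opt < cost-model.ll -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S \
      | FileCheck cost-model.ll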