[llvm] r272395 - Reapply "[TTI] Refine default cost for interleaved load groups with gaps"

Fri Jun 10 07:33:31 PDT 2016

Author: mssimpso
Date: Fri Jun 10 09:33:30 2016
New Revision: 272395

URL: http://llvm.org/viewvc/llvm-project?rev=272395&view=rev
Log:
Reapply "[TTI] Refine default cost for interleaved load groups with gaps"

This reapplies commit r272385 with a fix. The build was failing when compiled
with gcc, but not with clang. With the fix, we now get the data layout from the
current TTI implementation, which will hopefully solve the issue.

Modified:
    llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/trunk/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll

Modified: llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h?rev=272395&r1=272394&r2=272395&view=diff
==============================================================================

--- llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h (original)
+++ llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h Fri Jun 10 09:33:30 2016
@@ -542,6 +542,51 @@ public:
     unsigned Cost = static_cast<T *>(this)->getMemoryOpCost(
         Opcode, VecTy, Alignment, AddressSpace);
 
+    // Legalize the vector type, and get the legalized and unlegalized type
+    // sizes.
+    MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+    unsigned VecTySize =
+        static_cast<T *>(this)->getDataLayout().getTypeStoreSize(VecTy);
+    unsigned VecTyLTSize = VecTyLT.getStoreSize();
+
+    // Return the ceiling of dividing A by B.
+    auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
+
+    // Scale the cost of the memory operation by the fraction of legalized
+    // instructions that will actually be used. We shouldn't account for the
+    // cost of dead instructions since they will be removed.
+    //
+    // E.g., An interleaved load of factor 8:
+    //       %vec = load <16 x i64>, <16 x i64>* %ptr
+    //       %v0 = shufflevector %vec, undef, <0, 8>
+    //
+    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
+    // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
+    // type). The other loads are unused.
+    //
+    // We only scale the cost of loads since interleaved store groups aren't
+    // allowed to have gaps.
+    if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
+
+      // The number of loads of a legal type it will take to represent a load
+      // of the unlegalized vector type.
+      unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);
+
+      // The number of elements of the unlegalized type that correspond to a
+      // single legal instruction.
+      unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts);
+
+      // Determine which legal instructions will be used.
+      BitVector UsedInsts(NumLegalInsts, false);
+      for (unsigned Index : Indices)
+        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
+          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
+
+      // Scale the cost of the load by the fraction of legal instructions that
+      // will be used.
+      Cost *= UsedInsts.count() / NumLegalInsts;
+    }
+
     // Then plus the cost of interleave operation.
     if (Opcode == Instruction::Load) {
       // The interleave cost is similar to extract sub vectors' elements

Modified: llvm/trunk/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll?rev=272395&r1=272394&r2=272395&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll Fri Jun 10 09:33:30 2016
@@ -14,6 +14,7 @@ entry:
 ; 8xi8 and 16xi8 are valid i8 vector types, so the cost of the interleaved
 ; access group is 2.
 
+; CHECK: LV: Checking a loop in "test_byte_interleaved_cost"
 ; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %tmp = load i8, i8* %arrayidx0, align 4
 ; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction:   %tmp = load i8, i8* %arrayidx0, align 4
 
@@ -37,3 +38,44 @@ for.body:
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+%ig.factor.8 = type { double*, double, double, double, double, double, double, double }
+define double @wide_interleaved_group(%ig.factor.8* %s, double %a, double %b, i32 %n) {
+entry:
+  br label %for.body
+
+; Check the default cost of a strided load with a factor that is greater than
+; the maximum allowed. In this test, the interleave factor would be 8, which is
+; not supported.
+
+; CHECK: LV: Checking a loop in "wide_interleaved_group"
+; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction:   %1 = load double, double* %0, align 8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %5 = load double, double* %4, align 8
+; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction:   store double %9, double* %10, align 8
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %r = phi double [ 0.000000e+00, %entry ], [ %12, %for.body ]
+  %0 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 2
+  %1 = load double, double* %0, align 8
+  %2 = fcmp fast olt double %1, %a
+  %3 = select i1 %2, double 0.000000e+00, double %1
+  %4 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 6
+  %5 = load double, double* %4, align 8
+  %6 = fcmp fast olt double %5, %a
+  %7 = select i1 %6, double 0.000000e+00, double %5
+  %8 = fmul fast double %7, %b
+  %9 = fadd fast double %8, %3
+  %10 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 3
+  store double %9, double* %10, align 8
+  %11 = fmul fast double %9, %9
+  %12 = fadd fast double %11, %r
+  %i.next = add nuw nsw i64 %i, 1
+  %13 = trunc i64 %i.next to i32
+  %cond = icmp eq i32 %13, %n
+  br i1 %cond, label %for.exit, label %for.body
+
+for.exit:
+  %r.lcssa = phi double [ %12, %for.body ]
+  ret double %r.lcssa
+}