[llvm] r272385 - [TTI] Refine default cost for interleaved load groups with gaps

Fri Jun 10 04:27:55 PDT 2016

Author: mssimpso
Date: Fri Jun 10 06:27:51 2016
New Revision: 272385

URL: http://llvm.org/viewvc/llvm-project?rev=272385&view=rev
Log:
[TTI] Refine default cost for interleaved load groups with gaps

This patch refines the default cost for interleaved load groups having gaps. If
a load group has gaps, the legalized instructions corresponding to the unused
elements will be dead. Thus, we don't need to account for them in the cost
model. Instead, we only need to account for the fraction of legalized loads
that will actually be used.

Differential Revision: http://reviews.llvm.org/D20873

Modified:
    llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/trunk/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll

Modified: llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h?rev=272385&r1=272384&r2=272385&view=diff
==============================================================================

--- llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h (original)
+++ llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h Fri Jun 10 06:27:51 2016
@@ -542,6 +542,50 @@ public:
     unsigned Cost = static_cast<T *>(this)->getMemoryOpCost(
         Opcode, VecTy, Alignment, AddressSpace);
 
+    // Legalize the vector type, and get the legalized and unlegalized type
+    // sizes.
+    MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+    unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+    unsigned VecTyLTSize = VecTyLT.getStoreSize();
+
+    // Return the ceiling of dividing A by B.
+    auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
+
+    // Scale the cost of the memory operation by the fraction of legalized
+    // instructions that will actually be used. We shouldn't account for the
+    // cost of dead instructions since they will be removed.
+    //
+    // E.g., An interleaved load of factor 8:
+    //       %vec = load <16 x i64>, <16 x i64>* %ptr
+    //       %v0 = shufflevector %vec, undef, <0, 8>
+    //
+    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
+    // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
+    // type). The other loads are unused.
+    //
+    // We only scale the cost of loads since interleaved store groups aren't
+    // allowed to have gaps.
+    if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
+
+      // The number of loads of a legal type it will take to represent a load
+      // of the unlegalized vector type.
+      unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);
+
+      // The number of elements of the unlegalized type that correspond to a
+      // single legal instruction.
+      unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts);
+
+      // Determine which legal instructions will be used.
+      BitVector UsedInsts(NumLegalInsts, false);
+      for (unsigned Index : Indices)
+        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
+          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
+
+      // Scale the load cost by the used fraction, rounding up. Multiply first:
+      // UsedInsts.count() / NumLegalInsts alone truncates to 0 unless all used.
+      Cost = ceil(Cost * UsedInsts.count(), NumLegalInsts);
+    }
+
     // Then plus the cost of interleave operation.
     if (Opcode == Instruction::Load) {
       // The interleave cost is similar to extract sub vectors' elements

Modified: llvm/trunk/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll?rev=272385&r1=272384&r2=272385&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll Fri Jun 10 06:27:51 2016
@@ -14,6 +14,7 @@ entry:
 ; 8xi8 and 16xi8 are valid i8 vector types, so the cost of the interleaved
 ; access group is 2.
 
+; CHECK: LV: Checking a loop in "test_byte_interleaved_cost"
 ; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %tmp = load i8, i8* %arrayidx0, align 4
 ; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction:   %tmp = load i8, i8* %arrayidx0, align 4
 
@@ -37,3 +38,44 @@ for.body:
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+%ig.factor.8 = type { double*, double, double, double, double, double, double, double }
+define double @wide_interleaved_group(%ig.factor.8* %s, double %a, double %b, i32 %n) {
+entry:
+  br label %for.body
+
+; Check the default cost of a strided load group that has gaps: only fields 2
+; and 6 of the 8-field %ig.factor.8 are loaded (field 3 is stored), so the
+; interleave factor would be 8, which exceeds the maximum that is supported.
+
+; CHECK: LV: Checking a loop in "wide_interleaved_group"
+; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction:   %1 = load double, double* %0, align 8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %5 = load double, double* %4, align 8
+; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction:   store double %9, double* %10, align 8
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %r = phi double [ 0.000000e+00, %entry ], [ %12, %for.body ]
+  %0 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 2
+  %1 = load double, double* %0, align 8
+  %2 = fcmp fast olt double %1, %a
+  %3 = select i1 %2, double 0.000000e+00, double %1
+  %4 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 6
+  %5 = load double, double* %4, align 8
+  %6 = fcmp fast olt double %5, %a
+  %7 = select i1 %6, double 0.000000e+00, double %5
+  %8 = fmul fast double %7, %b
+  %9 = fadd fast double %8, %3
+  %10 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 3
+  store double %9, double* %10, align 8
+  %11 = fmul fast double %9, %9
+  %12 = fadd fast double %11, %r
+  %i.next = add nuw nsw i64 %i, 1
+  %13 = trunc i64 %i.next to i32
+  %cond = icmp eq i32 %13, %n
+  br i1 %cond, label %for.exit, label %for.body
+
+for.exit:
+  %r.lcssa = phi double [ %12, %for.body ]
+  ret double %r.lcssa
+}