[llvm] r368439 - Title: Loop Cache Analysis

Whitney Tsang via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 9 06:56:29 PDT 2019


Author: whitneyt
Date: Fri Aug  9 06:56:29 2019
New Revision: 368439

URL: http://llvm.org/viewvc/llvm-project?rev=368439&view=rev
Log:
Title: Loop Cache Analysis
Summary: Implement a new analysis to estimate the number of cache lines
required by a loop nest.
The analysis is largely based on the following paper:

Compiler Optimizations for Improving Data Locality
By: Steve Carr, Katherine S. McKinley, Chau-Wen Tseng
http://www.cs.utexas.edu/users/mckinley/papers/asplos-1994.pdf
The analysis considers temporal reuse (accesses to the same memory
location) and spatial reuse (accesses to memory locations within a cache
line). For simplicity the analysis considers memory accesses in the
innermost loop in a loop nest, and thus determines the number of cache
lines used when the loop L in loop nest LN is placed in the innermost
position.

The result of the analysis can be used to drive several transformations.
As an example, loop interchange could use it to determine which loops in a
perfect loop nest should be interchanged to maximize cache reuse.
Similarly, loop distribution could be enhanced to take into
consideration cache reuse between arrays when distributing a loop to
eliminate vectorization inhibiting dependencies.

The general approach taken to estimate the number of cache lines used by
the memory references in the inner loop of a loop nest is:

Partition memory references that exhibit temporal or spatial reuse into
reference groups.
For each loop L in the loop nest LN: a. Compute the cost of the
reference group b. Compute the 'cache cost' of the loop nest by summing
up the reference groups costs
For further details of the algorithm please refer to the paper.
Authored By: etiotto
Reviewers: hfinkel, Meinersbur, jdoerfert, kbarton, bmahjour, anemet,
fhahn
Reviewed By: Meinersbur
Subscribers: reames, nemanjai, MaskRay, wuzish, Hahnfeld, xusx595,
venkataramanan.kumar.llvm, greened, dmgreen, steleman, fhahn, xblvaOO,
Whitney, mgorny, hiraditya, mgrang, jsji, llvm-commits
Tag: LLVM
Differential Revision: https://reviews.llvm.org/D63459

Added:
    llvm/trunk/include/llvm/Analysis/LoopCacheAnalysis.h
    llvm/trunk/lib/Analysis/LoopCacheAnalysis.cpp
    llvm/trunk/test/Analysis/LoopCacheAnalysis/
    llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/
    llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/lit.local.cfg
    llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll
    llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll
    llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
    llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
    llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll
Modified:
    llvm/trunk/lib/Analysis/CMakeLists.txt
    llvm/trunk/lib/Passes/PassBuilder.cpp
    llvm/trunk/lib/Passes/PassRegistry.def

Added: llvm/trunk/include/llvm/Analysis/LoopCacheAnalysis.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/LoopCacheAnalysis.h?rev=368439&view=auto
==============================================================================
--- llvm/trunk/include/llvm/Analysis/LoopCacheAnalysis.h (added)
+++ llvm/trunk/include/llvm/Analysis/LoopCacheAnalysis.h Fri Aug  9 06:56:29 2019
@@ -0,0 +1,278 @@
+//===- llvm/Analysis/LoopCacheAnalysis.h ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the interface for the loop cache analysis.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_LOOPCACHEANALYSIS_H
+#define LLVM_ANALYSIS_LOOPCACHEANALYSIS_H
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class LPMUpdater;
+using CacheCostTy = int64_t;
+using LoopVectorTy = SmallVector<Loop *, 8>;
+
+/// Represents a memory reference as a base pointer and a set of indexing
+/// operations. For example given the array reference A[i][2j+1][3k+2] in a
+/// 3-dim loop nest:
+///   for(i=0;i<n;++i)
+///     for(j=0;j<m;++j)
+///       for(k=0;k<o;++k)
+///         ... A[i][2j+1][3k+2] ...
+/// We expect:
+///   BasePointer -> A
+///   Subscripts -> [{0,+,1}<%for.i>][{1,+,2}<%for.j>][{2,+,3}<%for.k>]
+///   Sizes -> [m][o][4]
+class IndexedReference {
+  friend raw_ostream &operator<<(raw_ostream &OS, const IndexedReference &R);
+
+public:
+  /// Construct an indexed reference given a \p StoreOrLoadInst instruction.
+  IndexedReference(Instruction &StoreOrLoadInst, const LoopInfo &LI,
+                   ScalarEvolution &SE);
+
+  bool isValid() const { return IsValid; }
+  const SCEV *getBasePointer() const { return BasePointer; }
+  size_t getNumSubscripts() const { return Subscripts.size(); }
+  const SCEV *getSubscript(unsigned SubNum) const {
+    assert(SubNum < getNumSubscripts() && "Invalid subscript number");
+    return Subscripts[SubNum];
+  }
+  const SCEV *getFirstSubscript() const {
+    assert(!Subscripts.empty() && "Expecting non-empty container");
+    return Subscripts.front();
+  }
+  const SCEV *getLastSubscript() const {
+    assert(!Subscripts.empty() && "Expecting non-empty container");
+    return Subscripts.back();
+  }
+
+  /// Return true/false if the current object and the indexed reference \p Other
+  /// are/aren't in the same cache line of size \p CLS. Two references are in
+/// the same cache line iff the distance between them in the innermost
+  /// dimension is less than the cache line size. Return None if unsure.
+  Optional<bool> hasSpacialReuse(const IndexedReference &Other, unsigned CLS,
+                                 AliasAnalysis &AA) const;
+
+  /// Return true if the current object and the indexed reference \p Other
+  /// have distance smaller than \p MaxDistance in the dimension associated with
+  /// the given loop \p L. Return false if the distance is not smaller than \p
+  /// MaxDistance and None if unsure.
+  Optional<bool> hasTemporalReuse(const IndexedReference &Other,
+                                  unsigned MaxDistance, const Loop &L,
+                                  DependenceInfo &DI, AliasAnalysis &AA) const;
+
+  /// Compute the cost of the reference w.r.t. the given loop \p L when it is
+  /// considered in the innermost position in the loop nest.
+  /// The cost is defined as:
+  ///   - equal to one if the reference is loop invariant, or
+  ///   - equal to '(TripCount * stride) / cache_line_size' if:
+  ///     + the reference stride is less than the cache line size, and
+  ///     + the coefficient of this loop's index variable used in all other
+  ///       subscripts is zero
+  ///   - or otherwise equal to 'TripCount'.
+  CacheCostTy computeRefCost(const Loop &L, unsigned CLS) const;
+
+private:
+  /// Attempt to delinearize the indexed reference.
+  bool delinearize(const LoopInfo &LI);
+
+  /// Return true if the index reference is invariant with respect to loop \p L.
+  bool isLoopInvariant(const Loop &L) const;
+
+  /// Return true if the indexed reference is 'consecutive' in loop \p L.
+  /// An indexed reference is 'consecutive' if the only coefficient that uses
+  /// the loop induction variable is the rightmost one, and the access stride is
+  /// smaller than the cache line size \p CLS.
+  bool isConsecutive(const Loop &L, unsigned CLS) const;
+
+  /// Return the coefficient used in the rightmost dimension.
+  const SCEV *getLastCoefficient() const;
+
+  /// Return true if the coefficient corresponding to induction variable of
+  /// loop \p L in the given \p Subscript is zero or is loop invariant in \p L.
+  bool isCoeffForLoopZeroOrInvariant(const SCEV &Subscript,
+                                     const Loop &L) const;
+
+  /// Verify that the given \p Subscript is 'well formed' (must be a simple add
+  /// recurrence).
+  bool isSimpleAddRecurrence(const SCEV &Subscript, const Loop &L) const;
+
+  /// Return true if the given reference \p Other is definitely aliased with
+  /// the indexed reference represented by this class.
+  bool isAliased(const IndexedReference &Other, AliasAnalysis &AA) const;
+
+private:
+  /// True if the reference can be delinearized, false otherwise.
+  bool IsValid = false;
+
+  /// Represent the memory reference instruction.
+  Instruction &StoreOrLoadInst;
+
+  /// The base pointer of the memory reference.
+  const SCEV *BasePointer = nullptr;
+
+  /// The subscript (indexes) of the memory reference.
+  SmallVector<const SCEV *, 3> Subscripts;
+
+  /// The dimensions of the memory reference.
+  SmallVector<const SCEV *, 3> Sizes;
+
+  ScalarEvolution &SE;
+};
+
+/// A reference group represents a set of memory references that exhibit
+/// temporal or spatial reuse. Two references belong to the same
+/// reference group with respect to an inner loop L iff:
+/// 1. they have a loop independent dependency, or
+/// 2. they have a loop carried dependence with a small dependence distance
+///    (e.g. less than 2) carried by the inner loop, or
+/// 3. they refer to the same array, and the subscript in their innermost
+///    dimension is less than or equal to 'd' (where 'd' is less than the cache
+///    line size)
+///
+/// Intuitively a reference group represents memory references that access
+/// the same cache line. Conditions 1,2 above account for temporal reuse, while
+/// condition 3 accounts for spatial reuse.
+using ReferenceGroupTy = SmallVector<std::unique_ptr<IndexedReference>, 8>;
+using ReferenceGroupsTy = SmallVector<ReferenceGroupTy, 8>;
+
+/// \c CacheCost represents the estimated cost of an inner loop as the number of
+/// cache lines used by the memory references it contains.
+/// The 'cache cost' of a loop 'L' in a loop nest 'LN' is computed as the sum of
+/// the cache costs of all of its reference groups when the loop is considered
+/// to be in the innermost position in the nest.
+/// A reference group represents memory references that fall into the same cache
+/// line. Each reference group is analysed with respect to the innermost loop in
+/// a loop nest. The cost of a reference is defined as follow:
+///  - one if it is loop invariant w.r.t the innermost loop,
+///  - equal to the loop trip count divided by the cache line times the
+///    reference stride if the reference stride is less than the cache line
+///    size (CLS), and the coefficient of this loop's index variable used in all
+///    other subscripts is zero (e.g. RefCost = TripCount/(CLS/RefStride))
+///  - equal to the innermost loop trip count if the reference stride is greater
+///    or equal to the cache line size CLS.
+class CacheCost {
+  friend raw_ostream &operator<<(raw_ostream &OS, const CacheCost &CC);
+  using LoopTripCountTy = std::pair<const Loop *, unsigned>;
+  using LoopCacheCostTy = std::pair<const Loop *, CacheCostTy>;
+
+public:
+  static CacheCostTy constexpr InvalidCost = -1;
+
+  /// Construct a CacheCost object for the loop nest described by \p Loops.
+  /// The optional parameter \p TRT can be used to specify the max. distance
+  /// between array elements accessed in a loop so that the elements are
+  /// classified to have temporal reuse.
+  CacheCost(const LoopVectorTy &Loops, const LoopInfo &LI, ScalarEvolution &SE,
+            TargetTransformInfo &TTI, AliasAnalysis &AA, DependenceInfo &DI,
+            Optional<unsigned> TRT = None);
+
+  /// Create a CacheCost for the loop nest rooted by \p Root.
+  /// The optional parameter \p TRT can be used to specify the max. distance
+  /// between array elements accessed in a loop so that the elements are
+  /// classified to have temporal reuse.
+  static std::unique_ptr<CacheCost>
+  getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, DependenceInfo &DI,
+               Optional<unsigned> TRT = None);
+
+  /// Return the estimated cost of loop \p L if the given loop is part of the
+  /// loop nest associated with this object. Return -1 otherwise.
+  CacheCostTy getLoopCost(const Loop &L) const {
+    auto IT = std::find_if(
+        LoopCosts.begin(), LoopCosts.end(),
+        [&L](const LoopCacheCostTy &LCC) { return LCC.first == &L; });
+    return (IT != LoopCosts.end()) ? (*IT).second : -1;
+  }
+
+  /// Return the estimated ordered loop costs.
+  const ArrayRef<LoopCacheCostTy> getLoopCosts() const { return LoopCosts; }
+
+private:
+  /// Calculate the cache footprint of each loop in the nest (when it is
+  /// considered to be in the innermost position).
+  void calculateCacheFootprint();
+
+  /// Partition store/load instructions in the loop nest into reference groups.
+  /// Two or more memory accesses belong in the same reference group if they
+  /// share the same cache line.
+  bool populateReferenceGroups(ReferenceGroupsTy &RefGroups) const;
+
+  /// Calculate the cost of the given loop \p L assuming it is the innermost
+  /// loop in nest.
+  CacheCostTy computeLoopCacheCost(const Loop &L,
+                                   const ReferenceGroupsTy &RefGroups) const;
+
+  /// Compute the cost of a representative reference in reference group \p RG
+  /// when the given loop \p L is considered as the innermost loop in the nest.
+  /// The computed cost is an estimate for the number of cache lines used by the
+  /// reference group. The representative reference cost is defined as:
+  ///   - equal to one if the reference is loop invariant, or
+  ///   - equal to '(TripCount * stride) / cache_line_size' if (a) loop \p L's
+  ///     induction variable is used only in the reference subscript associated
+  ///     with loop \p L, and (b) the reference stride is less than the cache
+  ///     line size, or
+  ///   - TripCount otherwise
+  CacheCostTy computeRefGroupCacheCost(const ReferenceGroupTy &RG,
+                                       const Loop &L) const;
+
+  /// Sort the LoopCosts vector by decreasing cache cost.
+  void sortLoopCosts() {
+    sort(LoopCosts, [](const LoopCacheCostTy &A, const LoopCacheCostTy &B) {
+      return A.second > B.second;
+    });
+  }
+
+private:
+  /// Loops in the loop nest associated with this object.
+  LoopVectorTy Loops;
+
+  /// Trip counts for the loops in the loop nest associated with this object.
+  SmallVector<LoopTripCountTy, 3> TripCounts;
+
+  /// Cache costs for the loops in the loop nest associated with this object.
+  SmallVector<LoopCacheCostTy, 3> LoopCosts;
+
+  /// The max. distance between array elements accessed in a loop so that the
+  /// elements are classified to have temporal reuse.
+  Optional<unsigned> TRT;
+
+  const LoopInfo &LI;
+  ScalarEvolution &SE;
+  TargetTransformInfo &TTI;
+  AliasAnalysis &AA;
+  DependenceInfo &DI;
+};
+
+/// Printer pass for the \c CacheCost results.
+class LoopCachePrinterPass : public PassInfoMixin<LoopCachePrinterPass> {
+  raw_ostream &OS;
+
+public:
+  explicit LoopCachePrinterPass(raw_ostream &OS) : OS(OS) {}
+
+  PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+                        LoopStandardAnalysisResults &AR, LPMUpdater &U);
+};
+
+} // namespace llvm
+
+#endif // LLVM_ANALYSIS_LOOPCACHEANALYSIS_H

Modified: llvm/trunk/lib/Analysis/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/CMakeLists.txt?rev=368439&r1=368438&r2=368439&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/CMakeLists.txt (original)
+++ llvm/trunk/lib/Analysis/CMakeLists.txt Fri Aug  9 06:56:29 2019
@@ -50,6 +50,7 @@ add_llvm_library(LLVMAnalysis
   Loads.cpp
   LoopAccessAnalysis.cpp
   LoopAnalysisManager.cpp
+  LoopCacheAnalysis.cpp
   LoopUnrollAnalyzer.cpp
   LoopInfo.cpp
   LoopPass.cpp

Added: llvm/trunk/lib/Analysis/LoopCacheAnalysis.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/LoopCacheAnalysis.cpp?rev=368439&view=auto
==============================================================================
--- llvm/trunk/lib/Analysis/LoopCacheAnalysis.cpp (added)
+++ llvm/trunk/lib/Analysis/LoopCacheAnalysis.cpp Fri Aug  9 06:56:29 2019
@@ -0,0 +1,625 @@
+//===- LoopCacheAnalysis.cpp - Loop Cache Analysis -------------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the implementation for the loop cache analysis.
+/// The implementation is largely based on the following paper:
+///
+///       Compiler Optimizations for Improving Data Locality
+///       By: Steve Carr, Katherine S. McKinley, Chau-Wen Tseng
+///       http://www.cs.utexas.edu/users/mckinley/papers/asplos-1994.pdf
+///
+/// The general approach taken to estimate the number of cache lines used by the
+/// memory references in an inner loop is:
+///    1. Partition memory references that exhibit temporal or spatial reuse
+///       into reference groups.
+///    2. For each loop L in the loop nest LN:
+///       a. Compute the cost of the reference group
+///       b. Compute the loop cost by summing up the reference groups costs
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LoopCacheAnalysis.h"
+#include "llvm/ADT/BreadthFirstIterator.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-cache-cost"
+
+static cl::opt<unsigned> DefaultTripCount(
+    "default-trip-count", cl::init(100), cl::Hidden,
+    cl::desc("Use this to specify the default trip count of a loop"));
+
+// In this analysis two array references are considered to exhibit temporal
+// reuse if they access either the same memory location, or a memory location
+// with distance smaller than a configurable threshold.
+static cl::opt<unsigned> TemporalReuseThreshold(
+    "temporal-reuse-threshold", cl::init(2), cl::Hidden,
+    cl::desc("Use this to specify the max. distance between array elements "
+             "accessed in a loop so that the elements are classified to have "
+             "temporal reuse"));
+
+/// Retrieve the innermost loop in the given loop nest \p Loops. It returns
+/// nullptr if any loop in the supplied loop vector has more than one sibling.
+/// The loop vector is expected to contain loops collected in breadth-first
+/// order.
+static Loop *getInnerMostLoop(const LoopVectorTy &Loops) {
+  assert(!Loops.empty() && "Expecting a non-empy loop vector");
+
+  Loop *LastLoop = Loops.back();
+  Loop *ParentLoop = LastLoop->getParentLoop();
+
+  if (ParentLoop == nullptr) {
+    assert(Loops.size() == 1 && "Expecting a single loop");
+    return LastLoop;
+  }
+
+  return (std::is_sorted(Loops.begin(), Loops.end(),
+                         [](const Loop *L1, const Loop *L2) {
+                           return L1->getLoopDepth() < L2->getLoopDepth();
+                         }))
+             ? LastLoop
+             : nullptr;
+}
+
+static bool isOneDimensionalArray(const SCEV &AccessFn, const SCEV &ElemSize,
+                                  const Loop &L, ScalarEvolution &SE) {
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(&AccessFn);
+  if (!AR || !AR->isAffine())
+    return false;
+
+  assert(AR->getLoop() && "AR should have a loop");
+
+  // Check that start and increment are not add recurrences.
+  const SCEV *Start = AR->getStart();
+  const SCEV *Step = AR->getStepRecurrence(SE);
+  if (isa<SCEVAddRecExpr>(Start) || isa<SCEVAddRecExpr>(Step))
+    return false;
+
+  // Check that start and increment are both invariant in the loop.
+  if (!SE.isLoopInvariant(Start, &L) || !SE.isLoopInvariant(Step, &L))
+    return false;
+
+  return AR->getStepRecurrence(SE) == &ElemSize;
+}
+
+/// Compute the trip count for the given loop \p L. Return the SCEV expression
+/// for the trip count or nullptr if it cannot be computed.
+static const SCEV *computeTripCount(const Loop &L, ScalarEvolution &SE) {
+  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(&L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) ||
+      !isa<SCEVConstant>(BackedgeTakenCount))
+    return nullptr;
+
+  return SE.getAddExpr(BackedgeTakenCount,
+                       SE.getOne(BackedgeTakenCount->getType()));
+}
+
+//===----------------------------------------------------------------------===//
+// IndexedReference implementation
+//
+raw_ostream &llvm::operator<<(raw_ostream &OS, const IndexedReference &R) {
+  if (!R.IsValid) {
+    OS << R.StoreOrLoadInst;
+    OS << ", IsValid=false.";
+    return OS;
+  }
+
+  OS << *R.BasePointer;
+  for (const SCEV *Subscript : R.Subscripts)
+    OS << "[" << *Subscript << "]";
+
+  OS << ", Sizes: ";
+  for (const SCEV *Size : R.Sizes)
+    OS << "[" << *Size << "]";
+
+  return OS;
+}
+
+IndexedReference::IndexedReference(Instruction &StoreOrLoadInst,
+                                   const LoopInfo &LI, ScalarEvolution &SE)
+    : StoreOrLoadInst(StoreOrLoadInst), SE(SE) {
+  assert((isa<StoreInst>(StoreOrLoadInst) || isa<LoadInst>(StoreOrLoadInst)) &&
+         "Expecting a load or store instruction");
+
+  IsValid = delinearize(LI);
+  if (IsValid)
+    LLVM_DEBUG(dbgs().indent(2) << "Succesfully delinearized: " << *this
+                                << "\n");
+}
+
+Optional<bool> IndexedReference::hasSpacialReuse(const IndexedReference &Other,
+                                                 unsigned CLS,
+                                                 AliasAnalysis &AA) const {
+  assert(IsValid && "Expecting a valid reference");
+
+  if (BasePointer != Other.getBasePointer() && !isAliased(Other, AA)) {
+    LLVM_DEBUG(dbgs().indent(2)
+               << "No spacial reuse: different base pointers\n");
+    return false;
+  }
+
+  unsigned NumSubscripts = getNumSubscripts();
+  if (NumSubscripts != Other.getNumSubscripts()) {
+    LLVM_DEBUG(dbgs().indent(2)
+               << "No spacial reuse: different number of subscripts\n");
+    return false;
+  }
+
+  // All subscripts must be equal, except the last (i.e. rightmost) one.
+  for (auto SubNum : seq<unsigned>(0, NumSubscripts - 1)) {
+    if (getSubscript(SubNum) != Other.getSubscript(SubNum)) {
+      LLVM_DEBUG(dbgs().indent(2) << "No spacial reuse, different subscripts: "
+                                  << "\n\t" << *getSubscript(SubNum) << "\n\t"
+                                  << *Other.getSubscript(SubNum) << "\n");
+      return false;
+    }
+  }
+
+  // The difference between the last subscripts must be less than the cache
+  // line size.
+  const SCEV *LastSubscript = getLastSubscript();
+  const SCEV *OtherLastSubscript = Other.getLastSubscript();
+  const SCEVConstant *Diff = dyn_cast<SCEVConstant>(
+      SE.getMinusSCEV(LastSubscript, OtherLastSubscript));
+
+  if (Diff == nullptr) {
+    // Dereference the SCEV pointer so the expression is printed rather than
+    // its address (operator<< on a raw pointer prints the pointer value).
+    LLVM_DEBUG(dbgs().indent(2)
+               << "No spacial reuse, difference between subscript:\n\t"
+               << *LastSubscript << "\n\t" << *OtherLastSubscript
+               << "\nis not constant.\n");
+    return None;
+  }
+
+  bool InSameCacheLine = (Diff->getValue()->getSExtValue() < CLS);
+
+  LLVM_DEBUG({
+    if (InSameCacheLine)
+      dbgs().indent(2) << "Found spacial reuse.\n";
+    else
+      dbgs().indent(2) << "No spacial reuse.\n";
+  });
+
+  return InSameCacheLine;
+}
+
+Optional<bool> IndexedReference::hasTemporalReuse(const IndexedReference &Other,
+                                                  unsigned MaxDistance,
+                                                  const Loop &L,
+                                                  DependenceInfo &DI,
+                                                  AliasAnalysis &AA) const {
+  assert(IsValid && "Expecting a valid reference");
+
+  if (BasePointer != Other.getBasePointer() && !isAliased(Other, AA)) {
+    LLVM_DEBUG(dbgs().indent(2)
+               << "No temporal reuse: different base pointer\n");
+    return false;
+  }
+
+  std::unique_ptr<Dependence> D =
+      DI.depends(&StoreOrLoadInst, &Other.StoreOrLoadInst, true);
+
+  if (D == nullptr) {
+    LLVM_DEBUG(dbgs().indent(2) << "No temporal reuse: no dependence\n");
+    return false;
+  }
+
+  if (D->isLoopIndependent()) {
+    LLVM_DEBUG(dbgs().indent(2) << "Found temporal reuse\n");
+    return true;
+  }
+
+  // Check the dependence distance at every loop level. There is temporal reuse
+  // if the distance at the given loop's depth is small (|d| <= MaxDistance) and
+  // it is zero at every other loop level.
+  int LoopDepth = L.getLoopDepth();
+  int Levels = D->getLevels();
+  for (int Level = 1; Level <= Levels; ++Level) {
+    const SCEV *Distance = D->getDistance(Level);
+    const SCEVConstant *SCEVConst = dyn_cast_or_null<SCEVConstant>(Distance);
+
+    if (SCEVConst == nullptr) {
+      LLVM_DEBUG(dbgs().indent(2) << "No temporal reuse: distance unknown\n");
+      return None;
+    }
+
+    const ConstantInt &CI = *SCEVConst->getValue();
+    if (Level != LoopDepth && !CI.isZero()) {
+      LLVM_DEBUG(dbgs().indent(2)
+                 << "No temporal reuse: distance is not zero at depth=" << Level
+                 << "\n");
+      return false;
+    } else if (Level == LoopDepth && CI.getSExtValue() > MaxDistance) {
+      LLVM_DEBUG(
+          dbgs().indent(2)
+          << "No temporal reuse: distance is greater than MaxDistance at depth="
+          << Level << "\n");
+      return false;
+    }
+  }
+
+  LLVM_DEBUG(dbgs().indent(2) << "Found temporal reuse\n");
+  return true;
+}
+
+CacheCostTy IndexedReference::computeRefCost(const Loop &L,
+                                             unsigned CLS) const {
+  assert(IsValid && "Expecting a valid reference");
+  LLVM_DEBUG({
+    dbgs().indent(2) << "Computing cache cost for:\n";
+    dbgs().indent(4) << *this << "\n";
+  });
+
+  // If the indexed reference is loop invariant the cost is one.
+  if (isLoopInvariant(L)) {
+    LLVM_DEBUG(dbgs().indent(4) << "Reference is loop invariant: RefCost=1\n");
+    return 1;
+  }
+
+  const SCEV *TripCount = computeTripCount(L, SE);
+  if (!TripCount) {
+    LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName()
+                      << " could not be computed, using DefaultTripCount\n");
+    const SCEV *ElemSize = Sizes.back();
+    TripCount = SE.getConstant(ElemSize->getType(), DefaultTripCount);
+  }
+  LLVM_DEBUG(dbgs() << "TripCount=" << *TripCount << "\n");
+
+  // If the indexed reference is 'consecutive' the cost is
+  // (TripCount*Stride)/CLS, otherwise the cost is TripCount.
+  const SCEV *RefCost = TripCount;
+
+  if (isConsecutive(L, CLS)) {
+    const SCEV *Coeff = getLastCoefficient();
+    const SCEV *ElemSize = Sizes.back();
+    const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize);
+    const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS);
+    const SCEV *Numerator = SE.getMulExpr(Stride, TripCount);
+    RefCost = SE.getUDivExpr(Numerator, CacheLineSize);
+    LLVM_DEBUG(dbgs().indent(4)
+               << "Access is consecutive: RefCost=(TripCount*Stride)/CLS="
+               << *RefCost << "\n");
+  } else
+    LLVM_DEBUG(dbgs().indent(4)
+               << "Access is not consecutive: RefCost=TripCount=" << *RefCost
+               << "\n");
+
+  // Attempt to fold RefCost into a constant.
+  if (auto ConstantCost = dyn_cast<SCEVConstant>(RefCost))
+    return ConstantCost->getValue()->getSExtValue();
+
+  LLVM_DEBUG(dbgs().indent(4)
+             << "RefCost is not a constant! Setting to RefCost=InvalidCost "
+                "(invalid value).\n");
+
+  return CacheCost::InvalidCost;
+}
+
+bool IndexedReference::delinearize(const LoopInfo &LI) {
+  assert(Subscripts.empty() && "Subscripts should be empty");
+  assert(Sizes.empty() && "Sizes should be empty");
+  assert(!IsValid && "Should be called once from the constructor");
+  LLVM_DEBUG(dbgs() << "Delinearizing: " << StoreOrLoadInst << "\n");
+
+  const SCEV *ElemSize = SE.getElementSize(&StoreOrLoadInst);
+  const BasicBlock *BB = StoreOrLoadInst.getParent();
+
+  for (Loop *L = LI.getLoopFor(BB); L != nullptr; L = L->getParentLoop()) {
+    const SCEV *AccessFn =
+        SE.getSCEVAtScope(getPointerOperand(&StoreOrLoadInst), L);
+
+    BasePointer = dyn_cast<SCEVUnknown>(SE.getPointerBase(AccessFn));
+    if (BasePointer == nullptr) {
+      LLVM_DEBUG(
+          dbgs().indent(2)
+          << "ERROR: failed to delinearize, can't identify base pointer\n");
+      return false;
+    }
+
+    AccessFn = SE.getMinusSCEV(AccessFn, BasePointer);
+
+    LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName()
+                                << "', AccessFn: " << *AccessFn << "\n");
+
+    SE.delinearize(AccessFn, Subscripts, Sizes,
+                   SE.getElementSize(&StoreOrLoadInst));
+
+    if (Subscripts.empty() || Sizes.empty() ||
+        Subscripts.size() != Sizes.size()) {
+      // Attempt to determine whether we have a single-dimensional array access
+      // before giving up.
+      if (!isOneDimensionalArray(*AccessFn, *ElemSize, *L, SE)) {
+        LLVM_DEBUG(dbgs().indent(2)
+                   << "ERROR: failed to delinearize reference\n");
+        Subscripts.clear();
+        Sizes.clear();
+        break;
+      }
+
+      const SCEV *Div = SE.getUDivExactExpr(AccessFn, ElemSize);
+      Subscripts.push_back(Div);
+      Sizes.push_back(ElemSize);
+    }
+
+    return all_of(Subscripts, [&](const SCEV *Subscript) {
+      return isSimpleAddRecurrence(*Subscript, *L);
+    });
+  }
+
+  return false;
+}
+
+bool IndexedReference::isLoopInvariant(const Loop &L) const {
+  Value *Addr = getPointerOperand(&StoreOrLoadInst);
+  assert(Addr != nullptr && "Expecting either a load or a store instruction");
+  assert(SE.isSCEVable(Addr->getType()) && "Addr should be SCEVable");
+
+  if (SE.isLoopInvariant(SE.getSCEV(Addr), &L))
+    return true;
+
+  // The indexed reference is loop invariant if none of the coefficients use
+  // the loop induction variable.
+  bool allCoeffForLoopAreZero = all_of(Subscripts, [&](const SCEV *Subscript) {
+    return isCoeffForLoopZeroOrInvariant(*Subscript, L);
+  });
+
+  return allCoeffForLoopAreZero;
+}
+
+bool IndexedReference::isConsecutive(const Loop &L, unsigned CLS) const {
+  // The indexed reference is 'consecutive' if the only coefficient that uses
+  // the loop induction variable is the last one...
+  const SCEV *LastSubscript = Subscripts.back();
+  for (const SCEV *Subscript : Subscripts) {
+    if (Subscript == LastSubscript)
+      continue;
+    if (!isCoeffForLoopZeroOrInvariant(*Subscript, L))
+      return false;
+  }
+
+  // ...and the access stride is less than the cache line size.
+  const SCEV *Coeff = getLastCoefficient();
+  const SCEV *ElemSize = Sizes.back();
+  const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize);
+  const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS);
+
+  return SE.isKnownPredicate(ICmpInst::ICMP_ULT, Stride, CacheLineSize);
+}
+
+// Return the coefficient of the induction variable in the last subscript,
+// i.e. the step of its add recurrence. The last subscript is required to be
+// a SCEVAddRecExpr.
+const SCEV *IndexedReference::getLastCoefficient() const {
+  const SCEV *LastSubscript = getLastSubscript();
+  assert(isa<SCEVAddRecExpr>(LastSubscript) &&
+         "Expecting a SCEV add recurrence expression");
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LastSubscript);
+  return AR->getStepRecurrence(SE);
+}
+
+// Return true if \p Subscript does not depend on the induction variable of
+// loop \p L: either it is an add recurrence for a different loop, or it is
+// not an add recurrence at all and is invariant in L.
+bool IndexedReference::isCoeffForLoopZeroOrInvariant(const SCEV &Subscript,
+                                                     const Loop &L) const {
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(&Subscript);
+  return (AR != nullptr) ? AR->getLoop() != &L
+                         : SE.isLoopInvariant(&Subscript, &L);
+}
+
+// Return true if \p Subscript is an affine add recurrence (of the form
+// start + step * iv) whose start and step are both invariant in loop \p L.
+bool IndexedReference::isSimpleAddRecurrence(const SCEV &Subscript,
+                                             const Loop &L) const {
+  if (!isa<SCEVAddRecExpr>(Subscript))
+    return false;
+
+  const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(&Subscript);
+  assert(AR->getLoop() && "AR should have a loop");
+
+  // Only affine (degree-1) recurrences are considered 'simple'.
+  if (!AR->isAffine())
+    return false;
+
+  const SCEV *Start = AR->getStart();
+  const SCEV *Step = AR->getStepRecurrence(SE);
+
+  if (!SE.isLoopInvariant(Start, &L) || !SE.isLoopInvariant(Step, &L))
+    return false;
+
+  return true;
+}
+
+// Return true if this reference and \p Other are proven to access exactly the
+// same memory location (must-alias). A 'may alias' answer from AA is treated
+// as not aliased.
+bool IndexedReference::isAliased(const IndexedReference &Other,
+                                 AliasAnalysis &AA) const {
+  const auto &Loc1 = MemoryLocation::get(&StoreOrLoadInst);
+  const auto &Loc2 = MemoryLocation::get(&Other.StoreOrLoadInst);
+  return AA.isMustAlias(Loc1, Loc2);
+}
+
+//===----------------------------------------------------------------------===//
+// CacheCost implementation
+//
+// Print the computed cache cost of each loop in the nest, one loop per line.
+raw_ostream &llvm::operator<<(raw_ostream &OS, const CacheCost &CC) {
+  for (const auto &LC : CC.LoopCosts) {
+    const Loop *L = LC.first;
+    OS << "Loop '" << L->getName() << "' has cost = " << LC.second << "\n";
+  }
+  return OS;
+}
+
+// Construct the analysis result for the loops in \p Loops: record each loop's
+// constant trip count (or a default when unknown) and compute the cache
+// footprint of the nest. When \p TRT is not supplied, the value of the
+// TemporalReuseThreshold option is used as the temporal reuse threshold.
+CacheCost::CacheCost(const LoopVectorTy &Loops, const LoopInfo &LI,
+                     ScalarEvolution &SE, TargetTransformInfo &TTI,
+                     AliasAnalysis &AA, DependenceInfo &DI,
+                     Optional<unsigned> TRT)
+    : Loops(Loops), TripCounts(), LoopCosts(),
+      TRT(TRT == None ? Optional<unsigned>(TemporalReuseThreshold) : TRT),
+      LI(LI), SE(SE), TTI(TTI), AA(AA), DI(DI) {
+  assert(!Loops.empty() && "Expecting a non-empty loop vector.");
+
+  for (const Loop *L : Loops) {
+    // getSmallConstantTripCount returns 0 when the trip count is not a known
+    // small constant; fall back to DefaultTripCount in that case.
+    unsigned TripCount = SE.getSmallConstantTripCount(L);
+    TripCount = (TripCount == 0) ? DefaultTripCount : TripCount;
+    TripCounts.push_back({L, TripCount});
+  }
+
+  calculateCacheFootprint();
+}
+
+// Static factory: compute the cache cost of the loop nest rooted at \p Root.
+// Returns nullptr when \p Root is not an outermost loop, or when the nest
+// does not have exactly one innermost loop.
+std::unique_ptr<CacheCost>
+CacheCost::getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR,
+                        DependenceInfo &DI, Optional<unsigned> TRT) {
+  if (Root.getParentLoop()) {
+    LLVM_DEBUG(dbgs() << "Expecting the outermost loop in a loop nest\n");
+    return nullptr;
+  }
+
+  // Collect every loop in the nest, outermost first.
+  LoopVectorTy Loops;
+  for (Loop *L : breadth_first(&Root))
+    Loops.push_back(L);
+
+  if (!getInnerMostLoop(Loops)) {
+    LLVM_DEBUG(dbgs() << "Cannot compute cache cost of loop nest with more "
+                         "than one innermost loop\n");
+    return nullptr;
+  }
+
+  return make_unique<CacheCost>(Loops, AR.LI, AR.SE, AR.TTI, AR.AA, DI, TRT);
+}
+
+// Drive the analysis: partition the innermost loop's memory references into
+// reference groups, then compute and record the cache cost of the nest for
+// each loop considered as the innermost one. Results end up in LoopCosts,
+// sorted by decreasing cost.
+void CacheCost::calculateCacheFootprint() {
+  LLVM_DEBUG(dbgs() << "POPULATING REFERENCE GROUPS\n");
+  ReferenceGroupsTy RefGroups;
+  if (!populateReferenceGroups(RefGroups))
+    return;
+
+  LLVM_DEBUG(dbgs() << "COMPUTING LOOP CACHE COSTS\n");
+  for (const Loop *L : Loops) {
+    assert((std::find_if(LoopCosts.begin(), LoopCosts.end(),
+                         [L](const LoopCacheCostTy &LCC) {
+                           return LCC.first == L;
+                         }) == LoopCosts.end()) &&
+           "Should not add duplicate element");
+    CacheCostTy LoopCost = computeLoopCacheCost(*L, RefGroups);
+    LoopCosts.push_back(std::make_pair(L, LoopCost));
+  }
+
+  sortLoopCosts();
+  RefGroups.clear();
+}
+
+// Partition the load/store instructions of the innermost loop into reference
+// groups: a new reference joins an existing group when it exhibits temporal
+// reuse (within threshold *TRT) or spacial reuse (within a cache line of size
+// CLS) with the group's first member; otherwise it starts a new group.
+// Returns false when no analyzable references are found.
+bool CacheCost::populateReferenceGroups(ReferenceGroupsTy &RefGroups) const {
+  assert(RefGroups.empty() && "Reference groups should be empty");
+
+  unsigned CLS = TTI.getCacheLineSize();
+  Loop *InnerMostLoop = getInnerMostLoop(Loops);
+  assert(InnerMostLoop != nullptr && "Expecting a valid innermost loop");
+
+  for (BasicBlock *BB : InnerMostLoop->getBlocks()) {
+    for (Instruction &I : *BB) {
+      // Only memory references (loads and stores) are of interest.
+      if (!isa<StoreInst>(I) && !isa<LoadInst>(I))
+        continue;
+
+      std::unique_ptr<IndexedReference> R(new IndexedReference(I, LI, SE));
+      if (!R->isValid())
+        continue;
+
+      // Compare against the representative (first member) of each existing
+      // group; join the first group that shows reuse.
+      bool Added = false;
+      for (ReferenceGroupTy &RefGroup : RefGroups) {
+        const IndexedReference &Representative = *RefGroup.front().get();
+        LLVM_DEBUG({
+          dbgs() << "References:\n";
+          dbgs().indent(2) << *R << "\n";
+          dbgs().indent(2) << Representative << "\n";
+        });
+
+        // A None result means the reuse analysis was inconclusive; it is
+        // treated the same as "no reuse" below.
+        Optional<bool> HasTemporalReuse =
+            R->hasTemporalReuse(Representative, *TRT, *InnerMostLoop, DI, AA);
+        Optional<bool> HasSpacialReuse =
+            R->hasSpacialReuse(Representative, CLS, AA);
+
+        if ((HasTemporalReuse.hasValue() && *HasTemporalReuse) ||
+            (HasSpacialReuse.hasValue() && *HasSpacialReuse)) {
+          RefGroup.push_back(std::move(R));
+          Added = true;
+          break;
+        }
+      }
+
+      // No reuse with any existing group: start a new singleton group.
+      if (!Added) {
+        ReferenceGroupTy RG;
+        RG.push_back(std::move(R));
+        RefGroups.push_back(std::move(RG));
+      }
+    }
+  }
+
+  if (RefGroups.empty())
+    return false;
+
+  LLVM_DEBUG({
+    dbgs() << "\nIDENTIFIED REFERENCE GROUPS:\n";
+    int n = 1;
+    for (const ReferenceGroupTy &RG : RefGroups) {
+      dbgs().indent(2) << "RefGroup " << n << ":\n";
+      for (const auto &IR : RG)
+        dbgs().indent(4) << *IR << "\n";
+      n++;
+    }
+    dbgs() << "\n";
+  });
+
+  return true;
+}
+
+// Compute the cache cost of the loop nest when loop \p L is considered the
+// innermost loop: the sum over all reference groups of (group cost * product
+// of the trip counts of every other loop in the nest). Returns InvalidCost
+// when \p L is not in loop-simplify form.
+CacheCostTy
+CacheCost::computeLoopCacheCost(const Loop &L,
+                                const ReferenceGroupsTy &RefGroups) const {
+  if (!L.isLoopSimplifyForm())
+    return InvalidCost;
+
+  LLVM_DEBUG(dbgs() << "Considering loop '" << L.getName()
+                    << "' as innermost loop.\n");
+
+  // Compute the product of the trip counts of each other loop in the nest.
+  CacheCostTy TripCountsProduct = 1;
+  for (const auto &TC : TripCounts) {
+    if (TC.first == &L)
+      continue;
+    TripCountsProduct *= TC.second;
+  }
+
+  CacheCostTy LoopCost = 0;
+  for (const ReferenceGroupTy &RG : RefGroups) {
+    CacheCostTy RefGroupCost = computeRefGroupCacheCost(RG, L);
+    LoopCost += RefGroupCost * TripCountsProduct;
+  }
+
+  LLVM_DEBUG(dbgs().indent(2) << "Loop '" << L.getName()
+                              << "' has cost=" << LoopCost << "\n");
+
+  return LoopCost;
+}
+
+// The cost of a reference group is the cost of its representative (first)
+// member, since all members of a group exhibit reuse with one another.
+CacheCostTy CacheCost::computeRefGroupCacheCost(const ReferenceGroupTy &RG,
+                                                const Loop &L) const {
+  assert(!RG.empty() && "Reference group should have at least one member.");
+
+  const IndexedReference *Representative = RG.front().get();
+  return Representative->computeRefCost(L, TTI.getCacheLineSize());
+}
+
+//===----------------------------------------------------------------------===//
+// LoopCachePrinterPass implementation
+//
+// Printer pass entry point: compute the cache cost for the nest rooted at
+// \p L (a no-op unless L is an outermost loop) and print the per-loop costs.
+// The pass is analysis-only and preserves everything.
+PreservedAnalyses LoopCachePrinterPass::run(Loop &L, LoopAnalysisManager &AM,
+                                            LoopStandardAnalysisResults &AR,
+                                            LPMUpdater &U) {
+  Function *F = L.getHeader()->getParent();
+  DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI);
+
+  if (auto CC = CacheCost::getCacheCost(L, AR, DI))
+    OS << *CC;
+
+  return PreservedAnalyses::all();
+}

Modified: llvm/trunk/lib/Passes/PassBuilder.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Passes/PassBuilder.cpp?rev=368439&r1=368438&r2=368439&view=diff
==============================================================================
--- llvm/trunk/lib/Passes/PassBuilder.cpp (original)
+++ llvm/trunk/lib/Passes/PassBuilder.cpp Fri Aug  9 06:56:29 2019
@@ -35,6 +35,7 @@
 #include "llvm/Analysis/LazyCallGraph.h"
 #include "llvm/Analysis/LazyValueInfo.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopCacheAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/Analysis/MemorySSA.h"

Modified: llvm/trunk/lib/Passes/PassRegistry.def
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Passes/PassRegistry.def?rev=368439&r1=368438&r2=368439&view=diff
==============================================================================
--- llvm/trunk/lib/Passes/PassRegistry.def (original)
+++ llvm/trunk/lib/Passes/PassRegistry.def Fri Aug  9 06:56:29 2019
@@ -302,6 +302,7 @@ LOOP_PASS("unroll-and-jam", LoopUnrollAn
 LOOP_PASS("unroll-full", LoopFullUnrollPass())
 LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs()))
 LOOP_PASS("print<ivusers>", IVUsersPrinterPass(dbgs()))
+LOOP_PASS("print<loop-cache-cost>", LoopCachePrinterPass(dbgs()))
 LOOP_PASS("loop-predication", LoopPredicationPass())
 LOOP_PASS("guard-widening", GuardWideningPass())
 #undef LOOP_PASS

Added: llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/lit.local.cfg?rev=368439&view=auto
==============================================================================
--- llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/lit.local.cfg (added)
+++ llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/lit.local.cfg Fri Aug  9 06:56:29 2019
@@ -0,0 +1,2 @@
+if not 'PowerPC' in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll?rev=368439&view=auto
==============================================================================
--- llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll (added)
+++ llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll Fri Aug  9 06:56:29 2019
@@ -0,0 +1,88 @@
+; RUN: opt < %s -passes='print<loop-cache-cost>' -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; void foo(long n, long m, long o, int A[n][m][o], int B[n][m][o], int C[n][m][o]) {
+;   for (long i = 0; i < n; i++)
+;     for (long j = 0; j < m; j++)
+;       for (long k = 0; k < o; k++)
+;         A[i][k][j] += B[i][k][j] + C[i][j][k];
+; }
+
+; CHECK: Loop 'for.i' has cost = 3000000
+; CHECK-NEXT: Loop 'for.k' has cost = 2030000
+; CHECK-NEXT: Loop 'for.j' has cost = 1060000
+
+define void @foo(i64 %n, i64 %m, i64 %o, i32* %A, i32* %B, i32* %C) {
+entry:
+  %cmp32 = icmp sgt i64 %n, 0
+  %cmp230 = icmp sgt i64 %m, 0
+  %cmp528 = icmp sgt i64 %o, 0
+  br i1 %cmp32, label %for.cond1.preheader.lr.ph, label %for.end
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  br i1 %cmp230, label %for.i.preheader, label %for.end
+
+for.i.preheader:                                  ; preds = %for.cond1.preheader.lr.ph
+  br i1 %cmp528, label %for.i.preheader.split, label %for.end
+
+for.i.preheader.split:                            ; preds = %for.i.preheader
+  br label %for.i
+
+for.i:                                            ; preds = %for.inci, %for.i.preheader.split
+  %i = phi i64 [ %inci, %for.inci ], [ 0, %for.i.preheader.split ]
+  %muli = mul i64 %i, %m
+  br label %for.j
+
+for.j:                                            ; preds = %for.incj, %for.i
+  %j = phi i64 [ %incj, %for.incj ], [ 0, %for.i ]
+  %addj = add i64 %muli, %j
+  %mulj = mul i64 %addj, %o
+  br label %for.k
+
+for.k:                                            ; preds = %for.k, %for.j
+  %k = phi i64 [ 0, %for.j ], [ %inck, %for.k ]
+
+  ; B[i][k][j]
+  %addk = add i64 %muli, %k
+  %mulk = mul i64 %addk, %o
+  %arrayidx1 = add i64 %j, %mulk
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %arrayidx1
+  %elem_B = load i32, i32* %arrayidx2, align 4
+
+  ; C[i][j][k]
+  %arrayidx3 = add i64 %k, %mulj
+  %arrayidx4 = getelementptr inbounds i32, i32* %C, i64 %arrayidx3
+  %elem_C = load i32, i32* %arrayidx4, align 4
+
+  ; A[i][k][j]
+  %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %arrayidx1
+  %elem_A = load i32, i32* %arrayidx5, align 4
+
+  ; A[i][k][j] += B[i][k][j] + C[i][j][k]
+  %add1 = add i32 %elem_B, %elem_C
+  %add2 = add i32 %add1, %elem_A
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 %arrayidx1
+  store i32 %add2, i32* %arrayidx6, align 4
+
+  %inck = add nsw i64 %k, 1
+  %exitcond.us = icmp eq i64 %inck, %o
+  br i1 %exitcond.us, label %for.incj, label %for.k
+
+for.incj:                                         ; preds = %for.k
+  %incj = add nsw i64 %j, 1
+  %exitcond54.us = icmp eq i64 %incj, %m
+  br i1 %exitcond54.us, label %for.inci, label %for.j
+
+for.inci:                                         ; preds = %for.incj
+  %inci = add nsw i64 %i, 1
+  %exitcond55.us = icmp eq i64 %inci, %n
+  br i1 %exitcond55.us, label %for.end.loopexit, label %for.i
+
+for.end.loopexit:                                 ; preds = %for.inci
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %for.cond1.preheader.lr.ph, %entry
+  ret void
+}

Added: llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll?rev=368439&view=auto
==============================================================================
--- llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll (added)
+++ llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll Fri Aug  9 06:56:29 2019
@@ -0,0 +1,81 @@
+; RUN: opt < %s -passes='print<loop-cache-cost>' -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; void matmul(long n, long m, long o, int A[n][m], int B[n][m], int C[n][m]) {
+;   for (long i = 0; i < n; i++) 
+;     for (long j = 0; j < m; j++) 
+;       for (long k = 0; k < o; k++) 
+;         C[i][j] = C[i][j] + A[i][k] * B[k][j];
+; }
+
+; CHECK:Loop 'for.i' has cost = 2010000
+; CHECK-NEXT:Loop 'for.k' has cost = 1040000
+; CHECK-NEXT:Loop 'for.j' has cost = 70000
+    
+define void @matmul(i64 %n, i64 %m, i64 %o, i32* %A, i32* %B, i32* %C) {
+entry:
+  br label %for.i
+
+for.i:                                         ; preds = %entry, %for.inc.i
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc.i ]
+  %muli = mul i64 %i, %m
+  br label %for.j
+
+for.j:                                        ; preds = %for.i, %for.inc.j
+  %j = phi i64 [ 0, %for.i ], [ %j.next, %for.inc.j ]
+  %addj = add i64 %muli, %j
+  %mulj = mul i64 %addj, %o
+  br label %for.k
+
+for.k:                                        ; preds = %for.j, %for.inc.k
+  %k = phi i64 [ 0, %for.j ], [ %k.next, %for.inc.k ]
+
+  ; A[i][k]
+  %arrayidx3 = add i64 %k, %muli
+  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %arrayidx3
+  %elem_A = load i32, i32* %arrayidx4, align 4
+
+  ; B[k][j]
+  %mulk = mul i64 %k, %o
+  %arrayidx5 = add i64 %j, %mulk
+  %arrayidx6 = getelementptr inbounds i32, i32* %B, i64 %arrayidx5
+  %elem_B = load i32, i32* %arrayidx6, align 4
+
+  ; C[i][j]
+  %arrayidx7 = add i64 %j, %muli
+  %arrayidx8 = getelementptr inbounds i32, i32* %C, i64 %arrayidx7
+  %elem_C = load i32, i32* %arrayidx8, align 4
+
+  ; C[i][j] = C[i][j] + A[i][k] * B[k][j];
+  %mul = mul nsw i32 %elem_A, %elem_B
+  %add = add nsw i32 %elem_C, %mul
+  store i32 %add, i32* %arrayidx8, align 4
+
+  br label %for.inc.k
+
+for.inc.k:                                          ; preds = %for.k
+  %k.next = add nuw nsw i64 %k, 1
+  %exitcond = icmp ne i64 %k.next, %o
+  br i1 %exitcond, label %for.k, label %for.end
+
+for.end:                                          ; preds = %for.inc.k
+  br label %for.inc.j
+
+for.inc.j:                                        ; preds = %for.end
+  %j.next = add nuw nsw i64 %j, 1
+  %exitcond5 = icmp ne i64 %j.next, %m
+  br i1 %exitcond5, label %for.j, label %for.end23
+
+for.end23:                                        ; preds = %for.inc.j
+  br label %for.inc.i
+
+for.inc.i:                                        ; preds = %for.end23
+  %i.next = add nuw nsw i64 %i, 1
+  %exitcond8 = icmp ne i64 %i.next, %n
+  br i1 %exitcond8, label %for.i, label %for.end26
+
+for.end26:                                        ; preds = %for.inc.i
+  ret void
+}

Added: llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll?rev=368439&view=auto
==============================================================================
--- llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll (added)
+++ llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll Fri Aug  9 06:56:29 2019
@@ -0,0 +1,185 @@
+; RUN: opt < %s -passes='print<loop-cache-cost>' -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; void matvecmul(const double *__restrict y, const double * __restrict x, const double * __restrict b,
+;    const int * __restrict nb, const int * __restrict nx, const int * __restrict ny, const int * __restrict nz) {
+;
+;   for (int k=1;k<nz;++k)
+;      for (int j=1;j<ny;++j)
+;        for (int i=1;i<nx;++i)
+;          for (int l=1;l<nb;++l)
+;            for (int m=1;m<nb;++m)
+;                 y[k+1][j][i][l] = b[k][j][i][m][l]*x[k][j][i][m]
+; }
+
+; CHECK: Loop 'k_loop' has cost = 30000000000
+; CHECK-NEXT: Loop 'j_loop' has cost = 30000000000
+; CHECK-NEXT: Loop 'i_loop' has cost = 30000000000
+; CHECK-NEXT: Loop 'm_loop' has cost = 10700000000
+; CHECK-NEXT: Loop 'l_loop' has cost = 1300000000
+
+%_elem_type_of_double = type <{ double }>
+
+; Function Attrs: norecurse nounwind
+define void @mat_vec_mpy([0 x %_elem_type_of_double]* noalias %y, [0 x %_elem_type_of_double]* noalias readonly %x,
+    [0 x %_elem_type_of_double]* noalias readonly %b, i32* noalias readonly %nb, i32* noalias readonly %nx, 
+    i32* noalias readonly %ny, i32* noalias readonly %nz) {
+mat_times_vec_entry:
+  %_ind_val = load i32, i32* %nb, align 4
+  %_conv = sext i32 %_ind_val to i64
+  %_grt_tmp.i = icmp sgt i64 %_conv, 0
+  %a_b.i = select i1 %_grt_tmp.i, i64 %_conv, i64 0
+  %_ind_val1 = load i32, i32* %nx, align 4
+  %_conv2 = sext i32 %_ind_val1 to i64
+  %_grt_tmp.i266 = icmp sgt i64 %_conv2, 0
+  %a_b.i267 = select i1 %_grt_tmp.i266, i64 %_conv2, i64 0
+  %_ind_val3 = load i32, i32* %ny, align 4
+  %_conv4 = sext i32 %_ind_val3 to i64
+  %_grt_tmp.i264 = icmp sgt i64 %_conv4, 0
+  %a_b.i265 = select i1 %_grt_tmp.i264, i64 %_conv4, i64 0
+  %_ind_val5 = load i32, i32* %nz, align 4
+  %_mult_tmp = shl nsw i64 %a_b.i, 3
+  %_mult_tmp7 = mul i64 %_mult_tmp, %a_b.i267
+  %_mult_tmp8 = mul i64 %_mult_tmp7, %a_b.i265
+  %_sub_tmp = sub nuw nsw i64 -8, %_mult_tmp
+  %_sub_tmp21 = sub i64 %_sub_tmp, %_mult_tmp7
+  %_sub_tmp23 = sub i64 %_sub_tmp21, %_mult_tmp8
+  %_mult_tmp73 = mul i64 %_mult_tmp, %a_b.i
+  %_mult_tmp74 = mul i64 %_mult_tmp73, %a_b.i267
+  %_mult_tmp75 = mul i64 %_mult_tmp74, %a_b.i265
+  %_sub_tmp93 = sub i64 %_sub_tmp, %_mult_tmp73
+  %_sub_tmp95 = sub i64 %_sub_tmp93, %_mult_tmp74
+  %_sub_tmp97 = sub i64 %_sub_tmp95, %_mult_tmp75
+  %_grt_tmp853288 = icmp slt i32 %_ind_val5, 1
+  br i1 %_grt_tmp853288, label %_return_bb, label %k_loop.lr.ph
+
+k_loop.lr.ph:                                     ; preds = %mat_times_vec_entry
+  %_grt_tmp851279 = icmp slt i32 %_ind_val3, 1
+  %_grt_tmp847270 = icmp slt i32 %_ind_val, 1
+  %_aa_conv = bitcast [0 x %_elem_type_of_double]* %y to i8*
+  %_adda_ = getelementptr inbounds i8, i8* %_aa_conv, i64 %_sub_tmp23
+  %_aa_conv434 = bitcast [0 x %_elem_type_of_double]* %x to i8*
+  %_adda_435 = getelementptr inbounds i8, i8* %_aa_conv434, i64 %_sub_tmp23
+  %_aa_conv785 = bitcast [0 x %_elem_type_of_double]* %b to i8*
+  %_adda_786 = getelementptr inbounds i8, i8* %_aa_conv785, i64 %_sub_tmp97
+  br i1 %_grt_tmp851279, label %k_loop.us.preheader, label %k_loop.lr.ph.split
+
+k_loop.us.preheader:                              ; preds = %k_loop.lr.ph
+  br label %_return_bb.loopexit
+
+k_loop.lr.ph.split:                               ; preds = %k_loop.lr.ph
+  %_grt_tmp849273 = icmp slt i32 %_ind_val1, 1
+  br i1 %_grt_tmp849273, label %k_loop.us291.preheader, label %k_loop.lr.ph.split.split
+
+k_loop.us291.preheader:                           ; preds = %k_loop.lr.ph.split
+  br label %_return_bb.loopexit300
+
+k_loop.lr.ph.split.split:                         ; preds = %k_loop.lr.ph.split
+  br i1 %_grt_tmp847270, label %k_loop.us294.preheader, label %k_loop.preheader
+
+k_loop.preheader:                                 ; preds = %k_loop.lr.ph.split.split
+  %0 = add i32 %_ind_val, 1
+  %1 = add i32 %_ind_val1, 1
+  %2 = add i32 %_ind_val3, 1
+  %3 = add i32 %_ind_val5, 1
+  br label %k_loop
+
+k_loop.us294.preheader:                           ; preds = %k_loop.lr.ph.split.split
+  br label %_return_bb.loopexit301
+
+k_loop:                                           ; preds = %k_loop._label_18_crit_edge.split.split.split, %k_loop.preheader
+  %indvars.iv316 = phi i64 [ 1, %k_loop.preheader ], [ %indvars.iv.next317, %k_loop._label_18_crit_edge.split.split.split ]
+  %indvars.iv.next317 = add nuw nsw i64 %indvars.iv316, 1
+  %_ix_x_len = mul i64 %_mult_tmp8, %indvars.iv.next317
+  %_ix_x_len410 = mul i64 %_mult_tmp75, %indvars.iv316
+  %_ix_x_len822 = mul i64 %_mult_tmp8, %indvars.iv316
+  br label %j_loop
+
+j_loop:                                           ; preds = %j_loop._label_15_crit_edge.split.split, %k_loop
+  %indvars.iv312 = phi i64 [ %indvars.iv.next313, %j_loop._label_15_crit_edge.split.split ], [ 1, %k_loop ]
+  %_ix_x_len371 = mul i64 %_mult_tmp7, %indvars.iv312
+  %_ix_x_len415 = mul i64 %_mult_tmp74, %indvars.iv312
+  br label %i_loop
+
+i_loop:                                           ; preds = %i_loop._label_12_crit_edge.split, %j_loop
+  %indvars.iv307 = phi i64 [ %indvars.iv.next308, %i_loop._label_12_crit_edge.split ], [ 1, %j_loop ]
+  %_ix_x_len375 = mul i64 %_mult_tmp, %indvars.iv307
+  %_ix_x_len420 = mul i64 %_mult_tmp73, %indvars.iv307
+  br label %l_loop
+
+l_loop:                                           ; preds = %l_loop._label_9_crit_edge, %i_loop
+  %indvars.iv303 = phi i64 [ %indvars.iv.next304, %l_loop._label_9_crit_edge ], [ 1, %i_loop ]
+  %_ix_x_len378 = shl nuw nsw i64 %indvars.iv303, 3
+  br label %m_loop
+
+m_loop:                                           ; preds = %m_loop, %l_loop
+  %indvars.iv = phi i64 [ %indvars.iv.next, %m_loop ], [ 1, %l_loop ]
+  %_ix_x_len424 = mul i64 %_mult_tmp, %indvars.iv
+  %_ix_x_len454 = shl nuw nsw i64 %indvars.iv, 3
+  %_ixa_gep = getelementptr inbounds i8, i8* %_adda_, i64 %_ix_x_len
+  %_ixa_gep791 = getelementptr inbounds i8, i8* %_adda_786, i64 %_ix_x_len410
+  %_ixa_gep823 = getelementptr inbounds i8, i8* %_adda_435, i64 %_ix_x_len822
+  %_ixa_gep372 = getelementptr inbounds i8, i8* %_ixa_gep, i64 %_ix_x_len371
+  %_ixa_gep376 = getelementptr inbounds i8, i8* %_ixa_gep372, i64 %_ix_x_len375
+  %_ixa_gep796 = getelementptr inbounds i8, i8* %_ixa_gep791, i64 %_ix_x_len415
+  %_ixa_gep828 = getelementptr inbounds i8, i8* %_ixa_gep823, i64 %_ix_x_len371
+  %_ixa_gep379 = getelementptr inbounds i8, i8* %_ixa_gep376, i64 %_ix_x_len378
+  %_ixa_gep801 = getelementptr inbounds i8, i8* %_ixa_gep796, i64 %_ix_x_len420
+  %_ixa_gep833 = getelementptr inbounds i8, i8* %_ixa_gep828, i64 %_ix_x_len375
+  %_ixa_gep806 = getelementptr inbounds i8, i8* %_ixa_gep801, i64 %_ix_x_len378
+  %_ixa_gep810 = getelementptr inbounds i8, i8* %_ixa_gep806, i64 %_ix_x_len424
+  %_gepp = bitcast i8* %_ixa_gep379 to double*
+  %_gepp813 = bitcast i8* %_ixa_gep810 to double*
+  %_ind_val814 = load double, double* %_gepp813, align 8
+  %_ixa_gep837 = getelementptr inbounds i8, i8* %_ixa_gep833, i64 %_ix_x_len454
+  %_gepp840 = bitcast i8* %_ixa_gep837 to double*
+  %_ind_val841 = load double, double* %_gepp840, align 8
+  %_mult_tmp842 = fmul double %_ind_val814, %_ind_val841
+  store double %_mult_tmp842, double* %_gepp, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %wide.trip.count = zext i32 %0 to i64
+  %wide.trip.count305 = zext i32 %0 to i64
+  %wide.trip.count309 = zext i32 %1 to i64
+  %wide.trip.count314 = zext i32 %2 to i64
+  %wide.trip.count319 = zext i32 %3 to i64
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %m_loop, label %l_loop._label_9_crit_edge
+
+l_loop._label_9_crit_edge:                        ; preds = %m_loop
+  %indvars.iv.next304 = add nuw nsw i64 %indvars.iv303, 1
+  %exitcond306 = icmp ne i64 %indvars.iv.next304, %wide.trip.count305
+  br i1 %exitcond306, label %l_loop, label %i_loop._label_12_crit_edge.split
+
+i_loop._label_12_crit_edge.split:                 ; preds = %l_loop._label_9_crit_edge
+  %indvars.iv.next308 = add nuw nsw i64 %indvars.iv307, 1
+  %exitcond310 = icmp ne i64 %indvars.iv.next308, %wide.trip.count309
+  br i1 %exitcond310, label %i_loop, label %j_loop._label_15_crit_edge.split.split
+
+j_loop._label_15_crit_edge.split.split:           ; preds = %i_loop._label_12_crit_edge.split
+  %indvars.iv.next313 = add nuw nsw i64 %indvars.iv312, 1
+  %exitcond315 = icmp ne i64 %indvars.iv.next313, %wide.trip.count314
+  br i1 %exitcond315, label %j_loop, label %k_loop._label_18_crit_edge.split.split.split
+
+k_loop._label_18_crit_edge.split.split.split:     ; preds = %j_loop._label_15_crit_edge.split.split
+  %exitcond320 = icmp ne i64 %indvars.iv.next317, %wide.trip.count319
+  br i1 %exitcond320, label %k_loop, label %_return_bb.loopexit302
+
+_return_bb.loopexit:                              ; preds = %k_loop.us.preheader
+  br label %_return_bb
+
+_return_bb.loopexit300:                           ; preds = %k_loop.us291.preheader
+  br label %_return_bb
+
+_return_bb.loopexit301:                           ; preds = %k_loop.us294.preheader
+  br label %_return_bb
+
+_return_bb.loopexit302:                           ; preds = %k_loop._label_18_crit_edge.split.split.split
+  br label %_return_bb
+
+_return_bb:                                       ; preds = %_return_bb.loopexit302, %_return_bb.loopexit301, %_return_bb.loopexit300, %_return_bb.loopexit, %mat_times_vec_entry
+  ret void
+}
+
+

Added: llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll?rev=368439&view=auto
==============================================================================
--- llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll (added)
+++ llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll Fri Aug  9 06:56:29 2019
@@ -0,0 +1,77 @@
+; RUN: opt < %s -passes='print<loop-cache-cost>' -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; void foo(long n, long m, long o, int A[n][m][o]) {
+;   for (long i = 0; i < n; i++)
+;     for (long j = 0; j < m; j++)
+;       for (long k = 0; k < o; k++)
+;         A[2*i+3][3*j-4][2*k+7] = 1;
+; }
+
+; CHECK: Loop 'for.i' has cost = 1000000
+; CHECK-NEXT: Loop 'for.j' has cost = 1000000
+; CHECK-NEXT: Loop 'for.k' has cost = 60000
+
+define void @foo(i64 %n, i64 %m, i64 %o, i32* %A) {
+entry:
+  %cmp32 = icmp sgt i64 %n, 0
+  %cmp230 = icmp sgt i64 %m, 0
+  %cmp528 = icmp sgt i64 %o, 0
+  br i1 %cmp32, label %for.cond1.preheader.lr.ph, label %for.end
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  br i1 %cmp230, label %for.i.preheader, label %for.end
+
+for.i.preheader:                                  ; preds = %for.cond1.preheader.lr.ph
+  br i1 %cmp528, label %for.i.preheader.split, label %for.end
+
+for.i.preheader.split:                            ; preds = %for.i.preheader
+  br label %for.i
+
+for.i:                                            ; preds = %for.inci, %for.i.preheader.split
+  %i = phi i64 [ %inci, %for.inci ], [ 0, %for.i.preheader.split ]
+  %mul8 = shl i64 %i, 1
+  %add9 = add nsw i64 %mul8, 3
+  %0 = mul i64 %add9, %m
+  %sub = add i64 %0, -4
+  br label %for.j
+
+for.j:                                            ; preds = %for.incj, %for.i
+  %j = phi i64 [ %incj, %for.incj ], [ 0, %for.i ]
+  %mul7 = mul nsw i64 %j, 3
+  %tmp = add i64 %sub, %mul7
+  %tmp27 = mul i64 %tmp, %o
+  br label %for.k
+
+for.k:                                            ; preds = %for.k, %for.j
+  %k = phi i64 [ 0, %for.j ], [ %inck, %for.k ]
+
+  %mul = mul nsw i64 %k, 2
+  %arrayidx.sum = add i64 %mul, 7
+  %arrayidx10.sum = add i64 %arrayidx.sum, %tmp27
+  %arrayidx11 = getelementptr inbounds i32, i32* %A, i64 %arrayidx10.sum
+  store i32 1, i32* %arrayidx11, align 4
+
+  %inck = add nsw i64 %k, 1
+  %exitcond.us = icmp eq i64 %inck, %o
+  br i1 %exitcond.us, label %for.incj, label %for.k
+
+for.incj:                                         ; preds = %for.k
+  %incj = add nsw i64 %j, 1
+  %exitcond54.us = icmp eq i64 %incj, %m
+  br i1 %exitcond54.us, label %for.inci, label %for.j
+
+for.inci:                                         ; preds = %for.incj
+  %inci = add nsw i64 %i, 1
+  %exitcond55.us = icmp eq i64 %inci, %n
+  br i1 %exitcond55.us, label %for.end.loopexit, label %for.i
+
+for.end.loopexit:                                 ; preds = %for.inci
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %for.cond1.preheader.lr.ph, %entry
+  ret void
+}
+

Added: llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll?rev=368439&view=auto
==============================================================================
--- llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll (added)
+++ llvm/trunk/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll Fri Aug  9 06:56:29 2019
@@ -0,0 +1,98 @@
+; RUN: opt < %s -passes='print<loop-cache-cost>' -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; void foo(long n, long m, int A[n][m], int B[n][m], int C[n]) {
+;   for (long i = 0; i < n; i++)
+;     for (long j = 0; j < m; j++) {
+;         A[i][j] = A[i][j+1] + B[i-1][j] + B[i-1][j+1] + C[i];
+;         A[i][j] += B[i][i];
+;     }   
+; }
+
+; CHECK: Loop 'for.i' has cost = 20600
+; CHECK-NEXT: Loop 'for.j' has cost = 800
+
+define void @foo(i64 %n, i64 %m, i32* %A, i32* %B, i32* %C) {
+entry:
+  %cmp32 = icmp sgt i64 %n, 0
+  %cmp230 = icmp sgt i64 %m, 0
+  br i1 %cmp32, label %for.cond1.preheader.lr.ph, label %for.end
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  br i1 %cmp230, label %for.i.preheader, label %for.end
+
+for.i.preheader:                                  ; preds = %for.cond1.preheader.lr.ph
+  br label %for.i
+
+for.i:                                            ; preds = %for.inci, %for.i.preheader
+  %i = phi i64 [ %inci, %for.inci ], [ 0, %for.i.preheader ]
+  %subione = sub i64 %i, 1
+  %muli = mul i64 %i, %m
+  %muliminusone = mul i64 %subione, %m
+  br label %for.j
+
+for.j:                                            ; preds = %for.incj, %for.i
+  %j = phi i64 [ %incj, %for.incj ], [ 0, %for.i ]
+
+  ; B[i-1][j]
+  %arrayidx1 = add i64 %j, %muliminusone
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %arrayidx1
+  %elem_B1 = load i32, i32* %arrayidx2, align 4
+
+  ; B[i-1][j+1]
+  %addjone = add i64 %j, 1
+  %arrayidx3 = add i64 %addjone, %muliminusone
+  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %arrayidx3
+  %elem_B2 = load i32, i32* %arrayidx4, align 4
+
+  ; C[i]
+  %arrayidx6 = getelementptr inbounds i32, i32* %C, i64 %i
+  %elem_C = load i32, i32* %arrayidx6, align 4
+
+  ; A[i][j+1]
+  %arrayidx7 = add i64 %addjone, %muli
+  %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 %arrayidx7
+  %elem_A = load i32, i32* %arrayidx8, align 4
+
+  ; A[i][j] = A[i][j+1] + B[i-1][j] + B[i-1][j+1] + C[i]
+  %addB = add i32 %elem_B1, %elem_B2
+  %addC = add i32 %addB, %elem_C
+  ; Use the accumulated sum %addC so the B loads contribute to the stored
+  ; value (previously %addA added only %elem_C, leaving %addC dead; the
+  ; memory references — and hence the cache costs — are unchanged).
+  %addA = add i32 %elem_A, %addC
+  %arrayidx9 = add i64 %j, %muli
+  %arrayidx10 = getelementptr inbounds i32, i32* %A, i64 %arrayidx9
+  store i32 %addA, i32* %arrayidx10, align 4
+
+  ; A[i][j] += B[i][i];
+  %arrayidx11 = add i64 %j, %muli
+  %arrayidx12 = getelementptr inbounds i32, i32* %A, i64 %arrayidx11
+  %elem_A1 = load i32, i32* %arrayidx12, align 4
+  %arrayidx13 = add i64 %i, %muli
+  %arrayidx14 = getelementptr inbounds i32, i32* %B, i64 %arrayidx13
+  %elem_B3 = load i32, i32* %arrayidx14, align 4
+  %addA1 = add i32 %elem_A1, %elem_B3
+  store i32 %addA1, i32* %arrayidx12, align 4
+
+  br label %for.incj
+
+for.incj:                                         ; preds = %for.j
+  %incj = add nsw i64 %j, 1
+  %exitcond54.us = icmp eq i64 %incj, %m
+  br i1 %exitcond54.us, label %for.inci, label %for.j
+
+for.inci:                                         ; preds = %for.incj
+  %inci = add nsw i64 %i, 1
+  %exitcond55.us = icmp eq i64 %inci, %n
+  br i1 %exitcond55.us, label %for.end.loopexit, label %for.i
+
+for.end.loopexit:                                 ; preds = %for.inci
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %for.cond1.preheader.lr.ph, %entry
+  ret void
+}
+




More information about the llvm-commits mailing list