[polly] [Polly] Data flow reduction detection to cover more cases (PR #84901)

Karthika Devi C via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 12 05:10:17 PDT 2024


https://github.com/kartcq created https://github.com/llvm/llvm-project/pull/84901

The base concept is the same as the existing reduction algorithm, where we get a list of candidate pairs <store,load>. However, the existing algorithm works only if there is a single binary operation between the load and the store.
Example: sum += a[i];

This algorithm extends that to work with more than a single binary operation as well. It is implemented using data flow reduction detection at the basic block level. We propagate the loads, the number of times each load is used (i.e., flows into an instruction), and the binary operation performed, until we reach a store.

Example: sum += a[i] + b[i];
```
sum(Ld)     a[i](Ld)
      \  +  /
        tmp    b[i](Ld)
           \ + /
            sum(St)
```

In the above case, the candidate pairs are formed by associating sum with all of its load inputs, which are sum, a[i] and b[i]. The check functions are then used to filter these down to the valid reduction pair, i.e., {sum,sum}.

>From c2e28ecd88a68e47ac9c383c34e4e678124ba3da Mon Sep 17 00:00:00 2001
From: Karthika Devi C <quic_kartc at quicinc.com>
Date: Tue, 5 Mar 2024 03:54:34 -0800
Subject: [PATCH] [Polly] Data flow reduction detection to cover more cases

The base concept is the same as the existing reduction algorithm, where
we get a list of candidate pairs <store,load>. However, the existing
algorithm works only if there is a single binary operation between the
load and the store.
Example: sum += a[i];

This algorithm extends that to work with more than a single binary
operation as well. It is implemented using data flow reduction
detection at the basic block level. We propagate the loads, the number
of times each load is used (i.e., flows into an instruction), and the
binary operation performed, until we reach a store.

Example: sum += a[i] + b[i];
sum(Ld) a[i](Ld)
   \ + /
    tmp  b[i](Ld)
       \+/
       sum(St)

In the above case, the candidate pairs are formed by associating sum
with all of its load inputs, which are sum, a[i] and b[i]. The check
functions are then used to filter these down to the valid reduction
pair, i.e., {sum,sum}.
---
 polly/include/polly/ScopBuilder.h             |   2 +-
 polly/include/polly/ScopInfo.h                |   4 +-
 polly/lib/Analysis/ScopBuilder.cpp            | 272 +++++++++++++-----
 polly/lib/Analysis/ScopInfo.cpp               |  12 +-
 .../reduction_indirect_access.ll              |  39 +++
 .../reduction_escaping_intermediate_3.ll      |  43 +++
 .../ScopInfo/reduction_indirect_access.ll     |  42 +++
 .../ScopInfo/reduction_indirect_access_2.ll   |  50 ++++
 .../reduction_long_reduction_chain.ll         |  61 ++++
 ...duction_long_reduction_chain_double_use.ll |  58 ++++
 .../reduction_multiple_different_operators.ll |  37 +++
 11 files changed, 539 insertions(+), 81 deletions(-)
 create mode 100644 polly/test/DependenceInfo/reduction_indirect_access.ll
 create mode 100644 polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
 create mode 100644 polly/test/ScopInfo/reduction_indirect_access.ll
 create mode 100644 polly/test/ScopInfo/reduction_indirect_access_2.ll
 create mode 100644 polly/test/ScopInfo/reduction_long_reduction_chain.ll
 create mode 100644 polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
 create mode 100644 polly/test/ScopInfo/reduction_multiple_different_operators.ll

diff --git a/polly/include/polly/ScopBuilder.h b/polly/include/polly/ScopBuilder.h
index 635c23ca7f972c..7e7d328e78678e 100644
--- a/polly/include/polly/ScopBuilder.h
+++ b/polly/include/polly/ScopBuilder.h
@@ -602,7 +602,7 @@ class ScopBuilder final {
   /// results will escape during execution of the loop nest. We basically check
   /// here that no other memory access can access the same memory as the
   /// potential reduction.
-  void checkForReductions(ScopStmt &Stmt);
+  void checkForReductions(ScopStmt &Stmt, BasicBlock *Block);
 
   /// Verify that all required invariant loads have been hoisted.
   ///
diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h
index 1e0692ff40110e..974de817e72db8 100644
--- a/polly/include/polly/ScopInfo.h
+++ b/polly/include/polly/ScopInfo.h
@@ -470,6 +470,8 @@ class MemoryAccess final {
     RT_BOR,  ///< Bitwise Or
     RT_BXOR, ///< Bitwise XOr
     RT_BAND, ///< Bitwise And
+
+    RT_BOTTOM, ///< Pseudo type for the data flow analysis
   };
 
   using SubscriptsTy = SmallVector<const SCEV *, 4>;
@@ -1139,6 +1141,7 @@ class ScopStmt final {
   friend class ScopBuilder;
 
 public:
+  using MemoryAccessVec = llvm::SmallVector<MemoryAccess *, 8>;
   /// Create the ScopStmt from a BasicBlock.
   ScopStmt(Scop &parent, BasicBlock &bb, StringRef Name, Loop *SurroundingLoop,
            std::vector<Instruction *> Instructions);
@@ -1206,7 +1209,6 @@ class ScopStmt final {
   /// The memory accesses of this statement.
   ///
   /// The only side effects of a statement are its memory accesses.
-  using MemoryAccessVec = llvm::SmallVector<MemoryAccess *, 8>;
   MemoryAccessVec MemAccs;
 
   /// Mapping from instructions to (scalar) memory accesses.
diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp
index 0edc41d106415c..269e6f49e197ff 100644
--- a/polly/lib/Analysis/ScopBuilder.cpp
+++ b/polly/lib/Analysis/ScopBuilder.cpp
@@ -2480,8 +2480,8 @@ void ScopBuilder::collectSurroundingLoops(ScopStmt &Stmt) {
 }
 
 /// Return the reduction type for a given binary operator.
-static MemoryAccess::ReductionType getReductionType(const BinaryOperator *BinOp,
-                                                    const Instruction *Load) {
+static MemoryAccess::ReductionType
+getReductionType(const BinaryOperator *BinOp) {
   if (!BinOp)
     return MemoryAccess::RT_NONE;
   switch (BinOp->getOpcode()) {
@@ -2510,6 +2510,17 @@ static MemoryAccess::ReductionType getReductionType(const BinaryOperator *BinOp,
   }
 }
 
+/// @brief Combine two reduction types
+static MemoryAccess::ReductionType
+combineReductionType(MemoryAccess::ReductionType RT0,
+                     MemoryAccess::ReductionType RT1) {
+  if (RT0 == MemoryAccess::RT_BOTTOM)
+    return RT1;
+  if (RT0 == RT1)
+    return RT1;
+  return MemoryAccess::RT_NONE;
+}
+
 ///  True if @p AllAccs intersects with @p MemAccs execpt @p LoadMA and @p
 ///  StoreMA
 bool hasIntersectingAccesses(isl::set AllAccs, MemoryAccess *LoadMA,
@@ -2568,49 +2579,196 @@ bool checkCandidatePairAccesses(MemoryAccess *LoadMA, MemoryAccess *StoreMA,
     // Finally, check if they are no other instructions accessing this memory
     isl::map AllAccsRel = LoadAccs.unite(StoreAccs);
     AllAccsRel = AllAccsRel.intersect_domain(Domain);
+
     isl::set AllAccs = AllAccsRel.range();
+
     Valid = !hasIntersectingAccesses(AllAccs, LoadMA, StoreMA, Domain, MemAccs);
 
     LLVM_DEBUG(dbgs() << " == The accessed memory is " << (Valid ? "not " : "")
                       << "accessed by other instructions!\n");
   }
+
   return Valid;
 }
 
-void ScopBuilder::checkForReductions(ScopStmt &Stmt) {
-  SmallVector<MemoryAccess *, 2> Loads;
-  SmallVector<std::pair<MemoryAccess *, MemoryAccess *>, 4> Candidates;
+/// Perform a data flow analysis on the current basic block to propagate the
+/// uses of loaded values. Then check and mark the memory accesses which are
+/// part of reduction like chains.
+///
+/// NOTE: This assumes independent blocks and breaks otherwise.
+void ScopBuilder::checkForReductions(ScopStmt &Stmt, BasicBlock *Block) {
+  // During the data flow anaylis we use the State variable to keep track of
+  // the used "load-instructions" for each instruction in the basic block.
+  // This includes the LLVM-IR of the load and the "number of uses" (or the
+  // number of paths in the operand tree which end in this load).
+  using StatePairTy = std::pair<unsigned, MemoryAccess::ReductionType>;
+  using FlowInSetTy = MapVector<const LoadInst *, StatePairTy>;
+  using StateTy = MapVector<const Instruction *, FlowInSetTy>;
+  StateTy State;
+
+  // Invalid loads are loads which have uses we can't track properly in the
+  // state map. This includes loads which:
+  //   o do not form a reduction when they flow into a memory location:
+  //     (e.g., A[i] = B[i] * 3 and  A[i] = A[i] * A[i] + A[i])
+  //   o are used by a non binary operator or one which is not commutative
+  //     and associative (e.g., A[i] = A[i] % 3)
+  //   o might change the control flow            (e.g., if (A[i]))
+  //   o are used in indirect memory accesses     (e.g., A[B[i]])
+  //   o are used outside the current basic block
+  SmallPtrSet<const Instruction *, 8> InvalidLoads;
+
+  // Run the data flow analysis for all values in the basic block
+  for (Instruction &Inst : *Block) {
+    bool UsedOutsideBlock = any_of(Inst.users(), [Block](User *U) {
+      return cast<Instruction>(U)->getParent() != Block;
+    });
+
+    // Treat loads and stores special
+    if (auto *Load = dyn_cast<LoadInst>(&Inst)) {
+      // Invalidate all loads used which feed into the address of this load.
+      if (auto *Ptr = dyn_cast<Instruction>(Load->getPointerOperand())) {
+        const auto &It = State.find(Ptr);
+        if (It != State.end())
+          for (const auto &FlowInSetElem : It->second)
+            InvalidLoads.insert(FlowInSetElem.first);
+      }
+
+      // If this load is used outside this block, invalidate it.
+      if (UsedOutsideBlock)
+        InvalidLoads.insert(Load);
 
-  // First collect candidate load-store reduction chains by iterating over all
-  // stores and collecting possible reduction loads.
-  for (MemoryAccess *StoreMA : Stmt) {
-    if (StoreMA->isRead())
+      // And indicate that this load uses itself once but without specifying
+      // any reduction operator.
+      State[Load].insert(
+          std::make_pair(Load, std::make_pair(1, MemoryAccess::RT_BOTTOM)));
       continue;
+    }
 
-    Loads.clear();
-    collectCandidateReductionLoads(StoreMA, Loads);
-    for (MemoryAccess *LoadMA : Loads)
-      Candidates.push_back(std::make_pair(LoadMA, StoreMA));
-  }
+    if (auto *Store = dyn_cast<StoreInst>(&Inst)) {
+      // Invalidate all loads which feed into the address of this store.
+      if (const Instruction *Ptr =
+              dyn_cast<Instruction>(Store->getPointerOperand())) {
+        const auto &It = State.find(Ptr);
+        if (It != State.end())
+          for (const auto &FlowInSetElem : It->second)
+            InvalidLoads.insert(FlowInSetElem.first);
+      }
 
-  // Then check each possible candidate pair.
-  for (const auto &CandidatePair : Candidates) {
-    MemoryAccess *LoadMA = CandidatePair.first;
-    MemoryAccess *StoreMA = CandidatePair.second;
-    bool Valid = checkCandidatePairAccesses(LoadMA, StoreMA, Stmt.getDomain(),
-                                            Stmt.MemAccs);
-    if (!Valid)
+      // Propagate the uses of the value operand to the store
+      if (auto *ValueInst = dyn_cast<Instruction>(Store->getValueOperand()))
+        State.insert(std::make_pair(Store, State[ValueInst]));
       continue;
+    }
+
+    // Non load and store instructions are either binary operators or they will
+    // invalidate all used loads.
+    auto *BinOp = dyn_cast<BinaryOperator>(&Inst);
+    auto CurRedType = getReductionType(BinOp);
+    LLVM_DEBUG(dbgs() << "CurInst: " << Inst << " RT: " << CurRedType << "\n");
+
+    // Iterate over all operands and propagate their input loads to instruction.
+    FlowInSetTy &InstInFlowSet = State[&Inst];
+    for (Use &Op : Inst.operands()) {
+      auto *OpInst = dyn_cast<Instruction>(Op);
+      if (!OpInst)
+        continue;
+
+      LLVM_DEBUG(dbgs().indent(4) << "Op Inst: " << *OpInst << "\n");
+      const StateTy::iterator &OpInFlowSetIt = State.find(OpInst);
+      if (OpInFlowSetIt == State.end())
+        continue;
+
+      // Iterate over all the input loads of the operand and combine them
+      // with the input loads of current instruction.
+      FlowInSetTy &OpInFlowSet = OpInFlowSetIt->second;
+      for (auto &OpInFlowPair : OpInFlowSet) {
+        unsigned OpFlowIn = OpInFlowPair.second.first;
+        unsigned InstFlowIn = InstInFlowSet[OpInFlowPair.first].first;
+
+        auto OpRedType = OpInFlowPair.second.second;
+        auto InstRedType = InstInFlowSet[OpInFlowPair.first].second;
+
+        auto NewRedType = combineReductionType(OpRedType, CurRedType);
+        if (InstFlowIn)
+          NewRedType = combineReductionType(NewRedType, InstRedType);
+
+        LLVM_DEBUG(dbgs().indent(8) << "OpRedType: " << OpRedType << "\n");
+        LLVM_DEBUG(dbgs().indent(8) << "NewRedType: " << NewRedType << "\n");
+        InstInFlowSet[OpInFlowPair.first] =
+            std::make_pair(OpFlowIn + InstFlowIn, NewRedType);
+      }
+    }
+
+    // If this operation is used outside the block, invalidate all the loads
+    // which feed into it.
+    if (UsedOutsideBlock)
+      for (const auto &FlowInSetElem : InstInFlowSet)
+        InvalidLoads.insert(FlowInSetElem.first);
+  }
+
+  // All used loads are propagated through the whole basic block; now try to
+  // find valid reduction like candidate pairs. These load-store pairs fulfill
+  // all reduction like properties with regards to only this load-store chain.
+  // We later have to check if the loaded value was invalidated by an
+  // instruction not in that chain.
+  using MemAccPair = std::pair<MemoryAccess *, MemoryAccess *>;
+  DenseMap<MemAccPair, MemoryAccess::ReductionType> ValidCandidates;
+  DominatorTree *DT = Stmt.getParent()->getDT();
+
+  // Iterate over all write memory accesses and check the loads flowing into
+  // it for reduction candidate pairs.
+  for (MemoryAccess *WriteMA : Stmt.MemAccs) {
+    if (WriteMA->isRead())
+      continue;
+    StoreInst *St = dyn_cast<StoreInst>(WriteMA->getAccessInstruction());
+    if (!St || St->isVolatile())
+      continue;
+
+    FlowInSetTy &MaInFlowSet = State[WriteMA->getAccessInstruction()];
+    bool Valid = false;
+
+    for (auto &MaInFlowSetElem : MaInFlowSet) {
+      MemoryAccess *ReadMA = &Stmt.getArrayAccessFor(MaInFlowSetElem.first);
+      assert(ReadMA && "Couldn't find memory access for incoming load!");
 
-    const LoadInst *Load =
-        dyn_cast<const LoadInst>(CandidatePair.first->getAccessInstruction());
-    MemoryAccess::ReductionType RT =
-        getReductionType(dyn_cast<BinaryOperator>(Load->user_back()), Load);
+      LLVM_DEBUG(dbgs() << "'" << *ReadMA->getAccessInstruction()
+                        << "'\n\tflows into\n'"
+                        << *WriteMA->getAccessInstruction() << "'\n\t #"
+                        << MaInFlowSetElem.second.first << " times & RT: "
+                        << MaInFlowSetElem.second.second << "\n");
 
-    // If no overlapping access was found we mark the load and store as
-    // reduction like.
-    LoadMA->markAsReductionLike(RT);
-    StoreMA->markAsReductionLike(RT);
+      MemoryAccess::ReductionType RT = MaInFlowSetElem.second.second;
+      unsigned NumAllowableInFlow = 1;
+
+      // We allow the load to flow in exactly once for binary reductions
+      Valid = (MaInFlowSetElem.second.first == NumAllowableInFlow);
+
+      // Check if we saw a valid chain of binary operators.
+      Valid = Valid && RT != MemoryAccess::RT_BOTTOM;
+      Valid = Valid && RT != MemoryAccess::RT_NONE;
+
+      // Then check if the memory accesses allow a reduction.
+      Valid = Valid && checkCandidatePairAccesses(
+                           ReadMA, WriteMA, Stmt.getDomain(), Stmt.MemAccs);
+
+      // Finally, mark the pair as a candidate or the load as a invalid one.
+      if (Valid)
+        ValidCandidates[std::make_pair(ReadMA, WriteMA)] = RT;
+      else
+        InvalidLoads.insert(ReadMA->getAccessInstruction());
+    }
+  }
+
+  // In the last step mark the memory accesses of candidate pairs as reduction
+  // like if the load wasn't marked invalid in the previous step.
+  for (auto &CandidatePair : ValidCandidates) {
+    MemoryAccess *LoadMA = CandidatePair.first.first;
+    if (InvalidLoads.count(LoadMA->getAccessInstruction()))
+      continue;
+
+    MemoryAccess::ReductionType RT = CandidatePair.second;
+    CandidatePair.first.first->markAsReductionLike(RT);
+    CandidatePair.first.second->markAsReductionLike(RT);
   }
 }
 
@@ -2963,52 +3121,6 @@ void ScopBuilder::addInvariantLoads(ScopStmt &Stmt,
   }
 }
 
-void ScopBuilder::collectCandidateReductionLoads(
-    MemoryAccess *StoreMA, SmallVectorImpl<MemoryAccess *> &Loads) {
-  ScopStmt *Stmt = StoreMA->getStatement();
-
-  auto *Store = dyn_cast<StoreInst>(StoreMA->getAccessInstruction());
-  if (!Store)
-    return;
-
-  // Skip if there is not one binary operator between the load and the store
-  auto *BinOp = dyn_cast<BinaryOperator>(Store->getValueOperand());
-  if (!BinOp)
-    return;
-
-  // Skip if the binary operators has multiple uses
-  if (BinOp->getNumUses() != 1)
-    return;
-
-  // Skip if the opcode of the binary operator is not commutative/associative
-  if (!BinOp->isCommutative() || !BinOp->isAssociative())
-    return;
-
-  // Skip if the binary operator is outside the current SCoP
-  if (BinOp->getParent() != Store->getParent())
-    return;
-
-  // Skip if it is a multiplicative reduction and we disabled them
-  if (DisableMultiplicativeReductions &&
-      (BinOp->getOpcode() == Instruction::Mul ||
-       BinOp->getOpcode() == Instruction::FMul))
-    return;
-
-  // Check the binary operator operands for a candidate load
-  auto *PossibleLoad0 = dyn_cast<LoadInst>(BinOp->getOperand(0));
-  auto *PossibleLoad1 = dyn_cast<LoadInst>(BinOp->getOperand(1));
-  if (!PossibleLoad0 && !PossibleLoad1)
-    return;
-
-  // A load is only a candidate if it cannot escape (thus has only this use)
-  if (PossibleLoad0 && PossibleLoad0->getNumUses() == 1)
-    if (PossibleLoad0->getParent() == Store->getParent())
-      Loads.push_back(&Stmt->getArrayAccessFor(PossibleLoad0));
-  if (PossibleLoad1 && PossibleLoad1->getNumUses() == 1)
-    if (PossibleLoad1->getParent() == Store->getParent())
-      Loads.push_back(&Stmt->getArrayAccessFor(PossibleLoad1));
-}
-
 /// Find the canonical scop array info object for a set of invariant load
 /// hoisted loads. The canonical array is the one that corresponds to the
 /// first load in the list of accesses which is used as base pointer of a
@@ -3593,8 +3705,14 @@ void ScopBuilder::buildScop(Region &R, AssumptionCache &AC) {
     buildDomain(Stmt);
     buildAccessRelations(Stmt);
 
-    if (DetectReductions)
-      checkForReductions(Stmt);
+    if (DetectReductions) {
+      BasicBlock *BB = Stmt.getBasicBlock();
+      if (BB)
+        checkForReductions(Stmt, BB);
+      else
+        for (BasicBlock *Block : Stmt.getRegion()->blocks())
+          checkForReductions(Stmt, Block);
+    }
   }
 
   // Check early for a feasible runtime context.
diff --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp
index 3e78cc8937fbf0..4f71e5f72c8c85 100644
--- a/polly/lib/Analysis/ScopInfo.cpp
+++ b/polly/lib/Analysis/ScopInfo.cpp
@@ -532,6 +532,9 @@ MemoryAccess::getReductionOperatorStr(MemoryAccess::ReductionType RT) {
   case MemoryAccess::RT_NONE:
     llvm_unreachable("Requested a reduction operator string for a memory "
                      "access which isn't a reduction");
+  case MemoryAccess::RT_BOTTOM:
+    llvm_unreachable("Requested a reduction operator string for a internal "
+                     "reduction type!");
   case MemoryAccess::RT_ADD:
     return "+";
   case MemoryAccess::RT_MUL:
@@ -914,10 +917,15 @@ isl::id MemoryAccess::getId() const { return Id; }
 
 raw_ostream &polly::operator<<(raw_ostream &OS,
                                MemoryAccess::ReductionType RT) {
-  if (RT == MemoryAccess::RT_NONE)
+  switch (RT) {
+  case MemoryAccess::RT_NONE:
+  case MemoryAccess::RT_BOTTOM:
     OS << "NONE";
-  else
+    break;
+  default:
     OS << MemoryAccess::getReductionOperatorStr(RT);
+    break;
+  }
   return OS;
 }
 
diff --git a/polly/test/DependenceInfo/reduction_indirect_access.ll b/polly/test/DependenceInfo/reduction_indirect_access.ll
new file mode 100644
index 00000000000000..aa0b92336c86d1
--- /dev/null
+++ b/polly/test/DependenceInfo/reduction_indirect_access.ll
@@ -0,0 +1,39 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-dependences -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+;
+; CHECK: Reduction dependences:
+; CHECK:   [N] -> { Stmt_for_body[i0] -> Stmt_for_body[1 + i0] : 0 <= i0 <= -2 + N }
+;
+;    void f(double *restrict A, int *restrict INDICES, int N) {
+;      for (int i = 0; i < N; i++)
+;        A[INDICES[i]] += N;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(ptr noalias %A, ptr noalias %INDICES, i32 %N) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %N
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %conv = sitofp i32 %N to double
+  %arrayidx = getelementptr inbounds ptr, ptr %INDICES, i32 %i.0
+  %tmp = load i32, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds ptr, ptr %A, i32 %tmp
+  %tmp1 = load double, ptr %arrayidx1, align 8
+  %add = fadd fast double %tmp1, %conv
+  store double %add, double* %arrayidx1, align 8
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll b/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
new file mode 100644
index 00000000000000..92a071ea1c372b
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
@@ -0,0 +1,43 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+;
+; void f(int N, int * restrict sums, int * restrict escape) {
+;   int i, j;
+;   for (i = 0; i < 1024; i++) {
+;     sums[i] += 5;
+;     escape[i] = sums[i];
+;   }
+; }
+;
+; CHECK: Reduction Type: NONE
+; CHECK: sums
+; CHECK: Reduction Type: NONE
+; CHECK: sums
+; CHECK: Reduction Type: NONE
+; CHECK: escape
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(i32 %N, i32* noalias %sums, i32* noalias %escape) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc8, %for.inc ]
+  %exitcond1 = icmp ne i32 %i.0, 1024
+  br i1 %exitcond1, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %arrayidx = getelementptr inbounds i32, i32* %sums, i32 0
+  %tmp = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %tmp, 5
+  store i32 %add, i32* %arrayidx, align 4
+  %arrayidx6 = getelementptr inbounds i32, i32* %escape, i32 %i.0
+  store i32 %add, i32* %arrayidx6, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc8 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
diff --git a/polly/test/ScopInfo/reduction_indirect_access.ll b/polly/test/ScopInfo/reduction_indirect_access.ll
new file mode 100644
index 00000000000000..7acac4b150f40e
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_indirect_access.ll
@@ -0,0 +1,42 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+;
+; CHECK: Reduction Type: NONE
+; CHECK: MemRef_INDICES[i0]
+; CHECK: Reduction Type: +
+; CHECK: MemRef_A[o0]
+; CHECK: Reduction Type: +
+; CHECK: MemRef_A[o0]
+;
+;    void f(double *restrict A, int *restrict INDICES, int N) {
+;      for (int i = 0; i < N; i++)
+;        A[INDICES[i]] += N;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(double* noalias %A, i32* noalias %INDICES, i32 %N) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %N
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %conv = sitofp i32 %N to double
+  %arrayidx = getelementptr inbounds i32, i32* %INDICES, i32 %i.0
+  %tmp = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds double, double* %A, i32 %tmp
+  %tmp1 = load double, double* %arrayidx1, align 8
+  %add = fadd fast double %tmp1, %conv
+  store double %add, double* %arrayidx1, align 8
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
diff --git a/polly/test/ScopInfo/reduction_indirect_access_2.ll b/polly/test/ScopInfo/reduction_indirect_access_2.ll
new file mode 100644
index 00000000000000..331953991d86cd
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_indirect_access_2.ll
@@ -0,0 +1,50 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+;
+; Validate that the accesses to INDICES[i] is not part of a reduction.
+;
+; CHECK: Reduction Type: NONE
+; CHECK: MemRef_INDICES[i0]
+; CHECK: Reduction Type: +
+; CHECK: MemRef_A[o0]
+; CHECK: Reduction Type: +
+; CHECK: MemRef_A[o0]
+; CHECK: Reduction Type: NONE
+; CHECK: MemRef_INDICES[i0]
+;
+;    void f(double *restrict A, int *restrict INDICES, int N) {
+;      for (int i = 0; i < N; i++) {
+;        A[INDICES[i]] += N;
+;        INDICES[i] += N;
+;      }
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(double* noalias %A, i32* noalias %INDICES, i32 %N) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %N
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %conv = sitofp i32 %N to double
+  %arrayidx = getelementptr inbounds i32, i32* %INDICES, i32 %i.0
+  %tmp = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds double, double* %A, i32 %tmp
+  %tmp1 = load double, double* %arrayidx1, align 8
+  %add = fadd fast double %tmp1, %conv
+  store double %add, double* %arrayidx1, align 8
+  %add3 = add nsw i32 %tmp, %N
+  store i32 %add3, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
diff --git a/polly/test/ScopInfo/reduction_long_reduction_chain.ll b/polly/test/ScopInfo/reduction_long_reduction_chain.ll
new file mode 100644
index 00000000000000..62ae1fef187b63
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_long_reduction_chain.ll
@@ -0,0 +1,61 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+;
+; CHECK: Reduction Type: +
+; CHECK: MemRef_sum
+; CHECK: Reduction Type: NONE
+; CHECK: MemRef_A
+; CHECK: Reduction Type: +
+; CHECK: MemRef_sum
+; CHECK-NOT: MemRef_A
+;
+;    void f(int *restrict sum, int *restrict A) {
+;      for (int i = 0; i < 1024; i++)
+;        *sum = (A[i + 3] * (i - 14)) + ((A[i] + *sum + A[0]) + A[1023]) +
+;               (A[i + 2] * A[i - 1]);
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(i32* noalias %sum, i32* noalias %A) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %i.0, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %i.0, 3
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+  %tmp = load i32, i32* %arrayidx, align 4
+  %sub = add nsw i32 %i.0, -14
+  %mul = mul nsw i32 %tmp, %sub
+  %arrayidx1 = getelementptr inbounds i32, i32* %A, i32 %i.0
+  %tmp1 = load i32, i32* %arrayidx1, align 4
+  %tmp2 = load i32, i32* %sum, align 4
+  %add2 = add nsw i32 %tmp1, %tmp2
+  %tmp3 = load i32, i32* %A, align 4
+  %add4 = add nsw i32 %add2, %tmp3
+  %arrayidx5 = getelementptr inbounds i32, i32* %A, i32 1023
+  %tmp4 = load i32, i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add4, %tmp4
+  %add7 = add nsw i32 %mul, %add6
+  %add8 = add nsw i32 %i.0, 2
+  %arrayidx9 = getelementptr inbounds i32, i32* %A, i32 %add8
+  %tmp5 = load i32, i32* %arrayidx9, align 4
+  %sub10 = add nsw i32 %i.0, -1
+  %arrayidx11 = getelementptr inbounds i32, i32* %A, i32 %sub10
+  %tmp6 = load i32, i32* %arrayidx11, align 4
+  %mul12 = mul nsw i32 %tmp5, %tmp6
+  %add13 = add nsw i32 %add7, %mul12
+  store i32 %add13, i32* %sum, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
diff --git a/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll b/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
new file mode 100644
index 00000000000000..7ca46fa9535ac0
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
@@ -0,0 +1,58 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+;
+; Sum is added twice in the statement. Hence no reduction.
+; CHECK: Reduction Type: NONE
+;
+;    void f(int *restrict sum, int *restrict A) {
+;      for (int i = 0; i < 1024; i++)
+;        *sum = (A[i + 3] * (i - 14)) + ((A[i] + *sum + A[0]) + A[1023]) +
+;               (A[i + 2] * A[i - 1]) + *sum;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(i32* noalias %sum, i32* noalias %A) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %i.0, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %i.0, 3
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+  %tmp = load i32, i32* %arrayidx, align 4
+  %sub = add nsw i32 %i.0, -14
+  %mul = mul nsw i32 %tmp, %sub
+  %arrayidx1 = getelementptr inbounds i32, i32* %A, i32 %i.0
+  %tmp1 = load i32, i32* %arrayidx1, align 4
+  %tmp2 = load i32, i32* %sum, align 4
+  %add2 = add nsw i32 %tmp1, %tmp2
+  %tmp3 = load i32, i32* %A, align 4
+  %add4 = add nsw i32 %add2, %tmp3
+  %arrayidx5 = getelementptr inbounds i32, i32* %A, i32 1023
+  %tmp4 = load i32, i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add4, %tmp4
+  %add7 = add nsw i32 %mul, %add6
+  %add8 = add nsw i32 %i.0, 2
+  %arrayidx9 = getelementptr inbounds i32, i32* %A, i32 %add8
+  %tmp5 = load i32, i32* %arrayidx9, align 4
+  %sub10 = add nsw i32 %i.0, -1
+  %arrayidx11 = getelementptr inbounds i32, i32* %A, i32 %sub10
+  %tmp6 = load i32, i32* %arrayidx11, align 4
+  %mul12 = mul nsw i32 %tmp5, %tmp6
+  %add13 = add nsw i32 %add7, %mul12
+  %tmp7 = load i32, i32* %sum, align 4
+  %add14 = add nsw i32 %add13, %tmp7
+  store i32 %add14, i32* %sum, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
diff --git a/polly/test/ScopInfo/reduction_multiple_different_operators.ll b/polly/test/ScopInfo/reduction_multiple_different_operators.ll
new file mode 100644
index 00000000000000..b77c72a291744d
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_multiple_different_operators.ll
@@ -0,0 +1,37 @@
+; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+;
+; Should not be identified as reduction as there are different operations
+; involved on sum (multiplication followed by addition)
+; CHECK: Reduction Type: NONE
+;
+;    void f(int *restrict sum) {
+;      for (int i = 0; i < 1024; i++) {
+;        *sum = (*sum * 5) + 25;
+;      }
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(i32* noalias %sum) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %i.0, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %tmp = load i32, i32* %sum, align 4
+  %tmp1 = mul i32 %tmp, 5
+  %mul = add i32 %tmp1, 25
+  store i32 %mul, i32* %sum, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}



More information about the llvm-commits mailing list