[polly] 1e5334b - [Polly] Data flow reduction detection to cover more cases (#84901)

via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 30 09:43:27 PDT 2024


Author: Karthika Devi C
Date: 2024-07-30T09:43:24-07:00
New Revision: 1e5334bcdae462e47d8516464b2cca1674ea899d

URL: https://github.com/llvm/llvm-project/commit/1e5334bcdae462e47d8516464b2cca1674ea899d
DIFF: https://github.com/llvm/llvm-project/commit/1e5334bcdae462e47d8516464b2cca1674ea899d.diff

LOG: [Polly] Data flow reduction detection to cover more cases (#84901)

The base concept is the same as in the existing reduction algorithm,
where we collect a list of candidate pairs <store,load>. However, the
existing algorithm works only if there is a single binary operation
between the load and the store.
Example: sum += a[i];

This algorithm extends detection to chains with more than a single
binary operation. It is implemented as a data flow reduction detection
at the basic block level. We propagate the loads, the number of times
each load is used (i.e. flows into an instruction) and the binary
operations performed, until we reach a store.

Example: sum += a[i] + b[i];
```
sum(Ld)     a[i](Ld)
      \  +  /
        tmp    b[i](Ld)
           \ + /
            sum(St)
```

In the above case, the candidate pairs are formed by associating the
store to sum with all of its load inputs, which are sum, a[i] and b[i].
The check functions are then used to filter out the one valid reduction
pair, i.e. {sum,sum}.

---------

Co-authored-by: Michael Kruse <github at meinersbur.de>

Added: 
    polly/test/DependenceInfo/reduction_indirect_access.ll
    polly/test/ScopInfo/reduction_double.ll
    polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
    polly/test/ScopInfo/reduction_if.ll
    polly/test/ScopInfo/reduction_indirect_access.ll
    polly/test/ScopInfo/reduction_indirect_access_2.ll
    polly/test/ScopInfo/reduction_long_reduction_chain.ll
    polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
    polly/test/ScopInfo/reduction_multiple_different_operators.ll

Modified: 
    polly/include/polly/ScopBuilder.h
    polly/include/polly/ScopInfo.h
    polly/lib/Analysis/ScopBuilder.cpp
    polly/lib/Analysis/ScopInfo.cpp

Removed: 
    


################################################################################
diff  --git a/polly/include/polly/ScopBuilder.h b/polly/include/polly/ScopBuilder.h
index 635c23ca7f972..e589a7f0b05d6 100644
--- a/polly/include/polly/ScopBuilder.h
+++ b/polly/include/polly/ScopBuilder.h
@@ -663,19 +663,6 @@ class ScopBuilder final {
   ///         nullptr if it cannot be hoisted at all.
   isl::set getNonHoistableCtx(MemoryAccess *Access, isl::union_map Writes);
 
-  /// Collect loads which might form a reduction chain with @p StoreMA.
-  ///
-  /// Check if the stored value for @p StoreMA is a binary operator with one or
-  /// two loads as operands. If the binary operand is commutative & associative,
-  /// used only once (by @p StoreMA) and its load operands are also used only
-  /// once, we have found a possible reduction chain. It starts at an operand
-  /// load and includes the binary operator and @p StoreMA.
-  ///
-  /// Note: We allow only one use to ensure the load and binary operator cannot
-  ///       escape this block or into any other store except @p StoreMA.
-  void collectCandidateReductionLoads(MemoryAccess *StoreMA,
-                                      SmallVectorImpl<MemoryAccess *> &Loads);
-
   /// Build the access relation of all memory accesses of @p Stmt.
   void buildAccessRelations(ScopStmt &Stmt);
 

diff  --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h
index 1e0692ff40110..974de817e72db 100644
--- a/polly/include/polly/ScopInfo.h
+++ b/polly/include/polly/ScopInfo.h
@@ -470,6 +470,8 @@ class MemoryAccess final {
     RT_BOR,  ///< Bitwise Or
     RT_BXOR, ///< Bitwise XOr
     RT_BAND, ///< Bitwise And
+
+    RT_BOTTOM, ///< Pseudo type for the data flow analysis
   };
 
   using SubscriptsTy = SmallVector<const SCEV *, 4>;
@@ -1139,6 +1141,7 @@ class ScopStmt final {
   friend class ScopBuilder;
 
 public:
+  using MemoryAccessVec = llvm::SmallVector<MemoryAccess *, 8>;
   /// Create the ScopStmt from a BasicBlock.
   ScopStmt(Scop &parent, BasicBlock &bb, StringRef Name, Loop *SurroundingLoop,
            std::vector<Instruction *> Instructions);
@@ -1206,7 +1209,6 @@ class ScopStmt final {
   /// The memory accesses of this statement.
   ///
   /// The only side effects of a statement are its memory accesses.
-  using MemoryAccessVec = llvm::SmallVector<MemoryAccess *, 8>;
   MemoryAccessVec MemAccs;
 
   /// Mapping from instructions to (scalar) memory accesses.

diff  --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp
index 0b9a1a916e1c1..dfdfd415b8ce2 100644
--- a/polly/lib/Analysis/ScopBuilder.cpp
+++ b/polly/lib/Analysis/ScopBuilder.cpp
@@ -2481,8 +2481,8 @@ void ScopBuilder::collectSurroundingLoops(ScopStmt &Stmt) {
 }
 
 /// Return the reduction type for a given binary operator.
-static MemoryAccess::ReductionType getReductionType(const BinaryOperator *BinOp,
-                                                    const Instruction *Load) {
+static MemoryAccess::ReductionType
+getReductionType(const BinaryOperator *BinOp) {
   if (!BinOp)
     return MemoryAccess::RT_NONE;
   switch (BinOp->getOpcode()) {
@@ -2511,6 +2511,17 @@ static MemoryAccess::ReductionType getReductionType(const BinaryOperator *BinOp,
   }
 }
 
+/// @brief Combine two reduction types
+static MemoryAccess::ReductionType
+combineReductionType(MemoryAccess::ReductionType RT0,
+                     MemoryAccess::ReductionType RT1) {
+  if (RT0 == MemoryAccess::RT_BOTTOM)
+    return RT1;
+  if (RT0 == RT1)
+    return RT1;
+  return MemoryAccess::RT_NONE;
+}
+
 ///  True if @p AllAccs intersects with @p MemAccs execpt @p LoadMA and @p
 ///  StoreMA
 bool hasIntersectingAccesses(isl::set AllAccs, MemoryAccess *LoadMA,
@@ -2571,47 +2582,206 @@ bool checkCandidatePairAccesses(MemoryAccess *LoadMA, MemoryAccess *StoreMA,
     AllAccsRel = AllAccsRel.intersect_domain(Domain);
     isl::set AllAccs = AllAccsRel.range();
     Valid = !hasIntersectingAccesses(AllAccs, LoadMA, StoreMA, Domain, MemAccs);
-
     POLLY_DEBUG(dbgs() << " == The accessed memory is " << (Valid ? "not " : "")
                        << "accessed by other instructions!\n");
   }
+
   return Valid;
 }
 
 void ScopBuilder::checkForReductions(ScopStmt &Stmt) {
-  SmallVector<MemoryAccess *, 2> Loads;
-  SmallVector<std::pair<MemoryAccess *, MemoryAccess *>, 4> Candidates;
+  // Perform a data flow analysis on the current scop statement to propagate the
+  // uses of loaded values. Then check and mark the memory accesses which are
+  // part of reduction like chains.
+  // During the data flow analysis we use the State variable to keep track of
+  // the used "load-instructions" for each instruction in the scop statement.
+  // This includes the LLVM-IR of the load and the "number of uses" (or the
+  // number of paths in the operand tree which end in this load).
+  using StatePairTy = std::pair<unsigned, MemoryAccess::ReductionType>;
+  using FlowInSetTy = MapVector<const LoadInst *, StatePairTy>;
+  using StateTy = MapVector<const Instruction *, FlowInSetTy>;
+  StateTy State;
+
+  // Invalid loads are loads which have uses we can't track properly in the
+  // state map. This includes loads which:
+  //   o do not form a reduction when they flow into a memory location:
+  //     (e.g., A[i] = B[i] * 3 and  A[i] = A[i] * A[i] + A[i])
+  //   o are used by a non binary operator or one which is not commutative
+  //     and associative (e.g., A[i] = A[i] % 3)
+  //   o might change the control flow            (e.g., if (A[i]))
+  //   o are used in indirect memory accesses     (e.g., A[B[i]])
+  //   o are used outside the current scop statement
+  SmallPtrSet<const Instruction *, 8> InvalidLoads;
+  SmallVector<BasicBlock *, 8> ScopBlocks;
+  BasicBlock *BB = Stmt.getBasicBlock();
+  if (BB)
+    ScopBlocks.push_back(BB);
+  else
+    for (BasicBlock *Block : Stmt.getRegion()->blocks())
+      ScopBlocks.push_back(Block);
+  // Run the data flow analysis for all values in the scop statement
+  for (BasicBlock *Block : ScopBlocks) {
+    for (Instruction &Inst : *Block) {
+      if ((Stmt.getParent())->getStmtFor(&Inst) != &Stmt)
+        continue;
+      bool UsedOutsideStmt = any_of(Inst.users(), [&Stmt](User *U) {
+        return (Stmt.getParent())->getStmtFor(cast<Instruction>(U)) != &Stmt;
+      });
+      //  Treat loads and stores special
+      if (auto *Load = dyn_cast<LoadInst>(&Inst)) {
+        // Invalidate all loads used which feed into the address of this load.
+        if (auto *Ptr = dyn_cast<Instruction>(Load->getPointerOperand())) {
+          const auto &It = State.find(Ptr);
+          if (It != State.end())
+            for (const auto &FlowInSetElem : It->second)
+              InvalidLoads.insert(FlowInSetElem.first);
+        }
 
-  // First collect candidate load-store reduction chains by iterating over all
-  // stores and collecting possible reduction loads.
-  for (MemoryAccess *StoreMA : Stmt) {
-    if (StoreMA->isRead())
-      continue;
+        // If this load is used outside this stmt, invalidate it.
+        if (UsedOutsideStmt)
+          InvalidLoads.insert(Load);
+
+        // And indicate that this load uses itself once but without specifying
+        // any reduction operator.
+        State[Load].insert(
+            std::make_pair(Load, std::make_pair(1, MemoryAccess::RT_BOTTOM)));
+        continue;
+      }
+
+      if (auto *Store = dyn_cast<StoreInst>(&Inst)) {
+        // Invalidate all loads which feed into the address of this store.
+        if (const Instruction *Ptr =
+                dyn_cast<Instruction>(Store->getPointerOperand())) {
+          const auto &It = State.find(Ptr);
+          if (It != State.end())
+            for (const auto &FlowInSetElem : It->second)
+              InvalidLoads.insert(FlowInSetElem.first);
+        }
+
+        // Propagate the uses of the value operand to the store
+        if (auto *ValueInst = dyn_cast<Instruction>(Store->getValueOperand()))
+          State.insert(std::make_pair(Store, State[ValueInst]));
+        continue;
+      }
+
+      // Non load and store instructions are either binary operators or they
+      // will invalidate all used loads.
+      auto *BinOp = dyn_cast<BinaryOperator>(&Inst);
+      MemoryAccess::ReductionType CurRedType = getReductionType(BinOp);
+      POLLY_DEBUG(dbgs() << "CurInst: " << Inst << " RT: " << CurRedType
+                         << "\n");
+
+      // Iterate over all operands and propagate their input loads to
+      // instruction.
+      FlowInSetTy &InstInFlowSet = State[&Inst];
+      for (Use &Op : Inst.operands()) {
+        auto *OpInst = dyn_cast<Instruction>(Op);
+        if (!OpInst)
+          continue;
+
+        POLLY_DEBUG(dbgs().indent(4) << "Op Inst: " << *OpInst << "\n");
+        const StateTy::iterator &OpInFlowSetIt = State.find(OpInst);
+        if (OpInFlowSetIt == State.end())
+          continue;
+
+        // Iterate over all the input loads of the operand and combine them
+        // with the input loads of current instruction.
+        FlowInSetTy &OpInFlowSet = OpInFlowSetIt->second;
+        for (auto &OpInFlowPair : OpInFlowSet) {
+          unsigned OpFlowIn = OpInFlowPair.second.first;
+          unsigned InstFlowIn = InstInFlowSet[OpInFlowPair.first].first;
+
+          MemoryAccess::ReductionType OpRedType = OpInFlowPair.second.second;
+          MemoryAccess::ReductionType InstRedType =
+              InstInFlowSet[OpInFlowPair.first].second;
+
+          MemoryAccess::ReductionType NewRedType =
+              combineReductionType(OpRedType, CurRedType);
+          if (InstFlowIn)
+            NewRedType = combineReductionType(NewRedType, InstRedType);
+
+          POLLY_DEBUG(dbgs().indent(8) << "OpRedType: " << OpRedType << "\n");
+          POLLY_DEBUG(dbgs().indent(8) << "NewRedType: " << NewRedType << "\n");
+          InstInFlowSet[OpInFlowPair.first] =
+              std::make_pair(OpFlowIn + InstFlowIn, NewRedType);
+        }
+      }
 
-    Loads.clear();
-    collectCandidateReductionLoads(StoreMA, Loads);
-    for (MemoryAccess *LoadMA : Loads)
-      Candidates.push_back(std::make_pair(LoadMA, StoreMA));
+      // If this operation is used outside the stmt, invalidate all the loads
+      // which feed into it.
+      if (UsedOutsideStmt)
+        for (const auto &FlowInSetElem : InstInFlowSet)
+          InvalidLoads.insert(FlowInSetElem.first);
+    }
   }
 
-  // Then check each possible candidate pair.
-  for (const auto &CandidatePair : Candidates) {
-    MemoryAccess *LoadMA = CandidatePair.first;
-    MemoryAccess *StoreMA = CandidatePair.second;
-    bool Valid = checkCandidatePairAccesses(LoadMA, StoreMA, Stmt.getDomain(),
-                                            Stmt.MemAccs);
-    if (!Valid)
+  // All used loads are propagated through the whole basic block; now try to
+  // find valid reduction-like candidate pairs. These load-store pairs fulfill
+  // all reduction like properties with regards to only this load-store chain.
+  // We later have to check if the loaded value was invalidated by an
+  // instruction not in that chain.
+  using MemAccPair = std::pair<MemoryAccess *, MemoryAccess *>;
+  DenseMap<MemAccPair, MemoryAccess::ReductionType> ValidCandidates;
+  DominatorTree *DT = Stmt.getParent()->getDT();
+
+  // Iterate over all write memory accesses and check the loads flowing into
+  // it for reduction candidate pairs.
+  for (MemoryAccess *WriteMA : Stmt.MemAccs) {
+    if (WriteMA->isRead())
+      continue;
+    StoreInst *St = dyn_cast<StoreInst>(WriteMA->getAccessInstruction());
+    if (!St)
       continue;
+    assert(!St->isVolatile());
+
+    FlowInSetTy &MaInFlowSet = State[WriteMA->getAccessInstruction()];
+    for (auto &MaInFlowSetElem : MaInFlowSet) {
+      MemoryAccess *ReadMA = &Stmt.getArrayAccessFor(MaInFlowSetElem.first);
+      assert(ReadMA && "Couldn't find memory access for incoming load!");
 
-    const LoadInst *Load =
-        dyn_cast<const LoadInst>(CandidatePair.first->getAccessInstruction());
-    MemoryAccess::ReductionType RT =
-        getReductionType(dyn_cast<BinaryOperator>(Load->user_back()), Load);
+      POLLY_DEBUG(dbgs() << "'" << *ReadMA->getAccessInstruction()
+                         << "'\n\tflows into\n'"
+                         << *WriteMA->getAccessInstruction() << "'\n\t #"
+                         << MaInFlowSetElem.second.first << " times & RT: "
+                         << MaInFlowSetElem.second.second << "\n");
 
-    // If no overlapping access was found we mark the load and store as
-    // reduction like.
-    LoadMA->markAsReductionLike(RT);
-    StoreMA->markAsReductionLike(RT);
+      MemoryAccess::ReductionType RT = MaInFlowSetElem.second.second;
+      unsigned NumAllowableInFlow = 1;
+
+      // We allow the load to flow in exactly once for binary reductions
+      bool Valid = (MaInFlowSetElem.second.first == NumAllowableInFlow);
+
+      // Check if we saw a valid chain of binary operators.
+      Valid = Valid && RT != MemoryAccess::RT_BOTTOM;
+      Valid = Valid && RT != MemoryAccess::RT_NONE;
+
+      // Then check if the memory accesses allow a reduction.
+      Valid = Valid && checkCandidatePairAccesses(
+                           ReadMA, WriteMA, Stmt.getDomain(), Stmt.MemAccs);
+
+      // Finally, mark the pair as a candidate or the load as a invalid one.
+      if (Valid)
+        ValidCandidates[std::make_pair(ReadMA, WriteMA)] = RT;
+      else
+        InvalidLoads.insert(ReadMA->getAccessInstruction());
+    }
+  }
+
+  // In the last step mark the memory accesses of candidate pairs as reduction
+  // like if the load wasn't marked invalid in the previous step.
+  for (auto &CandidatePair : ValidCandidates) {
+    MemoryAccess *LoadMA = CandidatePair.first.first;
+    if (InvalidLoads.count(LoadMA->getAccessInstruction()))
+      continue;
+    POLLY_DEBUG(
+        dbgs() << " Load :: "
+               << *((CandidatePair.first.first)->getAccessInstruction())
+               << "\n Store :: "
+               << *((CandidatePair.first.second)->getAccessInstruction())
+               << "\n are marked as reduction like\n");
+    MemoryAccess::ReductionType RT = CandidatePair.second;
+    CandidatePair.first.first->markAsReductionLike(RT);
+    CandidatePair.first.second->markAsReductionLike(RT);
   }
 }
 
@@ -2965,52 +3135,6 @@ void ScopBuilder::addInvariantLoads(ScopStmt &Stmt,
   }
 }
 
-void ScopBuilder::collectCandidateReductionLoads(
-    MemoryAccess *StoreMA, SmallVectorImpl<MemoryAccess *> &Loads) {
-  ScopStmt *Stmt = StoreMA->getStatement();
-
-  auto *Store = dyn_cast<StoreInst>(StoreMA->getAccessInstruction());
-  if (!Store)
-    return;
-
-  // Skip if there is not one binary operator between the load and the store
-  auto *BinOp = dyn_cast<BinaryOperator>(Store->getValueOperand());
-  if (!BinOp)
-    return;
-
-  // Skip if the binary operators has multiple uses
-  if (BinOp->getNumUses() != 1)
-    return;
-
-  // Skip if the opcode of the binary operator is not commutative/associative
-  if (!BinOp->isCommutative() || !BinOp->isAssociative())
-    return;
-
-  // Skip if the binary operator is outside the current SCoP
-  if (BinOp->getParent() != Store->getParent())
-    return;
-
-  // Skip if it is a multiplicative reduction and we disabled them
-  if (DisableMultiplicativeReductions &&
-      (BinOp->getOpcode() == Instruction::Mul ||
-       BinOp->getOpcode() == Instruction::FMul))
-    return;
-
-  // Check the binary operator operands for a candidate load
-  auto *PossibleLoad0 = dyn_cast<LoadInst>(BinOp->getOperand(0));
-  auto *PossibleLoad1 = dyn_cast<LoadInst>(BinOp->getOperand(1));
-  if (!PossibleLoad0 && !PossibleLoad1)
-    return;
-
-  // A load is only a candidate if it cannot escape (thus has only this use)
-  if (PossibleLoad0 && PossibleLoad0->getNumUses() == 1)
-    if (PossibleLoad0->getParent() == Store->getParent())
-      Loads.push_back(&Stmt->getArrayAccessFor(PossibleLoad0));
-  if (PossibleLoad1 && PossibleLoad1->getNumUses() == 1)
-    if (PossibleLoad1->getParent() == Store->getParent())
-      Loads.push_back(&Stmt->getArrayAccessFor(PossibleLoad1));
-}
-
 /// Find the canonical scop array info object for a set of invariant load
 /// hoisted loads. The canonical array is the one that corresponds to the
 /// first load in the list of accesses which is used as base pointer of a

diff  --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp
index fa35fae84aceb..044d3573793f0 100644
--- a/polly/lib/Analysis/ScopInfo.cpp
+++ b/polly/lib/Analysis/ScopInfo.cpp
@@ -533,6 +533,9 @@ MemoryAccess::getReductionOperatorStr(MemoryAccess::ReductionType RT) {
   case MemoryAccess::RT_NONE:
     llvm_unreachable("Requested a reduction operator string for a memory "
                      "access which isn't a reduction");
+  case MemoryAccess::RT_BOTTOM:
+    llvm_unreachable("Requested a reduction operator string for a internal "
+                     "reduction type!");
   case MemoryAccess::RT_ADD:
     return "+";
   case MemoryAccess::RT_MUL:
@@ -915,10 +918,15 @@ isl::id MemoryAccess::getId() const { return Id; }
 
 raw_ostream &polly::operator<<(raw_ostream &OS,
                                MemoryAccess::ReductionType RT) {
-  if (RT == MemoryAccess::RT_NONE)
+  switch (RT) {
+  case MemoryAccess::RT_NONE:
+  case MemoryAccess::RT_BOTTOM:
     OS << "NONE";
-  else
+    break;
+  default:
     OS << MemoryAccess::getReductionOperatorStr(RT);
+    break;
+  }
   return OS;
 }
 

diff  --git a/polly/test/DependenceInfo/reduction_indirect_access.ll b/polly/test/DependenceInfo/reduction_indirect_access.ll
new file mode 100644
index 0000000000000..3b4bd9ef04b5a
--- /dev/null
+++ b/polly/test/DependenceInfo/reduction_indirect_access.ll
@@ -0,0 +1,39 @@
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+;
+; CHECK: Reduction dependences:
+; CHECK:   [N] -> { Stmt_for_body[i0] -> Stmt_for_body[1 + i0] : 0 <= i0 <= -2 + N }
+;
+;    void f(double *restrict A, int *restrict INDICES, int N) {
+;      for (int i = 0; i < N; i++)
+;        A[INDICES[i]] += N;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(ptr noalias %A, ptr noalias %INDICES, i32 %N) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %N
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %conv = sitofp i32 %N to double
+  %arrayidx = getelementptr inbounds ptr, ptr %INDICES, i32 %i.0
+  %tmp = load i32, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds ptr, ptr %A, i32 %tmp
+  %tmp1 = load double, ptr %arrayidx1, align 8
+  %add = fadd fast double %tmp1, %conv
+  store double %add, double* %arrayidx1, align 8
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+

diff  --git a/polly/test/ScopInfo/reduction_double.ll b/polly/test/ScopInfo/reduction_double.ll
new file mode 100644
index 0000000000000..d126d3d833ee1
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_double.ll
@@ -0,0 +1,57 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+;
+; Verify if two independent reductions in same loop is detected
+;
+; CHECK: Stmt_for_body
+; CHECK: Reduction Type: +
+; CHECK-NEXT: MemRef_sum1[0]
+; CHECK-NEXT: Reduction Type: +
+; CHECK-NEXT: MemRef_sum1[0]
+;
+; CHECK: Stmt_for_body_b
+; CHECK: Reduction Type: +
+; CHECK-NEXT: MemRef_sum2[0]
+; CHECK-NEXT: Reduction Type: +
+; CHECK-NEXT: MemRef_sum2[0]
+;
+; int red(int *A, int *B, int *sum, int * prod, int n) {
+;   for (int i = 0; i < n; ++i) {
+;     *sum += A[i];
+;     *prod += B[i];
+;   }
+; }
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable
+define dso_local i32 @red(ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef %sum1, ptr nocapture noundef %sum2, i32 noundef %n) local_unnamed_addr #0 {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret i32 undef
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx
+  %1 = load i32, ptr %sum1
+  %add = add nsw i32 %1, %0
+  store i32 %add, ptr %sum1
+  %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx2
+  %3 = load i32, ptr %sum2
+  %add3 = add nsw i32 %3, %2
+  store i32 %add3, ptr %sum2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+

diff  --git a/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll b/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
new file mode 100644
index 0000000000000..92a071ea1c372
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
@@ -0,0 +1,43 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+;
+; void f(int N, int * restrict sums, int * restrict escape) {
+;   int i, j;
+;   for (i = 0; i < 1024; i++) {
+;     sums[i] += 5;
+;     escape[i] = sums[i];
+;   }
+; }
+;
+; CHECK: Reduction Type: NONE
+; CHECK: sums
+; CHECK: Reduction Type: NONE
+; CHECK: sums
+; CHECK: Reduction Type: NONE
+; CHECK: escape
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(i32 %N, i32* noalias %sums, i32* noalias %escape) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc8, %for.inc ]
+  %exitcond1 = icmp ne i32 %i.0, 1024
+  br i1 %exitcond1, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %arrayidx = getelementptr inbounds i32, i32* %sums, i32 0
+  %tmp = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %tmp, 5
+  store i32 %add, i32* %arrayidx, align 4
+  %arrayidx6 = getelementptr inbounds i32, i32* %escape, i32 %i.0
+  store i32 %add, i32* %arrayidx6, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc8 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}

diff  --git a/polly/test/ScopInfo/reduction_if.ll b/polly/test/ScopInfo/reduction_if.ll
new file mode 100644
index 0000000000000..4f7d3681e0a0b
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_if.ll
@@ -0,0 +1,52 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+;
+; Verify if reduction spread across multiple blocks in a single scop statement are detected
+;
+; CHECK: Stmt_for_body
+; CHECK: Reduction Type: +
+; CHECK-NEXT: MemRef_sum[0]
+; CHECK: Reduction Type: +
+; CHECK-NEXT: MemRef_sum[0]
+;
+; void f(int*__restrict A, int*__restrict B, int *sum) {
+;   for (int i = 0; i < 4444; ++i) {
+;     if (B[i])
+;       *sum += A[i];
+;   }
+; }
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable
+define dso_local void @f(ptr noalias nocapture noundef readonly %A, ptr noalias nocapture noundef readonly %B, ptr nocapture noundef %sum) local_unnamed_addr #0 {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc
+  ret void
+
+for.body:                                         ; preds = %entry.split, %for.inc
+  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx2
+  %2 = load i32, ptr %sum
+  %add = add nsw i32 %2, %1
+  store i32 %add, ptr %sum
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4444
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+

diff  --git a/polly/test/ScopInfo/reduction_indirect_access.ll b/polly/test/ScopInfo/reduction_indirect_access.ll
new file mode 100644
index 0000000000000..7acac4b150f40
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_indirect_access.ll
@@ -0,0 +1,42 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+;
+; CHECK: Reduction Type: NONE
+; CHECK: MemRef_INDICES[i0]
+; CHECK: Reduction Type: +
+; CHECK: MemRef_A[o0]
+; CHECK: Reduction Type: +
+; CHECK: MemRef_A[o0]
+;
+;    void f(double *restrict A, int *restrict INDICES, int N) {
+;      for (int i = 0; i < N; i++)
+;        A[INDICES[i]] += N;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(double* noalias %A, i32* noalias %INDICES, i32 %N) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %N
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %conv = sitofp i32 %N to double
+  %arrayidx = getelementptr inbounds i32, i32* %INDICES, i32 %i.0
+  %tmp = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds double, double* %A, i32 %tmp
+  %tmp1 = load double, double* %arrayidx1, align 8
+  %add = fadd fast double %tmp1, %conv
+  store double %add, double* %arrayidx1, align 8
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}

diff  --git a/polly/test/ScopInfo/reduction_indirect_access_2.ll b/polly/test/ScopInfo/reduction_indirect_access_2.ll
new file mode 100644
index 0000000000000..331953991d86c
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_indirect_access_2.ll
@@ -0,0 +1,50 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+;
+; Validate that the accesses to INDICES[i] is not part of a reduction.
+;
+; CHECK: Reduction Type: NONE
+; CHECK: MemRef_INDICES[i0]
+; CHECK: Reduction Type: +
+; CHECK: MemRef_A[o0]
+; CHECK: Reduction Type: +
+; CHECK: MemRef_A[o0]
+; CHECK: Reduction Type: NONE
+; CHECK: MemRef_INDICES[i0]
+;
+;    void f(double *restrict A, int *restrict INDICES, int N) {
+;      for (int i = 0; i < N; i++) {
+;        A[INDICES[i]] += N;
+;        INDICES[i] += N;
+;      }
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(double* noalias %A, i32* noalias %INDICES, i32 %N) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %N
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %conv = sitofp i32 %N to double
+  %arrayidx = getelementptr inbounds i32, i32* %INDICES, i32 %i.0
+  %tmp = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds double, double* %A, i32 %tmp
+  %tmp1 = load double, double* %arrayidx1, align 8
+  %add = fadd fast double %tmp1, %conv
+  store double %add, double* %arrayidx1, align 8
+  %add3 = add nsw i32 %tmp, %N
+  store i32 %add3, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}

diff  --git a/polly/test/ScopInfo/reduction_long_reduction_chain.ll b/polly/test/ScopInfo/reduction_long_reduction_chain.ll
new file mode 100644
index 0000000000000..62ae1fef187b6
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_long_reduction_chain.ll
@@ -0,0 +1,61 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+;
+; CHECK: Reduction Type: +
+; CHECK: MemRef_sum
+; CHECK: Reduction Type: NONE
+; CHECK: MemRef_A
+; CHECK: Reduction Type: +
+; CHECK: MemRef_sum
+; CHECK-NOT: MemRef_A
+;
+;    void f(int *restrict sum, int *restrict A) {
+;      for (int i = 0; i < 1024; i++)
+;        *sum = (A[i + 3] * (i - 14)) + ((A[i] + *sum + A[0]) + A[1023]) +
+;               (A[i + 2] * A[i - 1]);
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(i32* noalias %sum, i32* noalias %A) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %i.0, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %i.0, 3
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+  %tmp = load i32, i32* %arrayidx, align 4
+  %sub = add nsw i32 %i.0, -14
+  %mul = mul nsw i32 %tmp, %sub
+  %arrayidx1 = getelementptr inbounds i32, i32* %A, i32 %i.0
+  %tmp1 = load i32, i32* %arrayidx1, align 4
+  %tmp2 = load i32, i32* %sum, align 4
+  %add2 = add nsw i32 %tmp1, %tmp2
+  %tmp3 = load i32, i32* %A, align 4
+  %add4 = add nsw i32 %add2, %tmp3
+  %arrayidx5 = getelementptr inbounds i32, i32* %A, i32 1023
+  %tmp4 = load i32, i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add4, %tmp4
+  %add7 = add nsw i32 %mul, %add6
+  %add8 = add nsw i32 %i.0, 2
+  %arrayidx9 = getelementptr inbounds i32, i32* %A, i32 %add8
+  %tmp5 = load i32, i32* %arrayidx9, align 4
+  %sub10 = add nsw i32 %i.0, -1
+  %arrayidx11 = getelementptr inbounds i32, i32* %A, i32 %sub10
+  %tmp6 = load i32, i32* %arrayidx11, align 4
+  %mul12 = mul nsw i32 %tmp5, %tmp6
+  %add13 = add nsw i32 %add7, %mul12
+  store i32 %add13, i32* %sum, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}

diff --git a/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll b/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
new file mode 100644
index 0000000000000..7ca46fa9535ac
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
@@ -0,0 +1,58 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+;
+; Sum is added twice in the statement. Hence no reduction.
+; CHECK: Reduction Type: NONE
+;
+;    void f(int *restrict sum, int *restrict A) {
+;      for (int i = 0; i < 1024; i++)
+;        *sum = (A[i + 3] * (i - 14)) + ((A[i] + *sum + A[0]) + A[1023]) +
+;               (A[i + 2] * A[i - 1]) + *sum;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(i32* noalias %sum, i32* noalias %A) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %i.0, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %i.0, 3
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+  %tmp = load i32, i32* %arrayidx, align 4
+  %sub = add nsw i32 %i.0, -14
+  %mul = mul nsw i32 %tmp, %sub
+  %arrayidx1 = getelementptr inbounds i32, i32* %A, i32 %i.0
+  %tmp1 = load i32, i32* %arrayidx1, align 4
+  %tmp2 = load i32, i32* %sum, align 4
+  %add2 = add nsw i32 %tmp1, %tmp2
+  %tmp3 = load i32, i32* %A, align 4
+  %add4 = add nsw i32 %add2, %tmp3
+  %arrayidx5 = getelementptr inbounds i32, i32* %A, i32 1023
+  %tmp4 = load i32, i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add4, %tmp4
+  %add7 = add nsw i32 %mul, %add6
+  %add8 = add nsw i32 %i.0, 2
+  %arrayidx9 = getelementptr inbounds i32, i32* %A, i32 %add8
+  %tmp5 = load i32, i32* %arrayidx9, align 4
+  %sub10 = add nsw i32 %i.0, -1
+  %arrayidx11 = getelementptr inbounds i32, i32* %A, i32 %sub10
+  %tmp6 = load i32, i32* %arrayidx11, align 4
+  %mul12 = mul nsw i32 %tmp5, %tmp6
+  %add13 = add nsw i32 %add7, %mul12
+  %tmp7 = load i32, i32* %sum, align 4
+  %add14 = add nsw i32 %add13, %tmp7
+  store i32 %add14, i32* %sum, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}

diff --git a/polly/test/ScopInfo/reduction_multiple_different_operators.ll b/polly/test/ScopInfo/reduction_multiple_different_operators.ll
new file mode 100644
index 0000000000000..b77c72a291744
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_multiple_different_operators.ll
@@ -0,0 +1,37 @@
+; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+;
+; Should not be identified as reduction as there are different operations
+; involved on sum (multiplication followed by addition)
+; CHECK: Reduction Type: NONE
+;
+;    void f(int *restrict sum) {
+;      for (int i = 0; i < 1024; i++) {
+;        *sum = (*sum * 5) + 25;
+;      }
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(i32* noalias %sum) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %i.0, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %tmp = load i32, i32* %sum, align 4
+  %tmp1 = mul i32 %tmp, 5
+  %mul = add i32 %tmp1, 25
+  store i32 %mul, i32* %sum, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}


        


More information about the llvm-commits mailing list