[Mlir-commits] [mlir] 545fa37 - [mlir] Affine: parallelize affine loops with reductions

Thu Apr 29 04:16:33 PDT 2021

Author: Alex Zinenko
Date: 2021-04-29T13:16:24+02:00
New Revision: 545fa37834ef6b5731444728c00e7a18d4f1aeed

URL: https://github.com/llvm/llvm-project/commit/545fa37834ef6b5731444728c00e7a18d4f1aeed
DIFF: https://github.com/llvm/llvm-project/commit/545fa37834ef6b5731444728c00e7a18d4f1aeed.diff

LOG: [mlir] Affine: parallelize affine loops with reductions

Introduce basic support for parallelizing affine loops with reductions
expressed using iteration arguments. The affine parallelism detector now has a
flag to assume such reductions are parallel. The transformation handles the
subset of parallel reductions that can be expressed using affine.parallel:
integer/float addition and multiplication. This requires detecting the
reduction operation since affine.parallel only supports a fixed set of
reduction operators.

Reviewed By: chelini, kumasento, bondhugula

Differential Revision: https://reviews.llvm.org/D101171

Added: 
    

Modified: 
    mlir/include/mlir/Analysis/AffineAnalysis.h
    mlir/include/mlir/Analysis/Utils.h
    mlir/include/mlir/Dialect/Affine/Passes.td
    mlir/include/mlir/Dialect/Affine/Utils.h
    mlir/lib/Analysis/AffineAnalysis.cpp
    mlir/lib/Analysis/Utils.cpp
    mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
    mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
    mlir/lib/Dialect/Affine/Utils/CMakeLists.txt
    mlir/lib/Dialect/Affine/Utils/Utils.cpp
    mlir/test/Dialect/Affine/parallelize.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Analysis/AffineAnalysis.h b/mlir/include/mlir/Analysis/AffineAnalysis.h
index c3aaa40bda9a0..2a4bd478a513c 100644

--- a/mlir/include/mlir/Analysis/AffineAnalysis.h
+++ b/mlir/include/mlir/Analysis/AffineAnalysis.h
@@ -15,6 +15,7 @@
 #ifndef MLIR_ANALYSIS_AFFINE_ANALYSIS_H
 #define MLIR_ANALYSIS_AFFINE_ANALYSIS_H
 
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/Value.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
@@ -27,6 +28,25 @@ class AffineValueMap;
 class FlatAffineConstraints;
 class Operation;
 
+/// A description of a (parallelizable) reduction in an affine loop.
+struct LoopReduction {
+  /// Reduction kind.
+  AtomicRMWKind kind;
+
+  /// Position of the iteration argument that acts as accumulator.
+  unsigned iterArgPosition;
+
+  /// The value being reduced.
+  Value value;
+};
+
+/// Returns true if `forOp` is a parallel loop. If `parallelReductions` is
+/// provided, populates it with descriptors of the parallelizable reductions and
+/// treats them as not preventing parallelization.
+bool isLoopParallel(
+    AffineForOp forOp,
+    SmallVectorImpl<LoopReduction> *parallelReductions = nullptr);
+
 /// Returns in `affineApplyOps`, the sequence of those AffineApplyOp
 /// Operations that are reachable via a search starting from `operands` and
 /// ending at those operands that are not the result of an AffineApplyOp.

diff  --git a/mlir/include/mlir/Analysis/Utils.h b/mlir/include/mlir/Analysis/Utils.h
index ccedd17e0e6c5..9f231dca44e03 100644
--- a/mlir/include/mlir/Analysis/Utils.h
+++ b/mlir/include/mlir/Analysis/Utils.h
@@ -354,9 +354,6 @@ unsigned getNumCommonSurroundingLoops(Operation &A, Operation &B);
 Optional<int64_t> getMemoryFootprintBytes(AffineForOp forOp,
                                           int memorySpace = -1);
 
-/// Returns true if `forOp' is a parallel loop.
-bool isLoopParallel(AffineForOp forOp);
-
 /// Simplify the integer set by simplifying the underlying affine expressions by
 /// flattening and some simple inference. Also, drop any duplicate constraints.
 /// Returns the simplified integer set. This method runs in time linear in the

diff  --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
index 77ba06483304c..45c28bbace4ae 100644
--- a/mlir/include/mlir/Dialect/Affine/Passes.td
+++ b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -123,6 +123,9 @@ def AffineParallelize : FunctionPass<"affine-parallelize"> {
     Option<"maxNested", "max-nested", "unsigned", /*default=*/"-1u",
            "Maximum number of nested parallel loops to produce. "
            "Defaults to unlimited (UINT_MAX).">,
+    Option<"parallelReductions", "parallel-reductions", "bool",
+           /*default=*/"false",
+           "Whether to parallelize reduction loops. Defaults to false.">
   ];
 }
 

diff  --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h
index be6985dfe4034..676b394398106 100644
--- a/mlir/include/mlir/Dialect/Affine/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Utils.h
@@ -24,12 +24,17 @@ class AffineForOp;
 class AffineIfOp;
 class AffineParallelOp;
 struct LogicalResult;
+struct LoopReduction;
 class Operation;
 
 /// Replaces parallel affine.for op with 1-d affine.parallel op.
-/// mlir::isLoopParallel detect the parallel affine.for ops.
+/// mlir::isLoopParallel detects the parallel affine.for ops.
+/// Parallelizes the specified reductions. Parallelization will fail in presence
+/// of loop iteration arguments that are not listed in `parallelReductions`.
 /// There is no cost model currently used to drive this parallelization.
-void affineParallelize(AffineForOp forOp);
+LogicalResult
+affineParallelize(AffineForOp forOp,
+                  ArrayRef<LoopReduction> parallelReductions = {});
 
 /// Hoists out affine.if/else to as high as possible, i.e., past all invariant
 /// affine.fors/parallel's. Returns success if any hoisting happened; folded` is

diff  --git a/mlir/lib/Analysis/AffineAnalysis.cpp b/mlir/lib/Analysis/AffineAnalysis.cpp
index b49e532eec4de..396ab89b5b526 100644
--- a/mlir/lib/Analysis/AffineAnalysis.cpp
+++ b/mlir/lib/Analysis/AffineAnalysis.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Analysis/AffineAnalysis.h"
+#include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
@@ -21,6 +22,8 @@
 #include "mlir/IR/IntegerSet.h"
 #include "mlir/Support/MathExtras.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -30,6 +33,131 @@ using namespace mlir;
 
 using llvm::dbgs;
 
+/// Returns true if `value` (transitively) depends on iteration arguments of the
+/// given `forOp`.
+static bool dependsOnIterArgs(Value value, AffineForOp forOp) {
+  // Compute the backward slice of the value.
+  SetVector<Operation *> slice;
+  getBackwardSlice(value, &slice,
+                   [&](Operation *op) { return !forOp->isAncestor(op); });
+
+  // Check that none of the operands of the operations in the backward slice are
+  // loop iteration arguments, and neither is the value itself.
+  auto argRange = forOp.getRegionIterArgs();
+  llvm::SmallPtrSet<Value, 8> iterArgs(argRange.begin(), argRange.end());
+  if (iterArgs.contains(value))
+    return true;
+
+  for (Operation *op : slice)
+    for (Value operand : op->getOperands())
+      if (iterArgs.contains(operand))
+        return true;
+
+  return false;
+}
+
+/// Get the value that is being reduced by `pos`-th reduction in the loop if
+/// such a reduction can be performed by affine parallel loops. This assumes
+/// floating-point operations are commutative. On success, `kind` will be the
+/// reduction kind suitable for use in affine parallel loop builder. If the
+/// reduction is not supported, returns null.
+static Value getSupportedReduction(AffineForOp forOp, unsigned pos,
+                                   AtomicRMWKind &kind) {
+  auto yieldOp = cast<AffineYieldOp>(forOp.getBody()->back());
+  Value yielded = yieldOp.operands()[pos];
+  Operation *definition = yielded.getDefiningOp();
+  if (!definition)
+    return nullptr;
+  if (!forOp.getRegionIterArgs()[pos].hasOneUse())
+    return nullptr;
+
+  Optional<AtomicRMWKind> maybeKind =
+      TypeSwitch<Operation *, Optional<AtomicRMWKind>>(definition)
+          .Case<AddFOp>([](Operation *) { return AtomicRMWKind::addf; })
+          .Case<MulFOp>([](Operation *) { return AtomicRMWKind::mulf; })
+          .Case<AddIOp>([](Operation *) { return AtomicRMWKind::addi; })
+          .Case<MulIOp>([](Operation *) { return AtomicRMWKind::muli; })
+          .Default([](Operation *) -> Optional<AtomicRMWKind> {
+            // TODO: AtomicRMW supports other kinds of reductions this is
+            // currently not detecting, add those when the need arises.
+            return llvm::None;
+          });
+  if (!maybeKind)
+    return nullptr;
+
+  kind = *maybeKind;
+  if (definition->getOperand(0) == forOp.getRegionIterArgs()[pos] &&
+      !dependsOnIterArgs(definition->getOperand(1), forOp))
+    return definition->getOperand(1);
+  if (definition->getOperand(1) == forOp.getRegionIterArgs()[pos] &&
+      !dependsOnIterArgs(definition->getOperand(0), forOp))
+    return definition->getOperand(0);
+
+  return nullptr;
+}
+
+/// Returns true if `forOp` is a parallel loop. If `parallelReductions` is
+/// provided, populates it with descriptors of the parallelizable reductions and
+/// treats them as not preventing parallelization.
+bool mlir::isLoopParallel(AffineForOp forOp,
+                          SmallVectorImpl<LoopReduction> *parallelReductions) {
+  unsigned numIterArgs = forOp.getNumIterOperands();
+
+  // Loop is not parallel if it has SSA loop-carried dependences and reduction
+  // detection is not requested.
+  if (numIterArgs > 0 && !parallelReductions)
+    return false;
+
+  // Find supported reductions if requested.
+  if (parallelReductions) {
+    parallelReductions->reserve(forOp.getNumIterOperands());
+    for (unsigned i = 0; i < numIterArgs; ++i) {
+      AtomicRMWKind kind;
+      if (Value value = getSupportedReduction(forOp, i, kind))
+        parallelReductions->emplace_back(LoopReduction{kind, i, value});
+    }
+
+    // Return later to allow for identifying all parallel reductions even if the
+    // loop is not parallel.
+    if (parallelReductions->size() != numIterArgs)
+      return false;
+  }
+
+  // Collect all load and store ops in loop nest rooted at 'forOp'.
+  SmallVector<Operation *, 8> loadAndStoreOps;
+  auto walkResult = forOp.walk([&](Operation *op) -> WalkResult {
+    if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
+      loadAndStoreOps.push_back(op);
+    else if (!isa<AffineForOp, AffineYieldOp, AffineIfOp>(op) &&
+             !MemoryEffectOpInterface::hasNoEffect(op))
+      return WalkResult::interrupt();
+
+    return WalkResult::advance();
+  });
+
+  // Stop early if the loop has unknown ops with side effects.
+  if (walkResult.wasInterrupted())
+    return false;
+
+  // Dep check depth would be number of enclosing loops + 1.
+  unsigned depth = getNestingDepth(forOp) + 1;
+
+  // Check dependences between all pairs of ops in 'loadAndStoreOps'.
+  for (auto *srcOp : loadAndStoreOps) {
+    MemRefAccess srcAccess(srcOp);
+    for (auto *dstOp : loadAndStoreOps) {
+      MemRefAccess dstAccess(dstOp);
+      FlatAffineConstraints dependenceConstraints;
+      DependenceResult result = checkMemrefAccessDependence(
+          srcAccess, dstAccess, depth, &dependenceConstraints,
+          /*dependenceComponents=*/nullptr);
+      if (result.value != DependenceResult::NoDependence)
+        return false;
+    }
+  }
+  return true;
+}
+
 /// Returns the sequence of AffineApplyOp Operations operation in
 /// 'affineApplyOps', which are reachable via a search starting from 'operands',
 /// and ending at operands which are not defined by AffineApplyOps.

diff  --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp
index a4b8ccfc7ad14..e87ecdac2d6ca 100644
--- a/mlir/lib/Analysis/Utils.cpp
+++ b/mlir/lib/Analysis/Utils.cpp
@@ -1268,49 +1268,6 @@ void mlir::getSequentialLoops(AffineForOp forOp,
   });
 }
 
-/// Returns true if 'forOp' is parallel.
-bool mlir::isLoopParallel(AffineForOp forOp) {
-  // Loop is not parallel if it has SSA loop-carried dependences.
-  // TODO: Conditionally support reductions and other loop-carried dependences
-  // that could be handled in the context of a parallel loop.
-  if (forOp.getNumIterOperands() > 0)
-    return false;
-
-  // Collect all load and store ops in loop nest rooted at 'forOp'.
-  SmallVector<Operation *, 8> loadAndStoreOpInsts;
-  auto walkResult = forOp.walk([&](Operation *opInst) -> WalkResult {
-    if (isa<AffineReadOpInterface, AffineWriteOpInterface>(opInst))
-      loadAndStoreOpInsts.push_back(opInst);
-    else if (!isa<AffineForOp, AffineYieldOp, AffineIfOp>(opInst) &&
-             !MemoryEffectOpInterface::hasNoEffect(opInst))
-      return WalkResult::interrupt();
-
-    return WalkResult::advance();
-  });
-
-  // Stop early if the loop has unknown ops with side effects.
-  if (walkResult.wasInterrupted())
-    return false;
-
-  // Dep check depth would be number of enclosing loops + 1.
-  unsigned depth = getNestingDepth(forOp) + 1;
-
-  // Check dependences between all pairs of ops in 'loadAndStoreOpInsts'.
-  for (auto *srcOpInst : loadAndStoreOpInsts) {
-    MemRefAccess srcAccess(srcOpInst);
-    for (auto *dstOpInst : loadAndStoreOpInsts) {
-      MemRefAccess dstAccess(dstOpInst);
-      FlatAffineConstraints dependenceConstraints;
-      DependenceResult result = checkMemrefAccessDependence(
-          srcAccess, dstAccess, depth, &dependenceConstraints,
-          /*dependenceComponents=*/nullptr);
-      if (result.value != DependenceResult::NoDependence)
-        return false;
-    }
-  }
-  return true;
-}
-
 IntegerSet mlir::simplifyIntegerSet(IntegerSet set) {
   FlatAffineConstraints fac(set);
   if (fac.isEmpty())

diff  --git a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
index 55ae51376db7c..62519908a248f 100644
--- a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "PassDetail.h"
+#include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/Analysis/AffineStructures.h"
 #include "mlir/Analysis/LoopAnalysis.h"
 #include "mlir/Analysis/Utils.h"
@@ -33,6 +34,17 @@ namespace {
 struct AffineParallelize : public AffineParallelizeBase<AffineParallelize> {
   void runOnFunction() override;
 };
+
+/// Descriptor of a potentially parallelizable loop.
+struct ParallelizationCandidate {
+  ParallelizationCandidate(AffineForOp l, SmallVector<LoopReduction> &&r)
+      : loop(l), reductions(std::move(r)) {}
+
+  /// The potentially parallelizable loop.
+  AffineForOp loop;
+  /// Descriptors of reductions that can be parallelized in the loop.
+  SmallVector<LoopReduction> reductions;
+};
 } // namespace
 
 void AffineParallelize::runOnFunction() {
@@ -41,14 +53,16 @@ void AffineParallelize::runOnFunction() {
   // The walker proceeds in post-order, but we need to process outer loops first
   // to control the number of outer parallel loops, so push candidate loops to
   // the front of a deque.
-  std::deque<AffineForOp> parallelizableLoops;
+  std::deque<ParallelizationCandidate> parallelizableLoops;
   f.walk([&](AffineForOp loop) {
-    if (isLoopParallel(loop))
-      parallelizableLoops.push_front(loop);
+    SmallVector<LoopReduction> reductions;
+    if (isLoopParallel(loop, parallelReductions ? &reductions : nullptr))
+      parallelizableLoops.emplace_front(loop, std::move(reductions));
   });
 
-  for (AffineForOp loop : parallelizableLoops) {
+  for (const ParallelizationCandidate &candidate : parallelizableLoops) {
     unsigned numParentParallelOps = 0;
+    AffineForOp loop = candidate.loop;
     for (Operation *op = loop->getParentOp();
          op != nullptr && !op->hasTrait<OpTrait::AffineScope>();
          op = op->getParentOp()) {
@@ -56,8 +70,15 @@ void AffineParallelize::runOnFunction() {
         ++numParentParallelOps;
     }
 
-    if (numParentParallelOps < maxNested)
-      affineParallelize(loop);
+    if (numParentParallelOps < maxNested) {
+      if (failed(affineParallelize(loop, candidate.reductions))) {
+        LLVM_DEBUG(llvm::dbgs() << "[" DEBUG_TYPE "] failed to parallelize\n"
+                                << loop);
+      }
+    } else {
+      LLVM_DEBUG(llvm::dbgs() << "[" DEBUG_TYPE "] too many nested loops\n"
+                              << loop);
+    }
   }
 }
 

diff  --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
index a8fa831c1a1e3..551a8d44042ed 100644
--- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
@@ -12,16 +12,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "PassDetail.h"
+#include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/Analysis/LoopAnalysis.h"
 #include "mlir/Analysis/NestedMatcher.h"
-#include "mlir/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Vector/VectorOps.h"
 #include "mlir/Dialect/Vector/VectorUtils.h"
 #include "mlir/IR/BlockAndValueMapping.h"
-#include "llvm/Support/Debug.h"
 #include "mlir/Support/LLVM.h"
+#include "llvm/Support/Debug.h"
 
 using namespace mlir;
 using namespace vector;

diff  --git a/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt b/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt
index e4a5d0bbd9f15..3bc37cfa3ba23 100644
--- a/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt
@@ -6,5 +6,6 @@ add_mlir_dialect_library(MLIRAffineUtils
 
   LINK_LIBS PUBLIC
   MLIRAffine
+  MLIRAnalysis
   MLIRTransformUtils
   )

diff  --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
index 522cfd7fca950..935abdb477bdb 100644
--- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
@@ -12,6 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Analysis/AffineAnalysis.h"
+#include "mlir/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/BuiltinOps.h"
@@ -130,8 +132,17 @@ static AffineIfOp hoistAffineIfOp(AffineIfOp ifOp, Operation *hoistOverOp) {
 }
 
 /// Replace affine.for with a 1-d affine.parallel and clone the former's body
-/// into the latter while remapping values.
-void mlir::affineParallelize(AffineForOp forOp) {
+/// into the latter while remapping values. Parallelizes the specified
+/// reductions. Parallelization will fail in presence of loop iteration
+/// arguments that are not listed in `parallelReductions`.
+LogicalResult
+mlir::affineParallelize(AffineForOp forOp,
+                        ArrayRef<LoopReduction> parallelReductions) {
+  // Fail early if there are iter arguments that are not reductions.
+  unsigned numReductions = parallelReductions.size();
+  if (numReductions != forOp.getNumIterOperands())
+    return failure();
+
   Location loc = forOp.getLoc();
   OpBuilder outsideBuilder(forOp);
 
@@ -148,7 +159,7 @@ void mlir::affineParallelize(AffineForOp forOp) {
   if (needsMax || needsMin) {
     if (forOp->getParentOp() &&
         !forOp->getParentOp()->hasTrait<OpTrait::AffineScope>())
-      return;
+      return failure();
 
     identityMap = AffineMap::getMultiDimIdentityMap(1, loc->getContext());
   }
@@ -168,12 +179,46 @@ void mlir::affineParallelize(AffineForOp forOp) {
   }
 
   // Creating empty 1-D affine.parallel op.
+  auto reducedValues = llvm::to_vector<4>(llvm::map_range(
+      parallelReductions, [](const LoopReduction &red) { return red.value; }));
+  auto reductionKinds = llvm::to_vector<4>(llvm::map_range(
+      parallelReductions, [](const LoopReduction &red) { return red.kind; }));
   AffineParallelOp newPloop = outsideBuilder.create<AffineParallelOp>(
-      loc, llvm::None, llvm::None, lowerBoundMap, lowerBoundOperands,
-      upperBoundMap, upperBoundOperands);
-  // Steal the body of the old affine for op and erase it.
+      loc, ValueRange(reducedValues).getTypes(), reductionKinds, lowerBoundMap,
+      lowerBoundOperands, upperBoundMap, upperBoundOperands);
+  // Steal the body of the old affine for op.
   newPloop.region().takeBody(forOp.region());
+  Operation *yieldOp = &newPloop.getBody()->back();
+
+  // Handle the initial values of reductions because the parallel loop always
+  // starts from the neutral value.
+  SmallVector<Value> newResults;
+  newResults.reserve(numReductions);
+  for (unsigned i = 0; i < numReductions; ++i) {
+    Value init = forOp.getIterOperands()[i];
+    // This works because we are only handling single-op reductions at the
+    // moment. A switch on reduction kind or a mechanism to collect operations
+    // participating in the reduction will be necessary for multi-op reductions.
+    Operation *reductionOp = yieldOp->getOperand(i).getDefiningOp();
+    assert(reductionOp && "yielded value is expected to be produced by an op");
+    outsideBuilder.getInsertionBlock()->getOperations().splice(
+        outsideBuilder.getInsertionPoint(), newPloop.getBody()->getOperations(),
+        reductionOp);
+    reductionOp->setOperands({init, newPloop->getResult(i)});
+    forOp->getResult(i).replaceAllUsesWith(reductionOp->getResult(0));
+  }
+
+  // Update the loop terminator to yield reduced values bypassing the reduction
+  // operation itself (now moved outside of the loop) and erase the block
+  // arguments that correspond to reductions. Note that the loop always has one
+  // "main" induction variable when coming from a non-parallel for.
+  unsigned numIVs = 1;
+  yieldOp->setOperands(reducedValues);
+  newPloop.getBody()->eraseArguments(
+      llvm::to_vector<4>(llvm::seq<unsigned>(numIVs, numReductions + numIVs)));
+
   forOp.erase();
+  return success();
 }
 
 // Returns success if any hoisting happened.

diff  --git a/mlir/test/Dialect/Affine/parallelize.mlir b/mlir/test/Dialect/Affine/parallelize.mlir
index ca72e67f91080..9bd479c91a000 100644
--- a/mlir/test/Dialect/Affine/parallelize.mlir
+++ b/mlir/test/Dialect/Affine/parallelize.mlir
@@ -1,5 +1,6 @@
-// RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize| FileCheck %s
+// RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize | FileCheck %s
 // RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize='max-nested=1' | FileCheck --check-prefix=MAX-NESTED %s
+// RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize='parallel-reductions=1' | FileCheck --check-prefix=REDUCE %s
 
 // CHECK-LABEL:    func @reduce_window_max() {
 func @reduce_window_max() {
@@ -159,24 +160,34 @@ func @max_nested(%m: memref<?x?xf32>, %lb0: index, %lb1: index,
   return
 }
 
-// CHECK-LABEL: @unsupported_iter_args
-func @unsupported_iter_args(%in: memref<10xf32>) {
+// CHECK-LABEL: @iter_args
+// REDUCE-LABEL: @iter_args
+func @iter_args(%in: memref<10xf32>) {
+  // REDUCE: %[[init:.*]] = constant
   %cst = constant 0.000000e+00 : f32
   // CHECK-NOT: affine.parallel
+  // REDUCE: %[[reduced:.*]] = affine.parallel (%{{.*}}) = (0) to (10) reduce ("addf")
   %final_red = affine.for %i = 0 to 10 iter_args(%red_iter = %cst) -> (f32) {
+    // REDUCE: %[[red_value:.*]] = affine.load
     %ld = affine.load %in[%i] : memref<10xf32>
+    // REDUCE-NOT: addf
     %add = addf %red_iter, %ld : f32
+    // REDUCE: affine.yield %[[red_value]]
     affine.yield %add : f32
   }
+  // REDUCE: addf %[[init]], %[[reduced]]
   return
 }
 
-// CHECK-LABEL: @unsupported_nested_iter_args
-func @unsupported_nested_iter_args(%in: memref<20x10xf32>) {
+// CHECK-LABEL: @nested_iter_args
+// REDUCE-LABEL: @nested_iter_args
+func @nested_iter_args(%in: memref<20x10xf32>) {
   %cst = constant 0.000000e+00 : f32
   // CHECK: affine.parallel
   affine.for %i = 0 to 20 {
-    // CHECK: affine.for
+    // CHECK-NOT: affine.parallel
+    // REDUCE: affine.parallel
+    // REDUCE: reduce ("addf")
     %final_red = affine.for %j = 0 to 10 iter_args(%red_iter = %cst) -> (f32) {
       %ld = affine.load %in[%i, %j] : memref<20x10xf32>
       %add = addf %red_iter, %ld : f32
@@ -185,3 +196,43 @@ func @unsupported_nested_iter_args(%in: memref<20x10xf32>) {
   }
   return
 }
+
+// REDUCE-LABEL: @strange_butterfly
+func @strange_butterfly() {
+  %cst1 = constant 0.0 : f32
+  %cst2 = constant 1.0 : f32
+  // REDUCE-NOT: affine.parallel
+  affine.for %i = 0 to 10 iter_args(%it1 = %cst1, %it2 = %cst2) -> (f32, f32) {
+    %0 = addf %it1, %it2 : f32
+    affine.yield %0, %0 : f32, f32
+  }
+  return
+}
+
+// An iter arg is used more than once. This is not a simple reduction and
+// should not be parallelized.
+// REDUCE-LABEL: @repeated_use
+func @repeated_use() {
+  %cst1 = constant 0.0 : f32
+  // REDUCE-NOT: affine.parallel
+  affine.for %i = 0 to 10 iter_args(%it1 = %cst1) -> (f32) {
+    %0 = addf %it1, %it1 : f32
+    affine.yield %0 : f32
+  }
+  return
+}
+
+// An iter arg is used in the chain of operations defining the value being
+// reduced, this is not a simple reduction and should not be parallelized.
+// REDUCE-LABEL: @use_in_backward_slice
+func @use_in_backward_slice() {
+  %cst1 = constant 0.0 : f32
+  %cst2 = constant 1.0 : f32
+  // REDUCE-NOT: affine.parallel
+  affine.for %i = 0 to 10 iter_args(%it1 = %cst1, %it2 = %cst2) -> (f32, f32) {
+    %0 = "test.some_modification"(%it2) : (f32) -> f32
+    %1 = addf %it1, %0 : f32
+    affine.yield %1, %1 : f32, f32
+  }
+  return
+}