[clang] bd1c03d - [OPENMP50]Codegen for inscan reductions in worksharing directives.

Alexey Bataev via cfe-commits cfe-commits at lists.llvm.org
Thu Jun 4 13:36:43 PDT 2020


Author: Alexey Bataev
Date: 2020-06-04T16:29:33-04:00
New Revision: bd1c03d7b7c8bdd80b534cf2fa956c36a2f8249f

URL: https://github.com/llvm/llvm-project/commit/bd1c03d7b7c8bdd80b534cf2fa956c36a2f8249f
DIFF: https://github.com/llvm/llvm-project/commit/bd1c03d7b7c8bdd80b534cf2fa956c36a2f8249f.diff

LOG: [OPENMP50]Codegen for inscan reductions in worksharing directives.

Summary:
Implemented codegen for reduction clauses with inscan modifiers in
worksharing constructs.

Emits the code for the directive with inscan reductions.
The code is the following:
```
size num_iters = <num_iters>;
<type> buffer[num_iters];
for (i: 0..<num_iters>) {
  <input phase>;
  buffer[i] = red;
}
for (int k = 0; k != ceil(log2(num_iters)); ++k)
for (size cnt = last_iter; cnt >= pow(2, k); --k)
  buffer[i] op= buffer[i-pow(2,k)];
for (0..<num_iters>) {
  red = InclusiveScan ? buffer[i] : buffer[i-1];
  <scan phase>;
}
```

Reviewers: jdoerfert

Subscribers: yaxunl, guansong, arphaman, cfe-commits, caomhin

Tags: #clang

Differential Revision: https://reviews.llvm.org/D79948

Added: 
    clang/test/OpenMP/for_scan_codegen.cpp

Modified: 
    clang/include/clang/AST/OpenMPClause.h
    clang/include/clang/AST/RecursiveASTVisitor.h
    clang/lib/AST/OpenMPClause.cpp
    clang/lib/AST/StmtProfile.cpp
    clang/lib/CodeGen/CGStmt.cpp
    clang/lib/CodeGen/CGStmtOpenMP.cpp
    clang/lib/CodeGen/CodeGenFunction.h
    clang/lib/Sema/SemaOpenMP.cpp
    clang/lib/Serialization/ASTReader.cpp
    clang/lib/Serialization/ASTWriter.cpp
    clang/test/OpenMP/scan_messages.cpp
    clang/tools/libclang/CIndex.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index 91e4d011a3e9..6c2ca1cb1008 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -2839,6 +2839,41 @@ class OMPReductionClause final
     return llvm::makeArrayRef(getRHSExprs().end(), varlist_size());
   }
 
+  /// Set list of helper copy operations for inscan reductions.
+  /// The form is: Temps[i] = LHS[i];
+  void setInscanCopyOps(ArrayRef<Expr *> Ops);
+
+  /// Get the list of helper inscan copy operations.
+  MutableArrayRef<Expr *> getInscanCopyOps() {
+    return MutableArrayRef<Expr *>(getReductionOps().end(), varlist_size());
+  }
+  ArrayRef<const Expr *> getInscanCopyOps() const {
+    return llvm::makeArrayRef(getReductionOps().end(), varlist_size());
+  }
+
+  /// Set list of helper temp vars for inscan copy array operations.
+  void setInscanCopyArrayTemps(ArrayRef<Expr *> CopyArrayTemps);
+
+  /// Get the list of helper inscan copy temps.
+  MutableArrayRef<Expr *> getInscanCopyArrayTemps() {
+    return MutableArrayRef<Expr *>(getInscanCopyOps().end(), varlist_size());
+  }
+  ArrayRef<const Expr *> getInscanCopyArrayTemps() const {
+    return llvm::makeArrayRef(getInscanCopyOps().end(), varlist_size());
+  }
+
+  /// Set list of helper temp elements vars for inscan copy array operations.
+  void setInscanCopyArrayElems(ArrayRef<Expr *> CopyArrayElems);
+
+  /// Get the list of helper inscan copy temps.
+  MutableArrayRef<Expr *> getInscanCopyArrayElems() {
+    return MutableArrayRef<Expr *>(getInscanCopyArrayTemps().end(),
+                                   varlist_size());
+  }
+  ArrayRef<const Expr *> getInscanCopyArrayElems() const {
+    return llvm::makeArrayRef(getInscanCopyArrayTemps().end(), varlist_size());
+  }
+
 public:
   /// Creates clause with a list of variables \a VL.
   ///
@@ -2869,6 +2904,12 @@ class OMPReductionClause final
   /// \endcode
   /// Required for proper codegen of final reduction operation performed by the
   /// reduction clause.
+  /// \param CopyOps List of copy operations for inscan reductions:
+  /// \code
+  /// TempExprs = LHSExprs;
+  /// \endcode
+  /// \param CopyArrayTemps Temp arrays for prefix sums.
+  /// \param CopyArrayElems Temp arrays for prefix sums.
   /// \param PreInit Statement that must be executed before entering the OpenMP
   /// region with this clause.
   /// \param PostUpdate Expression that must be executed after exit from the
@@ -2880,13 +2921,18 @@ class OMPReductionClause final
          ArrayRef<Expr *> VL, NestedNameSpecifierLoc QualifierLoc,
          const DeclarationNameInfo &NameInfo, ArrayRef<Expr *> Privates,
          ArrayRef<Expr *> LHSExprs, ArrayRef<Expr *> RHSExprs,
-         ArrayRef<Expr *> ReductionOps, Stmt *PreInit, Expr *PostUpdate);
+         ArrayRef<Expr *> ReductionOps, ArrayRef<Expr *> CopyOps,
+         ArrayRef<Expr *> CopyArrayTemps, ArrayRef<Expr *> CopyArrayElems,
+         Stmt *PreInit, Expr *PostUpdate);
 
   /// Creates an empty clause with the place for \a N variables.
   ///
   /// \param C AST context.
   /// \param N The number of variables.
-  static OMPReductionClause *CreateEmpty(const ASTContext &C, unsigned N);
+  /// \param Modifier Reduction modifier.
+  static OMPReductionClause *
+  CreateEmpty(const ASTContext &C, unsigned N,
+              OpenMPReductionClauseModifier Modifier);
 
   /// Returns modifier.
   OpenMPReductionClauseModifier getModifier() const { return Modifier; }
@@ -2943,6 +2989,36 @@ class OMPReductionClause final
                              getReductionOps().end());
   }
 
+  helper_expr_const_range copy_ops() const {
+    return helper_expr_const_range(getInscanCopyOps().begin(),
+                                   getInscanCopyOps().end());
+  }
+
+  helper_expr_range copy_ops() {
+    return helper_expr_range(getInscanCopyOps().begin(),
+                             getInscanCopyOps().end());
+  }
+
+  helper_expr_const_range copy_array_temps() const {
+    return helper_expr_const_range(getInscanCopyArrayTemps().begin(),
+                                   getInscanCopyArrayTemps().end());
+  }
+
+  helper_expr_range copy_array_temps() {
+    return helper_expr_range(getInscanCopyArrayTemps().begin(),
+                             getInscanCopyArrayTemps().end());
+  }
+
+  helper_expr_const_range copy_array_elems() const {
+    return helper_expr_const_range(getInscanCopyArrayElems().begin(),
+                                   getInscanCopyArrayElems().end());
+  }
+
+  helper_expr_range copy_array_elems() {
+    return helper_expr_range(getInscanCopyArrayElems().begin(),
+                             getInscanCopyArrayElems().end());
+  }
+
   child_range children() {
     return child_range(reinterpret_cast<Stmt **>(varlist_begin()),
                        reinterpret_cast<Stmt **>(varlist_end()));

diff  --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index f98000993589..9bbd390f61cd 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -3363,6 +3363,17 @@ RecursiveASTVisitor<Derived>::VisitOMPReductionClause(OMPReductionClause *C) {
   for (auto *E : C->reduction_ops()) {
     TRY_TO(TraverseStmt(E));
   }
+  if (C->getModifier() == OMPC_REDUCTION_inscan) {
+    for (auto *E : C->copy_ops()) {
+      TRY_TO(TraverseStmt(E));
+    }
+    for (auto *E : C->copy_array_temps()) {
+      TRY_TO(TraverseStmt(E));
+    }
+    for (auto *E : C->copy_array_elems()) {
+      TRY_TO(TraverseStmt(E));
+    }
+  }
   return true;
 }
 

diff  --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index bcbe916820dc..3cb71d3d77bc 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -709,15 +709,43 @@ void OMPReductionClause::setReductionOps(ArrayRef<Expr *> ReductionOps) {
   std::copy(ReductionOps.begin(), ReductionOps.end(), getRHSExprs().end());
 }
 
+void OMPReductionClause::setInscanCopyOps(ArrayRef<Expr *> Ops) {
+  assert(Modifier == OMPC_REDUCTION_inscan && "Expected inscan reduction.");
+  assert(Ops.size() == varlist_size() && "Number of copy "
+                                         "expressions is not the same "
+                                         "as the preallocated buffer");
+  llvm::copy(Ops, getReductionOps().end());
+}
+
+void OMPReductionClause::setInscanCopyArrayTemps(
+    ArrayRef<Expr *> CopyArrayTemps) {
+  assert(Modifier == OMPC_REDUCTION_inscan && "Expected inscan reduction.");
+  assert(CopyArrayTemps.size() == varlist_size() &&
+         "Number of copy temp expressions is not the same as the preallocated "
+         "buffer");
+  llvm::copy(CopyArrayTemps, getInscanCopyOps().end());
+}
+
+void OMPReductionClause::setInscanCopyArrayElems(
+    ArrayRef<Expr *> CopyArrayElems) {
+  assert(Modifier == OMPC_REDUCTION_inscan && "Expected inscan reduction.");
+  assert(CopyArrayElems.size() == varlist_size() &&
+         "Number of copy temp expressions is not the same as the preallocated "
+         "buffer");
+  llvm::copy(CopyArrayElems, getInscanCopyArrayTemps().end());
+}
+
 OMPReductionClause *OMPReductionClause::Create(
     const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
     SourceLocation ModifierLoc, SourceLocation EndLoc, SourceLocation ColonLoc,
     OpenMPReductionClauseModifier Modifier, ArrayRef<Expr *> VL,
     NestedNameSpecifierLoc QualifierLoc, const DeclarationNameInfo &NameInfo,
     ArrayRef<Expr *> Privates, ArrayRef<Expr *> LHSExprs,
-    ArrayRef<Expr *> RHSExprs, ArrayRef<Expr *> ReductionOps, Stmt *PreInit,
-    Expr *PostUpdate) {
-  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(5 * VL.size()));
+    ArrayRef<Expr *> RHSExprs, ArrayRef<Expr *> ReductionOps,
+    ArrayRef<Expr *> CopyOps, ArrayRef<Expr *> CopyArrayTemps,
+    ArrayRef<Expr *> CopyArrayElems, Stmt *PreInit, Expr *PostUpdate) {
+  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(
+      (Modifier == OMPC_REDUCTION_inscan ? 8 : 5) * VL.size()));
   auto *Clause = new (Mem)
       OMPReductionClause(StartLoc, LParenLoc, ModifierLoc, EndLoc, ColonLoc,
                          Modifier, VL.size(), QualifierLoc, NameInfo);
@@ -728,13 +756,29 @@ OMPReductionClause *OMPReductionClause::Create(
   Clause->setReductionOps(ReductionOps);
   Clause->setPreInitStmt(PreInit);
   Clause->setPostUpdateExpr(PostUpdate);
+  if (Modifier == OMPC_REDUCTION_inscan) {
+    Clause->setInscanCopyOps(CopyOps);
+    Clause->setInscanCopyArrayTemps(CopyArrayTemps);
+    Clause->setInscanCopyArrayElems(CopyArrayElems);
+  } else {
+    assert(CopyOps.empty() &&
+           "copy operations are expected in inscan reductions only.");
+    assert(CopyArrayTemps.empty() &&
+           "copy array temps are expected in inscan reductions only.");
+    assert(CopyArrayElems.empty() &&
+           "copy array temps are expected in inscan reductions only.");
+  }
   return Clause;
 }
 
-OMPReductionClause *OMPReductionClause::CreateEmpty(const ASTContext &C,
-                                                    unsigned N) {
-  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(5 * N));
-  return new (Mem) OMPReductionClause(N);
+OMPReductionClause *
+OMPReductionClause::CreateEmpty(const ASTContext &C, unsigned N,
+                                OpenMPReductionClauseModifier Modifier) {
+  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(
+      (Modifier == OMPC_REDUCTION_inscan ? 8 : 5) * N));
+  auto *Clause = new (Mem) OMPReductionClause(N);
+  Clause->setModifier(Modifier);
+  return Clause;
 }
 
 void OMPTaskReductionClause::setPrivates(ArrayRef<Expr *> Privates) {

diff  --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index cd8c265eeca8..bf3b43b816f1 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -609,6 +609,20 @@ void OMPClauseProfiler::VisitOMPReductionClause(
     if (E)
       Profiler->VisitStmt(E);
   }
+  if (C->getModifier() == clang::OMPC_REDUCTION_inscan) {
+    for (auto *E : C->copy_ops()) {
+      if (E)
+        Profiler->VisitStmt(E);
+    }
+    for (auto *E : C->copy_array_temps()) {
+      if (E)
+        Profiler->VisitStmt(E);
+    }
+    for (auto *E : C->copy_array_elems()) {
+      if (E)
+        Profiler->VisitStmt(E);
+    }
+  }
 }
 void OMPClauseProfiler::VisitOMPTaskReductionClause(
     const OMPTaskReductionClause *C) {

diff  --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index cccb15a0a909..cf498c695a66 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -253,7 +253,7 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef<const Attr *> Attrs) {
     EmitOMPDepobjDirective(cast<OMPDepobjDirective>(*S));
     break;
   case Stmt::OMPScanDirectiveClass:
-    llvm_unreachable("Scan directive not supported yet.");
+    EmitOMPScanDirective(cast<OMPScanDirective>(*S));
     break;
   case Stmt::OMPOrderedDirectiveClass:
     EmitOMPOrderedDirective(cast<OMPOrderedDirective>(*S));

diff  --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index ae4e3400fcbc..23952309b61f 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1161,7 +1161,7 @@ void CodeGenFunction::EmitOMPLastprivateClauseFinal(
 
 void CodeGenFunction::EmitOMPReductionClauseInit(
     const OMPExecutableDirective &D,
-    CodeGenFunction::OMPPrivateScope &PrivateScope) {
+    CodeGenFunction::OMPPrivateScope &PrivateScope, bool ForInscan) {
   if (!HaveInsertPoint())
     return;
   SmallVector<const Expr *, 4> Shareds;
@@ -1173,6 +1173,8 @@ void CodeGenFunction::EmitOMPReductionClauseInit(
   SmallVector<const Expr *, 4> TaskLHSs;
   SmallVector<const Expr *, 4> TaskRHSs;
   for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
+    if (ForInscan != (C->getModifier() == OMPC_REDUCTION_inscan))
+      continue;
     Shareds.append(C->varlist_begin(), C->varlist_end());
     Privates.append(C->privates().begin(), C->privates().end());
     ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end());
@@ -1387,6 +1389,9 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
   bool HasAtLeastOneReduction = false;
   bool IsReductionWithTaskMod = false;
   for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
+    // Do not emit for inscan reductions.
+    if (C->getModifier() == OMPC_REDUCTION_inscan)
+      continue;
     HasAtLeastOneReduction = true;
     Privates.append(C->privates().begin(), C->privates().end());
     LHSExprs.append(C->lhs_exprs().begin(), C->lhs_exprs().end());
@@ -1705,6 +1710,24 @@ void CodeGenFunction::EmitOMPLoopBody(const OMPLoopDirective &D,
                          getProfileCount(D.getBody()));
     EmitBlock(NextBB);
   }
+
+  OMPPrivateScope InscanScope(*this);
+  EmitOMPReductionClauseInit(D, InscanScope, /*ForInscan=*/true);
+  bool IsInscanRegion = InscanScope.Privatize();
+  if (IsInscanRegion) {
+    // Need to remember the block before and after scan directive
+    // to dispatch them correctly depending on the clause used in
+    // this directive, inclusive or exclusive. For inclusive scan the natural
+    // order of the blocks is used, for exclusive clause the blocks must be
+    // executed in reverse order.
+    OMPBeforeScanBlock = createBasicBlock("omp.before.scan.bb");
+    OMPAfterScanBlock = createBasicBlock("omp.after.scan.bb");
+    OMPScanExitBlock = createBasicBlock("omp.exit.inscan.bb");
+    OMPScanDispatch = createBasicBlock("omp.inscan.dispatch");
+    EmitBranch(OMPScanDispatch);
+    EmitBlock(OMPBeforeScanBlock);
+  }
+
   // Emit loop variables for C++ range loops.
   const Stmt *Body =
       D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers();
@@ -1714,6 +1737,10 @@ void CodeGenFunction::EmitOMPLoopBody(const OMPLoopDirective &D,
                Body, /*TryImperfectlyNestedLoops=*/true),
            D.getCollapsedNumber());
 
+  // Jump to the dispatcher at the end of the loop body.
+  if (IsInscanRegion)
+    EmitBranch(OMPScanExitBlock);
+
   // The end (updates/cleanups).
   EmitBlock(Continue.getBlock());
   BreakContinueStack.pop_back();
@@ -2979,14 +3006,217 @@ emitDispatchForLoopBounds(CodeGenFunction &CGF, const OMPExecutableDirective &S,
   return {LBVal, UBVal};
 }
 
+/// Emits the code for the directive with inscan reductions.
+/// The code is the following:
+/// \code
+/// size num_iters = <num_iters>;
+/// <type> buffer[num_iters];
+/// #pragma omp ...
+/// for (i: 0..<num_iters>) {
+///   <input phase>;
+///   buffer[i] = red;
+/// }
+/// for (int k = 0; k != ceil(log2(num_iters)); ++k)
+/// for (size cnt = last_iter; cnt >= pow(2, k); --k)
+///   buffer[i] op= buffer[i-pow(2,k)];
+/// #pragma omp ...
+/// for (0..<num_iters>) {
+///   red = InclusiveScan ? buffer[i] : buffer[i-1];
+///   <scan phase>;
+/// }
+/// \endcode
+static void emitScanBasedDirective(
+    CodeGenFunction &CGF, const OMPLoopDirective &S,
+    llvm::function_ref<llvm::Value *(CodeGenFunction &)> NumIteratorsGen,
+    llvm::function_ref<void(CodeGenFunction &)> FirstGen,
+    llvm::function_ref<void(CodeGenFunction &)> SecondGen) {
+  llvm::Value *OMPScanNumIterations = CGF.Builder.CreateIntCast(
+      NumIteratorsGen(CGF), CGF.SizeTy, /*isSigned=*/false);
+  SmallVector<const Expr *, 4> Shareds;
+  SmallVector<const Expr *, 4> Privates;
+  SmallVector<const Expr *, 4> ReductionOps;
+  SmallVector<const Expr *, 4> LHSs;
+  SmallVector<const Expr *, 4> RHSs;
+  SmallVector<const Expr *, 4> CopyOps;
+  SmallVector<const Expr *, 4> CopyArrayTemps;
+  SmallVector<const Expr *, 4> CopyArrayElems;
+  for (const auto *C : S.getClausesOfKind<OMPReductionClause>()) {
+    assert(C->getModifier() == OMPC_REDUCTION_inscan &&
+           "Only inscan reductions are expected.");
+    Shareds.append(C->varlist_begin(), C->varlist_end());
+    Privates.append(C->privates().begin(), C->privates().end());
+    ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end());
+    LHSs.append(C->lhs_exprs().begin(), C->lhs_exprs().end());
+    RHSs.append(C->rhs_exprs().begin(), C->rhs_exprs().end());
+    CopyOps.append(C->copy_ops().begin(), C->copy_ops().end());
+    CopyArrayTemps.append(C->copy_array_temps().begin(),
+                          C->copy_array_temps().end());
+    CopyArrayElems.append(C->copy_array_elems().begin(),
+                          C->copy_array_elems().end());
+  }
+  {
+    // Emit buffers for each reduction variables.
+    // ReductionCodeGen is required to emit correctly the code for array
+    // reductions.
+    ReductionCodeGen RedCG(Shareds, Shareds, Privates, ReductionOps);
+    unsigned Count = 0;
+    auto *ITA = CopyArrayTemps.begin();
+    for (const Expr *IRef : Privates) {
+      const auto *PrivateVD = cast<VarDecl>(cast<DeclRefExpr>(IRef)->getDecl());
+      // Emit variably modified arrays, used for arrays/array sections
+      // reductions.
+      if (PrivateVD->getType()->isVariablyModifiedType()) {
+        RedCG.emitSharedOrigLValue(CGF, Count);
+        RedCG.emitAggregateType(CGF, Count);
+      }
+      CodeGenFunction::OpaqueValueMapping DimMapping(
+          CGF,
+          cast<OpaqueValueExpr>(
+              cast<VariableArrayType>((*ITA)->getType()->getAsArrayTypeUnsafe())
+                  ->getSizeExpr()),
+          RValue::get(OMPScanNumIterations));
+      // Emit temp buffer.
+      CGF.EmitVarDecl(*cast<VarDecl>(cast<DeclRefExpr>(*ITA)->getDecl()));
+      ++ITA;
+      ++Count;
+    }
+  }
+  CodeGenFunction::ParentLoopDirectiveForScanRegion ScanRegion(CGF, S);
+  {
+    // Emit loop with input phase:
+    // #pragma omp ...
+    // for (i: 0..<num_iters>) {
+    //   <input phase>;
+    //   buffer[i] = red;
+    // }
+    CGF.OMPFirstScanLoop = true;
+    CodeGenFunction::OMPLocalDeclMapRAII Scope(CGF);
+    FirstGen(CGF);
+  }
+  // Emit prefix reduction:
+  // for (int k = 0; k <= ceil(log2(n)); ++k)
+  llvm::BasicBlock *InputBB = CGF.Builder.GetInsertBlock();
+  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.outer.log.scan.body");
+  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.outer.log.scan.exit");
+  llvm::Function *F = CGF.CGM.getIntrinsic(llvm::Intrinsic::log2, CGF.DoubleTy);
+  llvm::Value *Arg =
+      CGF.Builder.CreateUIToFP(OMPScanNumIterations, CGF.DoubleTy);
+  llvm::Value *LogVal = CGF.EmitNounwindRuntimeCall(F, Arg);
+  F = CGF.CGM.getIntrinsic(llvm::Intrinsic::ceil, CGF.DoubleTy);
+  LogVal = CGF.EmitNounwindRuntimeCall(F, LogVal);
+  LogVal = CGF.Builder.CreateFPToUI(LogVal, CGF.IntTy);
+  llvm::Value *NMin1 = CGF.Builder.CreateNUWSub(
+      OMPScanNumIterations, llvm::ConstantInt::get(CGF.SizeTy, 1));
+  auto DL = ApplyDebugLocation::CreateDefaultArtificial(CGF, S.getBeginLoc());
+  CGF.EmitBlock(LoopBB);
+  auto *Counter = CGF.Builder.CreatePHI(CGF.IntTy, 2);
+  // size pow2k = 1;
+  auto *Pow2K = CGF.Builder.CreatePHI(CGF.SizeTy, 2);
+  Counter->addIncoming(llvm::ConstantInt::get(CGF.IntTy, 0), InputBB);
+  Pow2K->addIncoming(llvm::ConstantInt::get(CGF.SizeTy, 1), InputBB);
+  // for (size i = n - 1; i >= 2 ^ k; --i)
+  //   tmp[i] op= tmp[i-pow2k];
+  llvm::BasicBlock *InnerLoopBB =
+      CGF.createBasicBlock("omp.inner.log.scan.body");
+  llvm::BasicBlock *InnerExitBB =
+      CGF.createBasicBlock("omp.inner.log.scan.exit");
+  llvm::Value *CmpI = CGF.Builder.CreateICmpUGE(NMin1, Pow2K);
+  CGF.Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
+  CGF.EmitBlock(InnerLoopBB);
+  auto *IVal = CGF.Builder.CreatePHI(CGF.SizeTy, 2);
+  IVal->addIncoming(NMin1, LoopBB);
+  {
+    CodeGenFunction::OMPPrivateScope PrivScope(CGF);
+    auto *ILHS = LHSs.begin();
+    auto *IRHS = RHSs.begin();
+    for (const Expr *CopyArrayElem : CopyArrayElems) {
+      const auto *LHSVD = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
+      const auto *RHSVD = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
+      Address LHSAddr = Address::invalid();
+      {
+        CodeGenFunction::OpaqueValueMapping IdxMapping(
+            CGF,
+            cast<OpaqueValueExpr>(
+                cast<ArraySubscriptExpr>(CopyArrayElem)->getIdx()),
+            RValue::get(IVal));
+        LHSAddr = CGF.EmitLValue(CopyArrayElem).getAddress(CGF);
+      }
+      PrivScope.addPrivate(LHSVD, [LHSAddr]() { return LHSAddr; });
+      Address RHSAddr = Address::invalid();
+      {
+        llvm::Value *OffsetIVal = CGF.Builder.CreateNUWSub(IVal, Pow2K);
+        CodeGenFunction::OpaqueValueMapping IdxMapping(
+            CGF,
+            cast<OpaqueValueExpr>(
+                cast<ArraySubscriptExpr>(CopyArrayElem)->getIdx()),
+            RValue::get(OffsetIVal));
+        RHSAddr = CGF.EmitLValue(CopyArrayElem).getAddress(CGF);
+      }
+      PrivScope.addPrivate(RHSVD, [RHSAddr]() { return RHSAddr; });
+      ++ILHS;
+      ++IRHS;
+    }
+    PrivScope.Privatize();
+    CGF.CGM.getOpenMPRuntime().emitReduction(
+        CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
+        {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_unknown});
+  }
+  llvm::Value *NextIVal =
+      CGF.Builder.CreateNUWSub(IVal, llvm::ConstantInt::get(CGF.SizeTy, 1));
+  IVal->addIncoming(NextIVal, CGF.Builder.GetInsertBlock());
+  CmpI = CGF.Builder.CreateICmpUGE(NextIVal, Pow2K);
+  CGF.Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
+  CGF.EmitBlock(InnerExitBB);
+  llvm::Value *Next =
+      CGF.Builder.CreateNUWAdd(Counter, llvm::ConstantInt::get(CGF.IntTy, 1));
+  Counter->addIncoming(Next, CGF.Builder.GetInsertBlock());
+  // pow2k <<= 1;
+  llvm::Value *NextPow2K = CGF.Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
+  Pow2K->addIncoming(NextPow2K, CGF.Builder.GetInsertBlock());
+  llvm::Value *Cmp = CGF.Builder.CreateICmpNE(Next, LogVal);
+  CGF.Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
+  auto DL1 = ApplyDebugLocation::CreateDefaultArtificial(CGF, S.getEndLoc());
+  CGF.EmitBlock(ExitBB);
+
+  CGF.OMPFirstScanLoop = false;
+  SecondGen(CGF);
+}
+
 void CodeGenFunction::EmitOMPForDirective(const OMPForDirective &S) {
   bool HasLastprivates = false;
   auto &&CodeGen = [&S, &HasLastprivates](CodeGenFunction &CGF,
                                           PrePostActionTy &) {
-    OMPCancelStackRAII CancelRegion(CGF, OMPD_for, S.hasCancel());
-    HasLastprivates = CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(),
-                                                 emitForLoopBounds,
-                                                 emitDispatchForLoopBounds);
+    if (llvm::any_of(S.getClausesOfKind<OMPReductionClause>(),
+                     [](const OMPReductionClause *C) {
+                       return C->getModifier() == OMPC_REDUCTION_inscan;
+                     })) {
+      const auto &&NumIteratorsGen = [&S](CodeGenFunction &CGF) {
+        OMPLocalDeclMapRAII Scope(CGF);
+        OMPLoopScope LoopScope(CGF, S);
+        return CGF.EmitScalarExpr(S.getNumIterations());
+      };
+      const auto &&FirstGen = [&S](CodeGenFunction &CGF) {
+        OMPCancelStackRAII CancelRegion(CGF, OMPD_for, S.hasCancel());
+        (void)CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(),
+                                         emitForLoopBounds,
+                                         emitDispatchForLoopBounds);
+        // Emit an implicit barrier at the end.
+        CGF.CGM.getOpenMPRuntime().emitBarrierCall(CGF, S.getBeginLoc(),
+                                                   OMPD_for);
+      };
+      const auto &&SecondGen = [&S, &HasLastprivates](CodeGenFunction &CGF) {
+        OMPCancelStackRAII CancelRegion(CGF, OMPD_for, S.hasCancel());
+        HasLastprivates = CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(),
+                                                     emitForLoopBounds,
+                                                     emitDispatchForLoopBounds);
+      };
+      emitScanBasedDirective(CGF, S, NumIteratorsGen, FirstGen, SecondGen);
+    } else {
+      OMPCancelStackRAII CancelRegion(CGF, OMPD_for, S.hasCancel());
+      HasLastprivates = CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(),
+                                                   emitForLoopBounds,
+                                                   emitDispatchForLoopBounds);
+    }
   };
   {
     auto LPCRegion =
@@ -3961,6 +4191,112 @@ void CodeGenFunction::EmitOMPDepobjDirective(const OMPDepobjDirective &S) {
   }
 }
 
+void CodeGenFunction::EmitOMPScanDirective(const OMPScanDirective &S) {
+  // Do not emit code for non-simd directives in simd-only mode.
+  if (getLangOpts().OpenMPSimd && !OMPParentLoopDirectiveForScan)
+    return;
+  const OMPExecutableDirective &ParentDir = *OMPParentLoopDirectiveForScan;
+  SmallVector<const Expr *, 4> Shareds;
+  SmallVector<const Expr *, 4> Privates;
+  SmallVector<const Expr *, 4> LHSs;
+  SmallVector<const Expr *, 4> RHSs;
+  SmallVector<const Expr *, 4> CopyOps;
+  SmallVector<const Expr *, 4> CopyArrayTemps;
+  SmallVector<const Expr *, 4> CopyArrayElems;
+  for (const auto *C : ParentDir.getClausesOfKind<OMPReductionClause>()) {
+    if (C->getModifier() != OMPC_REDUCTION_inscan)
+      continue;
+    Shareds.append(C->varlist_begin(), C->varlist_end());
+    Privates.append(C->privates().begin(), C->privates().end());
+    LHSs.append(C->lhs_exprs().begin(), C->lhs_exprs().end());
+    RHSs.append(C->rhs_exprs().begin(), C->rhs_exprs().end());
+    CopyOps.append(C->copy_ops().begin(), C->copy_ops().end());
+    CopyArrayTemps.append(C->copy_array_temps().begin(),
+                          C->copy_array_temps().end());
+    CopyArrayElems.append(C->copy_array_elems().begin(),
+                          C->copy_array_elems().end());
+  }
+  bool IsInclusive = S.hasClausesOfKind<OMPInclusiveClause>();
+  if (!IsInclusive) {
+    EmitBranch(BreakContinueStack.back().ContinueBlock.getBlock());
+    EmitBlock(OMPScanExitBlock);
+  }
+  if (OMPFirstScanLoop) {
+    // Emit buffer[i] = red; at the end of the input phase.
+    const auto *IVExpr = cast<OMPLoopDirective>(ParentDir)
+                             .getIterationVariable()
+                             ->IgnoreParenImpCasts();
+    LValue IdxLVal = EmitLValue(IVExpr);
+    llvm::Value *IdxVal = EmitLoadOfScalar(IdxLVal, IVExpr->getExprLoc());
+    IdxVal = Builder.CreateIntCast(IdxVal, SizeTy, /*isSigned=*/false);
+    for (unsigned I = 0, E = CopyArrayElems.size(); I < E; ++I) {
+      const Expr *PrivateExpr = Privates[I];
+      const Expr *OrigExpr = Shareds[I];
+      const Expr *CopyArrayElem = CopyArrayElems[I];
+      OpaqueValueMapping IdxMapping(
+          *this,
+          cast<OpaqueValueExpr>(
+              cast<ArraySubscriptExpr>(CopyArrayElem)->getIdx()),
+          RValue::get(IdxVal));
+      LValue DestLVal = EmitLValue(CopyArrayElem);
+      LValue SrcLVal = EmitLValue(OrigExpr);
+      EmitOMPCopy(PrivateExpr->getType(), DestLVal.getAddress(*this),
+                  SrcLVal.getAddress(*this),
+                  cast<VarDecl>(cast<DeclRefExpr>(LHSs[I])->getDecl()),
+                  cast<VarDecl>(cast<DeclRefExpr>(RHSs[I])->getDecl()),
+                  CopyOps[I]);
+    }
+  }
+  EmitBranch(BreakContinueStack.back().ContinueBlock.getBlock());
+  if (IsInclusive) {
+    EmitBlock(OMPScanExitBlock);
+    EmitBranch(BreakContinueStack.back().ContinueBlock.getBlock());
+  }
+  EmitBlock(OMPScanDispatch);
+  if (!OMPFirstScanLoop) {
+    // Emit red = buffer[i]; at the entrance to the scan phase.
+    const auto *IVExpr = cast<OMPLoopDirective>(ParentDir)
+                             .getIterationVariable()
+                             ->IgnoreParenImpCasts();
+    LValue IdxLVal = EmitLValue(IVExpr);
+    llvm::Value *IdxVal = EmitLoadOfScalar(IdxLVal, IVExpr->getExprLoc());
+    IdxVal = Builder.CreateIntCast(IdxVal, SizeTy, /*isSigned=*/false);
+    llvm::BasicBlock *ExclusiveExitBB = nullptr;
+    if (!IsInclusive) {
+      llvm::BasicBlock *ContBB = createBasicBlock("omp.exclusive.dec");
+      ExclusiveExitBB = createBasicBlock("omp.exclusive.copy.exit");
+      llvm::Value *Cmp = Builder.CreateIsNull(IdxVal);
+      Builder.CreateCondBr(Cmp, ExclusiveExitBB, ContBB);
+      EmitBlock(ContBB);
+      // Use idx - 1 iteration for exclusive scan.
+      IdxVal = Builder.CreateNUWSub(IdxVal, llvm::ConstantInt::get(SizeTy, 1));
+    }
+    for (unsigned I = 0, E = CopyArrayElems.size(); I < E; ++I) {
+      const Expr *PrivateExpr = Privates[I];
+      const Expr *OrigExpr = Shareds[I];
+      const Expr *CopyArrayElem = CopyArrayElems[I];
+      OpaqueValueMapping IdxMapping(
+          *this,
+          cast<OpaqueValueExpr>(
+              cast<ArraySubscriptExpr>(CopyArrayElem)->getIdx()),
+          RValue::get(IdxVal));
+      LValue SrcLVal = EmitLValue(CopyArrayElem);
+      LValue DestLVal = EmitLValue(OrigExpr);
+      EmitOMPCopy(PrivateExpr->getType(), DestLVal.getAddress(*this),
+                  SrcLVal.getAddress(*this),
+                  cast<VarDecl>(cast<DeclRefExpr>(LHSs[I])->getDecl()),
+                  cast<VarDecl>(cast<DeclRefExpr>(RHSs[I])->getDecl()),
+                  CopyOps[I]);
+    }
+    if (!IsInclusive) {
+      EmitBlock(ExclusiveExitBB);
+    }
+  }
+  EmitBranch((OMPFirstScanLoop == IsInclusive) ? OMPBeforeScanBlock
+                                               : OMPAfterScanBlock);
+  EmitBlock(OMPAfterScanBlock);
+}
+
 void CodeGenFunction::EmitOMPDistributeLoop(const OMPLoopDirective &S,
                                             const CodeGenLoopTy &CodeGenLoop,
                                             Expr *IncExpr) {
@@ -5950,6 +6286,10 @@ void CodeGenFunction::EmitOMPTargetUpdateDirective(
 
 void CodeGenFunction::EmitSimpleOMPExecutableDirective(
     const OMPExecutableDirective &D) {
+  if (const auto *SD = dyn_cast<OMPScanDirective>(&D)) {
+    EmitOMPScanDirective(*SD);
+    return;
+  }
   if (!D.hasAssociatedStmt() || !D.getAssociatedStmt())
     return;
   auto &&CodeGen = [&D](CodeGenFunction &CGF, PrePostActionTy &Action) {

diff  --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index e3dd462e5ba8..2b0ebad3088e 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -673,6 +673,32 @@ class CodeGenFunction : public CodeGenTypeCache {
 
   llvm::BasicBlock *getInvokeDestImpl();
 
+  /// Parent loop-based directive for scan directive.
+  const OMPExecutableDirective *OMPParentLoopDirectiveForScan = nullptr;
+  llvm::BasicBlock *OMPBeforeScanBlock = nullptr;
+  llvm::BasicBlock *OMPAfterScanBlock = nullptr;
+  llvm::BasicBlock *OMPScanExitBlock = nullptr;
+  llvm::BasicBlock *OMPScanDispatch = nullptr;
+  bool OMPFirstScanLoop = false;
+
+  /// Manages parent directive for scan directives.
+  class ParentLoopDirectiveForScanRegion {
+    CodeGenFunction &CGF;
+    const OMPExecutableDirective &ParentLoopDirectiveForScan;
+
+  public:
+    ParentLoopDirectiveForScanRegion(
+        CodeGenFunction &CGF,
+        const OMPExecutableDirective &ParentLoopDirectiveForScan)
+        : CGF(CGF),
+          ParentLoopDirectiveForScan(*CGF.OMPParentLoopDirectiveForScan) {
+      CGF.OMPParentLoopDirectiveForScan = &ParentLoopDirectiveForScan;
+    }
+    ~ParentLoopDirectiveForScanRegion() {
+      CGF.OMPParentLoopDirectiveForScan = &ParentLoopDirectiveForScan;
+    }
+  };
+
   template <class T>
   typename DominatingValue<T>::saved_type saveValueInCond(T value) {
     return DominatingValue<T>::save(*this, value);
@@ -3201,7 +3227,8 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// proper codegen in internal captured statement.
   ///
   void EmitOMPReductionClauseInit(const OMPExecutableDirective &D,
-                                  OMPPrivateScope &PrivateScope);
+                                  OMPPrivateScope &PrivateScope,
+                                  bool ForInscan = false);
   /// Emit final update of reduction values to original variables at
   /// the end of the directive.
   ///
@@ -3260,6 +3287,7 @@ class CodeGenFunction : public CodeGenTypeCache {
   void EmitOMPTaskgroupDirective(const OMPTaskgroupDirective &S);
   void EmitOMPFlushDirective(const OMPFlushDirective &S);
   void EmitOMPDepobjDirective(const OMPDepobjDirective &S);
+  void EmitOMPScanDirective(const OMPScanDirective &S);
   void EmitOMPOrderedDirective(const OMPOrderedDirective &S);
   void EmitOMPAtomicDirective(const OMPAtomicDirective &S);
   void EmitOMPTargetDirective(const OMPTargetDirective &S);

diff  --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index a35cb2dcf687..c689280e0ccd 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -9176,6 +9176,14 @@ StmtResult Sema::ActOnOpenMPScanDirective(ArrayRef<OMPClause *> Clauses,
          diag::err_omp_scan_single_clause_expected);
     return StmtError();
   }
+  // Check that scan directive is used in the scopeof the OpenMP loop body.
+  if (Scope *S = DSAStack->getCurScope()) {
+    Scope *ParentS = S->getParent();
+    if (!ParentS || ParentS->getParent() != ParentS->getBreakParent() ||
+        !ParentS->getBreakParent()->isOpenMPLoopScope())
+      return StmtError(Diag(StartLoc, diag::err_omp_orphaned_device_directive)
+                       << getOpenMPDirectiveName(OMPD_scan) << 5);
+  }
   // Check that only one instance of scan directives is used in the same outer
   // region.
   if (DSAStack->doesParentHasScanDirective()) {
@@ -14461,6 +14469,12 @@ struct ReductionData {
   SmallVector<Expr *, 8> RHSs;
   /// Reduction operation expression.
   SmallVector<Expr *, 8> ReductionOps;
+  /// inscan copy operation expressions.
+  SmallVector<Expr *, 8> InscanCopyOps;
+  /// inscan copy temp array expressions for prefix sums.
+  SmallVector<Expr *, 8> InscanCopyArrayTemps;
+  /// inscan copy temp array element expressions for prefix sums.
+  SmallVector<Expr *, 8> InscanCopyArrayElems;
   /// Taskgroup descriptors for the corresponding reduction items in
   /// in_reduction clauses.
   SmallVector<Expr *, 8> TaskgroupDescriptors;
@@ -14478,6 +14492,11 @@ struct ReductionData {
     LHSs.reserve(Size);
     RHSs.reserve(Size);
     ReductionOps.reserve(Size);
+    if (RedModifier == OMPC_REDUCTION_inscan) {
+      InscanCopyOps.reserve(Size);
+      InscanCopyArrayTemps.reserve(Size);
+      InscanCopyArrayElems.reserve(Size);
+    }
     TaskgroupDescriptors.reserve(Size);
     ExprCaptures.reserve(Size);
     ExprPostUpdates.reserve(Size);
@@ -14491,16 +14510,31 @@ struct ReductionData {
     RHSs.emplace_back(nullptr);
     ReductionOps.emplace_back(ReductionOp);
     TaskgroupDescriptors.emplace_back(nullptr);
+    if (RedModifier == OMPC_REDUCTION_inscan) {
+      InscanCopyOps.push_back(nullptr);
+      InscanCopyArrayTemps.push_back(nullptr);
+      InscanCopyArrayElems.push_back(nullptr);
+    }
   }
   /// Stores reduction data.
   void push(Expr *Item, Expr *Private, Expr *LHS, Expr *RHS, Expr *ReductionOp,
-            Expr *TaskgroupDescriptor) {
+            Expr *TaskgroupDescriptor, Expr *CopyOp, Expr *CopyArrayTemp,
+            Expr *CopyArrayElem) {
     Vars.emplace_back(Item);
     Privates.emplace_back(Private);
     LHSs.emplace_back(LHS);
     RHSs.emplace_back(RHS);
     ReductionOps.emplace_back(ReductionOp);
     TaskgroupDescriptors.emplace_back(TaskgroupDescriptor);
+    if (RedModifier == OMPC_REDUCTION_inscan) {
+      InscanCopyOps.push_back(CopyOp);
+      InscanCopyArrayTemps.push_back(CopyArrayTemp);
+      InscanCopyArrayElems.push_back(CopyArrayElem);
+    } else {
+      assert(CopyOp == nullptr && CopyArrayTemp == nullptr &&
+             CopyArrayElem == nullptr &&
+             "Copy operation must be used for inscan reductions only.");
+    }
   }
 };
 } // namespace
@@ -14893,11 +14927,11 @@ static bool actOnOMPReductionKindClause(
         if (isOpenMPTargetExecutionDirective(Stack->getCurrentDirective())) {
           S.Diag(ELoc, diag::err_omp_reduction_vla_unsupported) << !!OASE;
           S.Diag(ELoc, diag::note_vla_unsupported);
+          continue;
         } else {
           S.targetDiag(ELoc, diag::err_omp_reduction_vla_unsupported) << !!OASE;
           S.targetDiag(ELoc, diag::note_vla_unsupported);
         }
-        continue;
       }
       // For arrays/array sections only:
       // Create pseudo array type for private copy. The size for this array will
@@ -15102,6 +15136,40 @@ static bool actOnOMPReductionKindClause(
         continue;
     }
 
+    // Add copy operations for inscan reductions.
+    // LHS = RHS;
+    ExprResult CopyOpRes, TempArrayRes, TempArrayElem;
+    if (ClauseKind == OMPC_reduction &&
+        RD.RedModifier == OMPC_REDUCTION_inscan) {
+      ExprResult RHS = S.DefaultLvalueConversion(RHSDRE);
+      CopyOpRes = S.BuildBinOp(Stack->getCurScope(), ELoc, BO_Assign, LHSDRE,
+                               RHS.get());
+      if (!CopyOpRes.isUsable())
+        continue;
+      CopyOpRes =
+          S.ActOnFinishFullExpr(CopyOpRes.get(), /*DiscardedValue=*/true);
+      if (!CopyOpRes.isUsable())
+        continue;
+      // Build temp array for prefix sum.
+      auto *Dim = new (S.Context)
+          OpaqueValueExpr(ELoc, S.Context.getSizeType(), VK_RValue);
+      QualType ArrayTy =
+          S.Context.getVariableArrayType(PrivateTy, Dim, ArrayType::Normal,
+                                         /*IndexTypeQuals=*/0, {ELoc, ELoc});
+      VarDecl *TempArrayVD =
+          buildVarDecl(S, ELoc, ArrayTy, D->getName(),
+                       D->hasAttrs() ? &D->getAttrs() : nullptr);
+      // Add a constructor to the temp decl.
+      S.ActOnUninitializedDecl(TempArrayVD);
+      TempArrayRes = buildDeclRefExpr(S, TempArrayVD, ArrayTy, ELoc);
+      TempArrayElem =
+          S.DefaultFunctionArrayLvalueConversion(TempArrayRes.get());
+      auto *Idx = new (S.Context)
+          OpaqueValueExpr(ELoc, S.Context.getSizeType(), VK_RValue);
+      TempArrayElem = S.CreateBuiltinArraySubscriptExpr(TempArrayElem.get(),
+                                                        ELoc, Idx, ELoc);
+    }
+
     // OpenMP [2.15.4.6, Restrictions, p.2]
     // A list item that appears in an in_reduction clause of a task construct
     // must appear in a task_reduction clause of a construct associated with a
@@ -15203,7 +15271,8 @@ static bool actOnOMPReductionKindClause(
         Stack->addTaskgroupReductionData(D, ReductionIdRange, BOK);
     }
     RD.push(VarsExpr, PrivateDRE, LHSDRE, RHSDRE, ReductionOp.get(),
-            TaskgroupDescriptor);
+            TaskgroupDescriptor, CopyOpRes.get(), TempArrayRes.get(),
+            TempArrayElem.get());
   }
   return RD.Vars.empty();
 }
@@ -15246,7 +15315,8 @@ OMPClause *Sema::ActOnOpenMPReductionClause(
   return OMPReductionClause::Create(
       Context, StartLoc, LParenLoc, ModifierLoc, ColonLoc, EndLoc, Modifier,
       RD.Vars, ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId,
-      RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps,
+      RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps, RD.InscanCopyOps,
+      RD.InscanCopyArrayTemps, RD.InscanCopyArrayElems,
       buildPreInits(Context, RD.ExprCaptures),
       buildPostUpdate(*this, RD.ExprPostUpdates));
 }

diff  --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 1fc09da1dadf..2afa91578be5 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -11825,9 +11825,12 @@ OMPClause *OMPClauseReader::readClause() {
   case llvm::omp::OMPC_shared:
     C = OMPSharedClause::CreateEmpty(Context, Record.readInt());
     break;
-  case llvm::omp::OMPC_reduction:
-    C = OMPReductionClause::CreateEmpty(Context, Record.readInt());
+  case llvm::omp::OMPC_reduction: {
+    unsigned N = Record.readInt();
+    auto Modifier = Record.readEnum<OpenMPReductionClauseModifier>();
+    C = OMPReductionClause::CreateEmpty(Context, N, Modifier);
     break;
+  }
   case llvm::omp::OMPC_task_reduction:
     C = OMPTaskReductionClause::CreateEmpty(Context, Record.readInt());
     break;
@@ -12208,7 +12211,6 @@ void OMPClauseReader::VisitOMPReductionClause(OMPReductionClause *C) {
   C->setLParenLoc(Record.readSourceLocation());
   C->setModifierLoc(Record.readSourceLocation());
   C->setColonLoc(Record.readSourceLocation());
-  C->setModifier(Record.readEnum<OpenMPReductionClauseModifier>());
   NestedNameSpecifierLoc NNSL = Record.readNestedNameSpecifierLoc();
   DeclarationNameInfo DNI = Record.readDeclarationNameInfo();
   C->setQualifierLoc(NNSL);
@@ -12236,6 +12238,20 @@ void OMPClauseReader::VisitOMPReductionClause(OMPReductionClause *C) {
   for (unsigned i = 0; i != NumVars; ++i)
     Vars.push_back(Record.readSubExpr());
   C->setReductionOps(Vars);
+  if (C->getModifier() == OMPC_REDUCTION_inscan) {
+    Vars.clear();
+    for (unsigned i = 0; i != NumVars; ++i)
+      Vars.push_back(Record.readSubExpr());
+    C->setInscanCopyOps(Vars);
+    Vars.clear();
+    for (unsigned i = 0; i != NumVars; ++i)
+      Vars.push_back(Record.readSubExpr());
+    C->setInscanCopyArrayTemps(Vars);
+    Vars.clear();
+    for (unsigned i = 0; i != NumVars; ++i)
+      Vars.push_back(Record.readSubExpr());
+    C->setInscanCopyArrayElems(Vars);
+  }
 }
 
 void OMPClauseReader::VisitOMPTaskReductionClause(OMPTaskReductionClause *C) {

diff  --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 9d81e137f0bb..e1fbe566f796 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -6305,11 +6305,11 @@ void OMPClauseWriter::VisitOMPSharedClause(OMPSharedClause *C) {
 
 void OMPClauseWriter::VisitOMPReductionClause(OMPReductionClause *C) {
   Record.push_back(C->varlist_size());
+  Record.writeEnum(C->getModifier());
   VisitOMPClauseWithPostUpdate(C);
   Record.AddSourceLocation(C->getLParenLoc());
   Record.AddSourceLocation(C->getModifierLoc());
   Record.AddSourceLocation(C->getColonLoc());
-  Record.writeEnum(C->getModifier());
   Record.AddNestedNameSpecifierLoc(C->getQualifierLoc());
   Record.AddDeclarationNameInfo(C->getNameInfo());
   for (auto *VE : C->varlists())
@@ -6322,6 +6322,14 @@ void OMPClauseWriter::VisitOMPReductionClause(OMPReductionClause *C) {
     Record.AddStmt(E);
   for (auto *E : C->reduction_ops())
     Record.AddStmt(E);
+  if (C->getModifier() == clang::OMPC_REDUCTION_inscan) {
+    for (auto *E : C->copy_ops())
+      Record.AddStmt(E);
+    for (auto *E : C->copy_array_temps())
+      Record.AddStmt(E);
+    for (auto *E : C->copy_array_elems())
+      Record.AddStmt(E);
+  }
 }
 
 void OMPClauseWriter::VisitOMPTaskReductionClause(OMPTaskReductionClause *C) {

diff  --git a/clang/test/OpenMP/for_scan_codegen.cpp b/clang/test/OpenMP/for_scan_codegen.cpp
new file mode 100644
index 000000000000..9905e4a67f77
--- /dev/null
+++ b/clang/test/OpenMP/for_scan_codegen.cpp
@@ -0,0 +1,311 @@
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+void foo();
+void bar();
+
+// CHECK: define void @{{.*}}baz{{.*}}(i32 %n)
+void baz(int n) {
+  static float a[10];
+  static double b;
+  // CHECK: call i8* @llvm.stacksave()
+  // CHECK: [[A_BUF_SIZE:%.+]] = mul nuw i64 10, [[NUM_ELEMS:%[^,]+]]
+
+  // float a_buffer[10][n];
+  // CHECK: [[A_BUF:%.+]] = alloca float, i64 [[A_BUF_SIZE]],
+
+  // double b_buffer[10];
+  // CHECK: [[B_BUF:%.+]] = alloca double, i64 10,
+#pragma omp for reduction(inscan, +:a[:n], b)
+  for (int i = 0; i < 10; ++i) {
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call i8* @llvm.stacksave()
+    // CHECK: store float 0.000000e+00, float* %
+    // CHECK: store double 0.000000e+00, double* [[B_PRIV_ADDR:%.+]],
+    // CHECK: br label %[[DISPATCH:[^,]+]]
+    // CHECK: [[INPUT_PHASE:.+]]:
+    // CHECK: call void @{{.+}}foo{{.+}}()
+
+    // a_buffer[i][0..n] = a_priv[[0..n];
+    // CHECK: [[BASE_IDX_I:%.+]] = load i32, i32* [[IV_ADDR:%.+]],
+    // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]]
+    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[A_PRIV_ADDR:%.+]], i64 0, i64 0
+    // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4
+    // CHECK: [[DEST:%.+]] = bitcast float* [[A_BUF_IDX]] to i8*
+    // CHECK: [[SRC:%.+]] = bitcast float* [[A_PRIV]] to i8*
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}[[DEST]], i8* {{.*}}[[SRC]], i64 [[BYTES]], i1 false)
+
+    // b_buffer[i] = b_priv;
+    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[BASE_IDX]]
+    // CHECK: [[B_PRIV:%.+]] = load double, double* [[B_PRIV_ADDR]],
+    // CHECK: store double [[B_PRIV]], double* [[B_BUF_IDX]],
+    // CHECK: br label %[[LOOP_CONTINUE:.+]]
+
+    // CHECK: [[DISPATCH]]:
+    // CHECK: br label %[[INPUT_PHASE]]
+    // CHECK: [[LOOP_CONTINUE]]:
+    // CHECK: call void @llvm.stackrestore(i8* %
+    // CHECK: call void @__kmpc_for_static_fini(
+    // CHECK: call void @__kmpc_barrier(
+    foo();
+#pragma omp scan inclusive(a[:n], b)
+    // CHECK: [[LOG2_10:%.+]] = call double @llvm.log2.f64(double 1.000000e+01)
+    // CHECK: [[CEIL_LOG2_10:%.+]] = call double @llvm.ceil.f64(double [[LOG2_10]])
+    // CHECK: [[CEIL_LOG2_10_INT:%.+]] = fptoui double [[CEIL_LOG2_10]] to i32
+    // CHECK: br label %[[OUTER_BODY:[^,]+]]
+    // CHECK: [[OUTER_BODY]]:
+    // CHECK: [[K:%.+]] = phi i32 [ 0, %{{.+}} ], [ [[K_NEXT:%.+]], %{{.+}} ]
+    // CHECK: [[K2POW:%.+]] = phi i64 [ 1, %{{.+}} ], [ [[K2POW_NEXT:%.+]], %{{.+}} ]
+    // CHECK: [[CMP:%.+]] = icmp uge i64 9, [[K2POW]]
+    // CHECK: br i1 [[CMP]], label %[[INNER_BODY:[^,]+]], label %[[INNER_EXIT:[^,]+]]
+    // CHECK: [[INNER_BODY]]:
+    // CHECK: [[I:%.+]] = phi i64 [ 9, %[[OUTER_BODY]] ], [ [[I_PREV:%.+]], %{{.+}} ]
+
+    // a_buffer[i] += a_buffer[i-pow(2, k)];
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]]
+    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]]
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]]
+    // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[I]]
+    // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]]
+    // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[IDX_SUB_K2POW]]
+    // CHECK: [[A_BUF_END:%.+]] = getelementptr float, float* [[A_BUF_IDX]], i64 [[NUM_ELEMS]]
+    // CHECK: [[ISEMPTY:%.+]] = icmp eq float* [[A_BUF_IDX]], [[A_BUF_END]]
+    // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]]
+    // CHECK: [[RED_BODY]]:
+    // CHECK: [[A_BUF_IDX_SUB_K2POW_ELEM:%.+]] = phi float* [ [[A_BUF_IDX_SUB_K2POW]], %[[INNER_BODY]] ], [ [[A_BUF_IDX_SUB_K2POW_NEXT:%.+]], %[[RED_BODY]] ]
+    // CHECK: [[A_BUF_IDX_ELEM:%.+]] = phi float* [ [[A_BUF_IDX]], %[[INNER_BODY]] ], [ [[A_BUF_IDX_NEXT:%.+]], %[[RED_BODY]] ]
+    // CHECK: [[A_BUF_IDX_VAL:%.+]] = load float, float* [[A_BUF_IDX_ELEM]],
+    // CHECK: [[A_BUF_IDX_SUB_K2POW_VAL:%.+]] = load float, float* [[A_BUF_IDX_SUB_K2POW_ELEM]],
+    // CHECK: [[RED:%.+]] = fadd float [[A_BUF_IDX_VAL]], [[A_BUF_IDX_SUB_K2POW_VAL]]
+    // CHECK: store float [[RED]], float* [[A_BUF_IDX_ELEM]],
+    // CHECK: [[A_BUF_IDX_NEXT]] = getelementptr float, float* [[A_BUF_IDX_ELEM]], i32 1
+    // CHECK: [[A_BUF_IDX_SUB_K2POW_NEXT]] = getelementptr float, float* [[A_BUF_IDX_SUB_K2POW_ELEM]], i32 1
+    // CHECK: [[DONE:%.+]] = icmp eq float* [[A_BUF_IDX_NEXT]], [[A_BUF_END]]
+    // CHECK: br i1 [[DONE]], label %[[RED_DONE]], label %[[RED_BODY]]
+    // CHECK: [[RED_DONE]]:
+
+    // b_buffer[i] += b_buffer[i-pow(2, k)];
+    // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, double* [[B_BUF_IDX]],
+    // CHECK: [[B_BUF_IDX_SUB_K2POW_VAL:%.+]] = load double, double* [[B_BUF_IDX_SUB_K2POW]],
+    // CHECK: [[RED:%.+]] = fadd double [[B_BUF_IDX_VAL]], [[B_BUF_IDX_SUB_K2POW_VAL]]
+    // CHECK: store double [[RED]], double* [[B_BUF_IDX]],
+
+    // --i;
+    // CHECK: [[I_PREV:%.+]] = sub nuw i64 [[I]], 1
+    // CHECK: [[CMP:%.+]] = icmp uge i64 [[I_PREV]], [[K2POW]]
+    // CHECK: br i1 [[CMP]], label %[[INNER_BODY]], label %[[INNER_EXIT]]
+    // CHECK: [[INNER_EXIT]]:
+
+    // ++k;
+    // CHECK: [[K_NEXT]] = add nuw i32 [[K]], 1
+    // k2pow <<= 1;
+    // CHECK: [[K2POW_NEXT]] = shl nuw i64 [[K2POW]], 1
+    // CHECK: [[CMP:%.+]] = icmp ne i32 [[K_NEXT]], [[CEIL_LOG2_10_INT]]
+    // CHECK: br i1 [[CMP]], label %[[OUTER_BODY]], label %[[OUTER_EXIT:[^,]+]]
+    // CHECK: [[OUTER_EXIT]]:
+    bar();
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call i8* @llvm.stacksave()
+    // CHECK: store float 0.000000e+00, float* %
+    // CHECK: store double 0.000000e+00, double* [[B_PRIV_ADDR:%.+]],
+    // CHECK: br label %[[DISPATCH:[^,]+]]
+
+    // Skip the before scan body.
+    // CHECK: call void @{{.+}}foo{{.+}}()
+
+    // CHECK: [[EXIT_INSCAN:[^,]+]]:
+    // CHECK: br label %[[LOOP_CONTINUE:[^,]+]]
+
+    // CHECK: [[DISPATCH]]:
+    // a_priv[[0..n] = a_buffer[i][0..n];
+    // CHECK: [[BASE_IDX_I:%.+]] = load i32, i32* [[IV_ADDR:%.+]],
+    // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]]
+    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[A_PRIV_ADDR:%.+]], i64 0, i64 0
+    // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4
+    // CHECK: [[DEST:%.+]] = bitcast float* [[A_PRIV]] to i8*
+    // CHECK: [[SRC:%.+]] = bitcast float* [[A_BUF_IDX]] to i8*
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}[[DEST]], i8* {{.*}}[[SRC]], i64 [[BYTES]], i1 false)
+
+    // b_priv = b_buffer[i];
+    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[BASE_IDX]]
+    // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, double* [[B_BUF_IDX]],
+    // CHECK: store double [[B_BUF_IDX_VAL]], double* [[B_PRIV_ADDR]],
+    // CHECK: br label %[[SCAN_PHASE:[^,]+]]
+
+    // CHECK: [[SCAN_PHASE]]:
+    // CHECK: call void @{{.+}}bar{{.+}}()
+    // CHECK: br label %[[EXIT_INSCAN]]
+
+    // CHECK: [[LOOP_CONTINUE]]:
+    // CHECK: call void @llvm.stackrestore(i8* %
+    // CHECK: call void @__kmpc_for_static_fini(
+    // CHECK: call void @llvm.stackrestore(i8*
+    // CHECK: call void @__kmpc_barrier(
+  }
+
+  // CHECK: call i8* @llvm.stacksave()
+  // CHECK: [[A_BUF_SIZE:%.+]] = mul nuw i64 10, [[NUM_ELEMS:%[^,]+]]
+
+  // float a_buffer[10][n];
+  // CHECK: [[A_BUF:%.+]] = alloca float, i64 [[A_BUF_SIZE]],
+
+  // double b_buffer[10];
+  // CHECK: [[B_BUF:%.+]] = alloca double, i64 10,
+#pragma omp for reduction(inscan, +:a[:n], b)
+  for (int i = 0; i < 10; ++i) {
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call i8* @llvm.stacksave()
+    // CHECK: store float 0.000000e+00, float* %
+    // CHECK: store double 0.000000e+00, double* [[B_PRIV_ADDR:%.+]],
+    // CHECK: br label %[[DISPATCH:[^,]+]]
+
+    // Skip the before scan body.
+    // CHECK: call void @{{.+}}foo{{.+}}()
+
+    // CHECK: [[EXIT_INSCAN:[^,]+]]:
+
+    // a_buffer[i][0..n] = a_priv[[0..n];
+    // CHECK: [[BASE_IDX_I:%.+]] = load i32, i32* [[IV_ADDR:%.+]],
+    // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]]
+    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[A_PRIV_ADDR:%.+]], i64 0, i64 0
+    // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4
+    // CHECK: [[DEST:%.+]] = bitcast float* [[A_BUF_IDX]] to i8*
+    // CHECK: [[SRC:%.+]] = bitcast float* [[A_PRIV]] to i8*
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}[[DEST]], i8* {{.*}}[[SRC]], i64 [[BYTES]], i1 false)
+
+    // b_buffer[i] = b_priv;
+    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[BASE_IDX]]
+    // CHECK: [[B_PRIV:%.+]] = load double, double* [[B_PRIV_ADDR]],
+    // CHECK: store double [[B_PRIV]], double* [[B_BUF_IDX]],
+    // CHECK: br label %[[LOOP_CONTINUE:[^,]+]]
+
+    // CHECK: [[DISPATCH]]:
+    // CHECK: br label %[[INPUT_PHASE:[^,]+]]
+
+    // CHECK: [[INPUT_PHASE]]:
+    // CHECK: call void @{{.+}}bar{{.+}}()
+    // CHECK: br label %[[EXIT_INSCAN]]
+
+    // CHECK: [[LOOP_CONTINUE]]:
+    // CHECK: call void @llvm.stackrestore(i8* %
+    // CHECK: call void @__kmpc_for_static_fini(
+    // CHECK: call void @__kmpc_barrier(
+    foo();
+#pragma omp scan exclusive(a[:n], b)
+    // CHECK: [[LOG2_10:%.+]] = call double @llvm.log2.f64(double 1.000000e+01)
+    // CHECK: [[CEIL_LOG2_10:%.+]] = call double @llvm.ceil.f64(double [[LOG2_10]])
+    // CHECK: [[CEIL_LOG2_10_INT:%.+]] = fptoui double [[CEIL_LOG2_10]] to i32
+    // CHECK: br label %[[OUTER_BODY:[^,]+]]
+    // CHECK: [[OUTER_BODY]]:
+    // CHECK: [[K:%.+]] = phi i32 [ 0, %{{.+}} ], [ [[K_NEXT:%.+]], %{{.+}} ]
+    // CHECK: [[K2POW:%.+]] = phi i64 [ 1, %{{.+}} ], [ [[K2POW_NEXT:%.+]], %{{.+}} ]
+    // CHECK: [[CMP:%.+]] = icmp uge i64 9, [[K2POW]]
+    // CHECK: br i1 [[CMP]], label %[[INNER_BODY:[^,]+]], label %[[INNER_EXIT:[^,]+]]
+    // CHECK: [[INNER_BODY]]:
+    // CHECK: [[I:%.+]] = phi i64 [ 9, %[[OUTER_BODY]] ], [ [[I_PREV:%.+]], %{{.+}} ]
+
+    // a_buffer[i] += a_buffer[i-pow(2, k)];
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]]
+    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]]
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]]
+    // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[I]]
+    // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]]
+    // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[IDX_SUB_K2POW]]
+    // CHECK: [[A_BUF_END:%.+]] = getelementptr float, float* [[A_BUF_IDX]], i64 [[NUM_ELEMS]]
+    // CHECK: [[ISEMPTY:%.+]] = icmp eq float* [[A_BUF_IDX]], [[A_BUF_END]]
+    // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]]
+    // CHECK: [[RED_BODY]]:
+    // CHECK: [[A_BUF_IDX_SUB_K2POW_ELEM:%.+]] = phi float* [ [[A_BUF_IDX_SUB_K2POW]], %[[INNER_BODY]] ], [ [[A_BUF_IDX_SUB_K2POW_NEXT:%.+]], %[[RED_BODY]] ]
+    // CHECK: [[A_BUF_IDX_ELEM:%.+]] = phi float* [ [[A_BUF_IDX]], %[[INNER_BODY]] ], [ [[A_BUF_IDX_NEXT:%.+]], %[[RED_BODY]] ]
+    // CHECK: [[A_BUF_IDX_VAL:%.+]] = load float, float* [[A_BUF_IDX_ELEM]],
+    // CHECK: [[A_BUF_IDX_SUB_K2POW_VAL:%.+]] = load float, float* [[A_BUF_IDX_SUB_K2POW_ELEM]],
+    // CHECK: [[RED:%.+]] = fadd float [[A_BUF_IDX_VAL]], [[A_BUF_IDX_SUB_K2POW_VAL]]
+    // CHECK: store float [[RED]], float* [[A_BUF_IDX_ELEM]],
+    // CHECK: [[A_BUF_IDX_NEXT]] = getelementptr float, float* [[A_BUF_IDX_ELEM]], i32 1
+    // CHECK: [[A_BUF_IDX_SUB_K2POW_NEXT]] = getelementptr float, float* [[A_BUF_IDX_SUB_K2POW_ELEM]], i32 1
+    // CHECK: [[DONE:%.+]] = icmp eq float* [[A_BUF_IDX_NEXT]], [[A_BUF_END]]
+    // CHECK: br i1 [[DONE]], label %[[RED_DONE]], label %[[RED_BODY]]
+    // CHECK: [[RED_DONE]]:
+
+    // b_buffer[i] += b_buffer[i-pow(2, k)];
+    // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, double* [[B_BUF_IDX]],
+    // CHECK: [[B_BUF_IDX_SUB_K2POW_VAL:%.+]] = load double, double* [[B_BUF_IDX_SUB_K2POW]],
+    // CHECK: [[RED:%.+]] = fadd double [[B_BUF_IDX_VAL]], [[B_BUF_IDX_SUB_K2POW_VAL]]
+    // CHECK: store double [[RED]], double* [[B_BUF_IDX]],
+
+    // --i;
+    // CHECK: [[I_PREV:%.+]] = sub nuw i64 [[I]], 1
+    // CHECK: [[CMP:%.+]] = icmp uge i64 [[I_PREV]], [[K2POW]]
+    // CHECK: br i1 [[CMP]], label %[[INNER_BODY]], label %[[INNER_EXIT]]
+    // CHECK: [[INNER_EXIT]]:
+
+    // ++k;
+    // CHECK: [[K_NEXT]] = add nuw i32 [[K]], 1
+    // k2pow <<= 1;
+    // CHECK: [[K2POW_NEXT]] = shl nuw i64 [[K2POW]], 1
+    // CHECK: [[CMP:%.+]] = icmp ne i32 [[K_NEXT]], [[CEIL_LOG2_10_INT]]
+    // CHECK: br i1 [[CMP]], label %[[OUTER_BODY]], label %[[OUTER_EXIT:[^,]+]]
+    // CHECK: [[OUTER_EXIT]]:
+    bar();
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call i8* @llvm.stacksave()
+    // CHECK: store float 0.000000e+00, float* %
+    // CHECK: store double 0.000000e+00, double* [[B_PRIV_ADDR:%.+]],
+    // CHECK: br label %[[DISPATCH:[^,]+]]
+
+    // CHECK: [[SCAN_PHASE:.+]]:
+    // CHECK: call void @{{.+}}foo{{.+}}()
+    // CHECK: br label %[[LOOP_CONTINUE:.+]]
+
+    // CHECK: [[DISPATCH]]:
+    // if (i >0)
+    //   a_priv[[0..n] = a_buffer[i-1][0..n];
+    // CHECK: [[BASE_IDX_I:%.+]] = load i32, i32* [[IV_ADDR:%.+]],
+    // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64
+    // CHECK: [[CMP:%.+]] = icmp eq i64 [[BASE_IDX]], 0
+    // CHECK: br i1 [[CMP]], label %[[IF_DONE:[^,]+]], label %[[IF_THEN:[^,]+]]
+    // CHECK: [[IF_THEN]]:
+    // CHECK: [[BASE_IDX_SUB_1:%.+]] = sub nuw i64 [[BASE_IDX]], 1
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX_SUB_1]], [[NUM_ELEMS]]
+    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[A_PRIV_ADDR:%.+]], i64 0, i64 0
+    // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4
+    // CHECK: [[DEST:%.+]] = bitcast float* [[A_PRIV]] to i8*
+    // CHECK: [[SRC:%.+]] = bitcast float* [[A_BUF_IDX]] to i8*
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}[[DEST]], i8* {{.*}}[[SRC]], i64 [[BYTES]], i1 false)
+
+    // b_priv = b_buffer[i];
+    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[BASE_IDX_SUB_1]]
+    // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, double* [[B_BUF_IDX]],
+    // CHECK: store double [[B_BUF_IDX_VAL]], double* [[B_PRIV_ADDR]],
+    // CHECK: br label %[[SCAN_PHASE]]
+
+    // CHECK: [[LOOP_CONTINUE]]:
+    // CHECK: call void @llvm.stackrestore(i8* %
+    // CHECK: call void @__kmpc_for_static_fini(
+    // CHECK: call void @llvm.stackrestore(i8*
+    // CHECK: call void @__kmpc_barrier(
+  }
+}
+
+#endif
+

diff  --git a/clang/test/OpenMP/scan_messages.cpp b/clang/test/OpenMP/scan_messages.cpp
index 9ce8314511c1..ebbc4fcc8f19 100644
--- a/clang/test/OpenMP/scan_messages.cpp
+++ b/clang/test/OpenMP/scan_messages.cpp
@@ -19,32 +19,32 @@ T tmain() {
 #pragma omp for simd reduction(inscan, +: argc)
   for (int i = 0; i < 10; ++i)
     if (argc)
-#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}}
+#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
     if (argc) {
 #pragma omp scan inclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
     }
 #pragma omp simd reduction(inscan, +: argc)
   for (int i = 0; i < 10; ++i)
   while (argc)
-#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}}
+#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
     while (argc) {
 #pragma omp scan inclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
     }
 #pragma omp simd reduction(inscan, +: argc)
   for (int i = 0; i < 10; ++i)
   do
-#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}}
+#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
     while (argc)
       ;
-#pragma omp simd reduction(inscan, +: argc)
+#pragma omp simd reduction(inscan, +: argc) // expected-error {{the inscan reduction list item must appear as a list item in an 'inclusive' or 'exclusive' clause on an inner 'omp scan' directive}}
   for (int i = 0; i < 10; ++i)
   do {
-#pragma omp scan inclusive(argc)
+#pragma omp scan inclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
   } while (argc);
 #pragma omp simd reduction(inscan, +: argc)
   for (int i = 0; i < 10; ++i)
   switch (argc)
-#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}}
+#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
     switch (argc)
     case 1:
 #pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
@@ -52,21 +52,21 @@ T tmain() {
   case 1: {
 #pragma omp scan inclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
   }
-#pragma omp simd reduction(inscan, +: argc)
+#pragma omp simd reduction(inscan, +: argc) // expected-error {{the inscan reduction list item must appear as a list item in an 'inclusive' or 'exclusive' clause on an inner 'omp scan' directive}}
   for (int i = 0; i < 10; ++i)
   switch (argc) {
-#pragma omp scan exclusive(argc) // expected-note 2 {{previous 'scan' directive used here}}
+#pragma omp scan exclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
   case 1:
-#pragma omp scan exclusive(argc) // expected-error {{exactly one 'scan' directive must appear in the loop body of an enclosing directive}}
+#pragma omp scan exclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
     break;
   default: {
-#pragma omp scan exclusive(argc) // expected-error {{exactly one 'scan' directive must appear in the loop body of an enclosing directive}}
+#pragma omp scan exclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
   } break;
   }
 #pragma omp simd reduction(inscan, +: argc)
   for (int i = 0; i < 10; ++i)
   for (;;)
-#pragma omp scan exclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}}
+#pragma omp scan exclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
     for (;;) {
 #pragma omp scan exclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
     }
@@ -77,8 +77,10 @@ T tmain() {
   }
 #pragma omp simd reduction(inscan, +: argc)
   for (int i = 0; i < 10; ++i) {
+#pragma omp scan inclusive(argc) // expected-note {{previous 'scan' directive used here}}
+#pragma omp scan inclusive(argc) // expected-error {{exactly one 'scan' directive must appear in the loop body of an enclosing directive}}
 label1 : {
-#pragma omp scan inclusive(argc)
+#pragma omp scan inclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
 }}
 
   return T();
@@ -109,32 +111,32 @@ int main() {
 #pragma omp simd reduction(inscan, +: argc)
   for (int i = 0; i < 10; ++i)
   if (argc)
-#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}}
+#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
     if (argc) {
 #pragma omp scan inclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}} expected-error {{the list item must appear in 'reduction' clause with the 'inscan' modifier of the parent directive}}
     }
 #pragma omp simd reduction(inscan, +: argc)
   for (int i = 0; i < 10; ++i)
   while (argc)
-#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}}
+#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
     while (argc) {
 #pragma omp scan inclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}} expected-error {{the list item must appear in 'reduction' clause with the 'inscan' modifier of the parent directive}}
     }
 #pragma omp simd reduction(inscan, +: argc)
   for (int i = 0; i < 10; ++i)
   do
-#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}}
+#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
     while (argc)
       ;
 #pragma omp simd reduction(inscan, +: argc)
   for (int i = 0; i < 10; ++i)
   do {
-#pragma omp scan exclusive(argc)
+#pragma omp scan exclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
   } while (argc);
 #pragma omp simd reduction(inscan, +: argc)
   for (int i = 0; i < 10; ++i)
   switch (argc)
-#pragma omp scan exclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}}
+#pragma omp scan exclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
     switch (argc)
     case 1:
 #pragma omp scan exclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}} expected-error {{the list item must appear in 'reduction' clause with the 'inscan' modifier of the parent directive}}
@@ -145,18 +147,18 @@ int main() {
 #pragma omp simd reduction(inscan, +: argc)
   for (int i = 0; i < 10; ++i)
   switch (argc) {
-#pragma omp scan inclusive(argc) // expected-note 2 {{previous 'scan' directive used here}}
+#pragma omp scan inclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
   case 1:
-#pragma omp scan inclusive(argc) // expected-error {{exactly one 'scan' directive must appear in the loop body of an enclosing directive}}
+#pragma omp scan inclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
     break;
   default: {
-#pragma omp scan inclusive(argc) // expected-error {{exactly one 'scan' directive must appear in the loop body of an enclosing directive}}
+#pragma omp scan inclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
   } break;
   }
 #pragma omp simd reduction(inscan, +: argc)
   for (int i = 0; i < 10; ++i)
   for (;;)
-#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}}
+#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
     for (;;) {
 #pragma omp scan inclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}} expected-error {{the list item must appear in 'reduction' clause with the 'inscan' modifier of the parent directive}}
     }
@@ -167,10 +169,12 @@ int main() {
   }
 #pragma omp simd reduction(inscan, +: argc)
   for (int i = 0; i < 10; ++i) {
+#pragma omp scan inclusive(argc) // expected-note {{previous 'scan' directive used here}}
+#pragma omp scan inclusive(argc) // expected-error {{exactly one 'scan' directive must appear in the loop body of an enclosing directive}}
 label1 : {
-#pragma omp scan inclusive(argc)
+#pragma omp scan inclusive(argc) // expected-error {{orphaned 'omp scan' directives are prohibited; perhaps you forget to enclose the directive into a for, simd, for simd, parallel for, or parallel for simd region?}}
 }
 }
 
-  return tmain<int>();
+  return tmain<int>(); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}}
 }

diff  --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index bff23f52b459..4a65624268d8 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2376,6 +2376,17 @@ void OMPClauseEnqueue::VisitOMPReductionClause(const OMPReductionClause *C) {
   for (auto *E : C->reduction_ops()) {
     Visitor->AddStmt(E);
   }
+  if (C->getModifier() == clang::OMPC_REDUCTION_inscan) {
+    for (auto *E : C->copy_ops()) {
+      Visitor->AddStmt(E);
+    }
+    for (auto *E : C->copy_array_temps()) {
+      Visitor->AddStmt(E);
+    }
+    for (auto *E : C->copy_array_elems()) {
+      Visitor->AddStmt(E);
+    }
+  }
 }
 void OMPClauseEnqueue::VisitOMPTaskReductionClause(
     const OMPTaskReductionClause *C) {


        


More information about the cfe-commits mailing list