[clang] 0860db9 - [OPENMP50]Codegen for nontemporal clause.

Alexey Bataev via cfe-commits cfe-commits at lists.llvm.org
Mon Dec 23 07:10:45 PST 2019


Author: Alexey Bataev
Date: 2019-12-23T10:04:46-05:00
New Revision: 0860db966a7d2ab61b26e41426a55189986924b4

URL: https://github.com/llvm/llvm-project/commit/0860db966a7d2ab61b26e41426a55189986924b4
DIFF: https://github.com/llvm/llvm-project/commit/0860db966a7d2ab61b26e41426a55189986924b4.diff

LOG: [OPENMP50]Codegen for nontemporal clause.

Summary:
Basic codegen for the declarations marked as nontemporal. Also, if the
base declaration in the member expression is marked as nontemporal,
lvalue for member decl access inherits nonteporal flag from the base
lvalue.

Reviewers: rjmccall, hfinkel, jdoerfert

Subscribers: guansong, arphaman, caomhin, kkwli0, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D71708

Added: 
    

Modified: 
    clang/include/clang/AST/OpenMPClause.h
    clang/include/clang/AST/RecursiveASTVisitor.h
    clang/lib/AST/OpenMPClause.cpp
    clang/lib/AST/StmtProfile.cpp
    clang/lib/CodeGen/CGExpr.cpp
    clang/lib/CodeGen/CGOpenMPRuntime.cpp
    clang/lib/CodeGen/CGOpenMPRuntime.h
    clang/lib/CodeGen/CGStmtOpenMP.cpp
    clang/lib/Sema/SemaOpenMP.cpp
    clang/lib/Serialization/ASTReader.cpp
    clang/lib/Serialization/ASTWriter.cpp
    clang/test/OpenMP/distribute_simd_codegen.cpp
    clang/test/OpenMP/for_simd_codegen.cpp
    clang/test/OpenMP/simd_codegen.cpp
    clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
    clang/test/OpenMP/target_simd_codegen.cpp
    clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
    clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
    clang/test/OpenMP/teams_distribute_simd_codegen.cpp
    clang/tools/libclang/CIndex.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index c708b011849a..7cef0bc9092a 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -6275,6 +6275,15 @@ class OMPNontemporalClause final
             OMPC_nontemporal, SourceLocation(), SourceLocation(),
             SourceLocation(), N) {}
 
+  /// Get the list of privatied copies if the member expression was captured by
+  /// one of the privatization clauses.
+  MutableArrayRef<Expr *> getPrivateRefs() {
+    return MutableArrayRef<Expr *>(varlist_end(), varlist_size());
+  }
+  ArrayRef<const Expr *> getPrivateRefs() const {
+    return llvm::makeArrayRef(varlist_end(), varlist_size());
+  }
+
 public:
   /// Creates clause with a list of variables \a VL.
   ///
@@ -6293,6 +6302,10 @@ class OMPNontemporalClause final
   /// \param N The number of variables.
   static OMPNontemporalClause *CreateEmpty(const ASTContext &C, unsigned N);
 
+  /// Sets the list of references to private copies created in private clauses.
+  /// \param VL List of references.
+  void setPrivateRefs(ArrayRef<Expr *> VL);
+
   child_range children() {
     return child_range(reinterpret_cast<Stmt **>(varlist_begin()),
                        reinterpret_cast<Stmt **>(varlist_end()));
@@ -6303,6 +6316,16 @@ class OMPNontemporalClause final
     return const_child_range(Children.begin(), Children.end());
   }
 
+  child_range private_refs() {
+    return child_range(reinterpret_cast<Stmt **>(getPrivateRefs().begin()),
+                       reinterpret_cast<Stmt **>(getPrivateRefs().end()));
+  }
+
+  const_child_range private_refs() const {
+    auto Children = const_cast<OMPNontemporalClause *>(this)->private_refs();
+    return const_child_range(Children.begin(), Children.end());
+  }
+
   child_range used_children() {
     return child_range(child_iterator(), child_iterator());
   }

diff  --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index a93a01da34c1..19dd62b0fe0f 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -3378,6 +3378,9 @@ template <typename Derived>
 bool RecursiveASTVisitor<Derived>::VisitOMPNontemporalClause(
     OMPNontemporalClause *C) {
   TRY_TO(VisitOMPClauseList(C));
+  for (auto *E : C->private_refs()) {
+    TRY_TO(TraverseStmt(E));
+  }
   return true;
 }
 

diff  --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index f062c3e463a8..790b44012298 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -1162,8 +1162,8 @@ OMPNontemporalClause *OMPNontemporalClause::Create(const ASTContext &C,
                                                    SourceLocation LParenLoc,
                                                    SourceLocation EndLoc,
                                                    ArrayRef<Expr *> VL) {
-  // Allocate space for nontemporal variables.
-  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(VL.size()));
+  // Allocate space for nontemporal variables + private references.
+  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(2 * VL.size()));
   auto *Clause =
       new (Mem) OMPNontemporalClause(StartLoc, LParenLoc, EndLoc, VL.size());
   Clause->setVarRefs(VL);
@@ -1172,10 +1172,16 @@ OMPNontemporalClause *OMPNontemporalClause::Create(const ASTContext &C,
 
 OMPNontemporalClause *OMPNontemporalClause::CreateEmpty(const ASTContext &C,
                                                         unsigned N) {
-  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(N));
+  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(2 * N));
   return new (Mem) OMPNontemporalClause(N);
 }
 
+void OMPNontemporalClause::setPrivateRefs(ArrayRef<Expr *> VL) {
+  assert(VL.size() == varlist_size() && "Number of private references is not "
+                                        "the same as the preallocated buffer");
+  std::copy(VL.begin(), VL.end(), varlist_end());
+}
+
 //===----------------------------------------------------------------------===//
 //  OpenMP clauses printing methods
 //===----------------------------------------------------------------------===//

diff  --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 56a65e8deb98..2aa5106e90fa 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -769,10 +769,13 @@ void OMPClauseProfiler::VisitOMPIsDevicePtrClause(
     const OMPIsDevicePtrClause *C) {
   VisitOMPClauseList(C);
 }
-void OMPClauseProfiler::VisitOMPNontemporalClause(const OMPNontemporalClause *C) {
+void OMPClauseProfiler::VisitOMPNontemporalClause(
+    const OMPNontemporalClause *C) {
   VisitOMPClauseList(C);
+  for (auto *E : C->private_refs())
+    Profiler->VisitStmt(E);
 }
-}
+} // namespace
 
 void
 StmtProfiler::VisitOMPExecutableDirective(const OMPExecutableDirective *S) {

diff  --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index cd42faefab00..c729037852e2 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -2566,21 +2566,35 @@ LValue CodeGenFunction::EmitDeclRefLValue(const DeclRefExpr *E) {
       VD = VD->getCanonicalDecl();
       if (auto *FD = LambdaCaptureFields.lookup(VD))
         return EmitCapturedFieldLValue(*this, FD, CXXABIThisValue);
-      else if (CapturedStmtInfo) {
+      if (CapturedStmtInfo) {
         auto I = LocalDeclMap.find(VD);
         if (I != LocalDeclMap.end()) {
+          LValue CapLVal;
           if (VD->getType()->isReferenceType())
-            return EmitLoadOfReferenceLValue(I->second, VD->getType(),
-                                             AlignmentSource::Decl);
-          return MakeAddrLValue(I->second, T);
+            CapLVal = EmitLoadOfReferenceLValue(I->second, VD->getType(),
+                                                AlignmentSource::Decl);
+          else
+            CapLVal = MakeAddrLValue(I->second, T);
+          // Mark lvalue as nontemporal if the variable is marked as nontemporal
+          // in simd context.
+          if (getLangOpts().OpenMP &&
+              CGM.getOpenMPRuntime().isNontemporalDecl(VD))
+            CapLVal.setNontemporal(/*Value=*/true);
+          return CapLVal;
         }
         LValue CapLVal =
             EmitCapturedFieldLValue(*this, CapturedStmtInfo->lookup(VD),
                                     CapturedStmtInfo->getContextValue());
-        return MakeAddrLValue(
+        CapLVal = MakeAddrLValue(
             Address(CapLVal.getPointer(*this), getContext().getDeclAlign(VD)),
             CapLVal.getType(), LValueBaseInfo(AlignmentSource::Decl),
             CapLVal.getTBAAInfo());
+        // Mark lvalue as nontemporal if the variable is marked as nontemporal
+        // in simd context.
+        if (getLangOpts().OpenMP &&
+            CGM.getOpenMPRuntime().isNontemporalDecl(VD))
+          CapLVal.setNontemporal(/*Value=*/true);
+        return CapLVal;
       }
 
       assert(isa<BlockDecl>(CurCodeDecl));
@@ -3929,6 +3943,15 @@ LValue CodeGenFunction::EmitMemberExpr(const MemberExpr *E) {
   if (auto *Field = dyn_cast<FieldDecl>(ND)) {
     LValue LV = EmitLValueForField(BaseLV, Field);
     setObjCGCLValueClass(getContext(), E, LV);
+    if (getLangOpts().OpenMP) {
+      // If the member was explicitly marked as nontemporal, mark it as
+      // nontemporal. If the base lvalue is marked as nontemporal, mark access
+      // to children as nontemporal too.
+      if ((IsWrappedCXXThis(BaseExpr) &&
+           CGM.getOpenMPRuntime().isNontemporalDecl(Field)) ||
+          BaseLV.isNontemporal())
+        LV.setNontemporal(/*Value=*/true);
+    }
     return LV;
   }
 

diff  --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 1aae18b4504c..2a53e9993533 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -17,6 +17,7 @@
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/Decl.h"
+#include "clang/AST/OpenMPClause.h"
 #include "clang/AST/StmtOpenMP.h"
 #include "clang/Basic/BitmaskEnum.h"
 #include "clang/CodeGen/ConstantInitBuilder.h"
@@ -11341,6 +11342,46 @@ bool CGOpenMPRuntime::emitDeclareVariant(GlobalDecl GD, bool IsForDefinition) {
   return true;
 }
 
+CGOpenMPRuntime::NontemporalDeclsRAII::NontemporalDeclsRAII(
+    CodeGenModule &CGM, const OMPLoopDirective &S)
+    : CGM(CGM), NeedToPush(S.hasClausesOfKind<OMPNontemporalClause>()) {
+  assert(CGM.getLangOpts().OpenMP && "Not in OpenMP mode.");
+  if (!NeedToPush)
+    return;
+  NontemporalDeclsSet &DS =
+      CGM.getOpenMPRuntime().NontemporalDeclsStack.emplace_back();
+  for (const auto *C : S.getClausesOfKind<OMPNontemporalClause>()) {
+    for (const Stmt *Ref : C->private_refs()) {
+      const auto *SimpleRefExpr = cast<Expr>(Ref)->IgnoreParenImpCasts();
+      const ValueDecl *VD;
+      if (const auto *DRE = dyn_cast<DeclRefExpr>(SimpleRefExpr)) {
+        VD = DRE->getDecl();
+      } else {
+        const auto *ME = cast<MemberExpr>(SimpleRefExpr);
+        assert((ME->isImplicitCXXThis() ||
+                isa<CXXThisExpr>(ME->getBase()->IgnoreParenImpCasts())) &&
+               "Expected member of current class.");
+        VD = ME->getMemberDecl();
+      }
+      DS.insert(VD);
+    }
+  }
+}
+
+CGOpenMPRuntime::NontemporalDeclsRAII::~NontemporalDeclsRAII() {
+  if (!NeedToPush)
+    return;
+  CGM.getOpenMPRuntime().NontemporalDeclsStack.pop_back();
+}
+
+bool CGOpenMPRuntime::isNontemporalDecl(const ValueDecl *VD) const {
+  assert(CGM.getLangOpts().OpenMP && "Not in OpenMP mode.");
+
+  return llvm::any_of(
+      CGM.getOpenMPRuntime().NontemporalDeclsStack,
+      [VD](const NontemporalDeclsSet &Set) { return Set.count(VD) > 0; });
+}
+
 llvm::Function *CGOpenMPSIMDRuntime::emitParallelOutlinedFunction(
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {

diff  --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index d806c27fdcba..f1582457ed2d 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -211,6 +211,16 @@ class CGOpenMPRuntime {
     ~DisableAutoDeclareTargetRAII();
   };
 
+  /// Manages list of nontemporal decls for the specified directive.
+  class NontemporalDeclsRAII {
+    CodeGenModule &CGM;
+    const bool NeedToPush;
+
+  public:
+    NontemporalDeclsRAII(CodeGenModule &CGM, const OMPLoopDirective &S);
+    ~NontemporalDeclsRAII();
+  };
+
 protected:
   CodeGenModule &CGM;
   StringRef FirstSeparator, Separator;
@@ -650,6 +660,11 @@ class CGOpenMPRuntime {
                   std::pair<GlobalDecl, GlobalDecl>>
       DeferredVariantFunction;
 
+  using NontemporalDeclsSet = llvm::SmallDenseSet<CanonicalDeclPtr<const Decl>>;
+  /// Stack for list of declarations in current context marked as nontemporal.
+  /// The set is the union of all current stack elements.
+  llvm::SmallVector<NontemporalDeclsSet, 4> NontemporalDeclsStack;
+
   /// Flag for keeping track of weather a requires unified_shared_memory
   /// directive is present.
   bool HasRequiresUnifiedSharedMemory = false;
@@ -1663,6 +1678,10 @@ class CGOpenMPRuntime {
 
   /// Emits the definition of the declare variant function.
   virtual bool emitDeclareVariant(GlobalDecl GD, bool IsForDefinition);
+
+  /// Checks if the \p VD variable is marked as nontemporal declaration in
+  /// current context.
+  bool isNontemporalDecl(const ValueDecl *VD) const;
 };
 
 /// Class supports emissionof SIMD-only code.

diff  --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index e8a3790e3b5c..a2a59b64ddb3 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1803,8 +1803,9 @@ static LValue EmitOMPHelperVar(CodeGenFunction &CGF,
 static void emitCommonSimdLoop(CodeGenFunction &CGF, const OMPLoopDirective &S,
                                const RegionCodeGenTy &SimdInitGen,
                                const RegionCodeGenTy &BodyCodeGen) {
-  auto &&ThenGen = [&SimdInitGen, &BodyCodeGen](CodeGenFunction &CGF,
-                                                PrePostActionTy &) {
+  auto &&ThenGen = [&S, &SimdInitGen, &BodyCodeGen](CodeGenFunction &CGF,
+                                                    PrePostActionTy &) {
+    CGOpenMPRuntime::NontemporalDeclsRAII NontemporalsRegion(CGF.CGM, S);
     CodeGenFunction::OMPLocalDeclMapRAII Scope(CGF);
     SimdInitGen(CGF);
 

diff  --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 915bf53d06e3..7365fda62bee 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -2230,6 +2230,11 @@ void Sema::EndOpenMPClause() {
 
 static void checkAllocateClauses(Sema &S, DSAStackTy *Stack,
                                  ArrayRef<OMPClause *> Clauses);
+static std::pair<ValueDecl *, bool>
+getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc,
+               SourceRange &ERange, bool AllowArraySection = false);
+static DeclRefExpr *buildCapture(Sema &S, ValueDecl *D, Expr *CaptureExpr,
+                                 bool WithInit);
 
 void Sema::EndOpenMPDSABlock(Stmt *CurDirective) {
   // OpenMP [2.14.3.5, Restrictions, C/C++, p.1]
@@ -2274,6 +2279,31 @@ void Sema::EndOpenMPDSABlock(Stmt *CurDirective) {
           }
         }
         Clause->setPrivateCopies(PrivateCopies);
+        continue;
+      }
+      // Finalize nontemporal clause by handling private copies, if any.
+      if (auto *Clause = dyn_cast<OMPNontemporalClause>(C)) {
+        SmallVector<Expr *, 8> PrivateRefs;
+        for (Expr *RefExpr : Clause->varlists()) {
+          assert(RefExpr && "NULL expr in OpenMP nontemporal clause.");
+          SourceLocation ELoc;
+          SourceRange ERange;
+          Expr *SimpleRefExpr = RefExpr;
+          auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
+          if (Res.second)
+            // It will be analyzed later.
+            PrivateRefs.push_back(RefExpr);
+          ValueDecl *D = Res.first;
+          if (!D)
+            continue;
+
+          const DSAStackTy::DSAVarData DVar =
+              DSAStack->getTopDSA(D, /*FromParent=*/false);
+          PrivateRefs.push_back(DVar.PrivateCopy ? DVar.PrivateCopy
+                                                 : SimpleRefExpr);
+        }
+        Clause->setPrivateRefs(PrivateRefs);
+        continue;
       }
     }
     // Check allocate clauses.
@@ -4262,9 +4292,10 @@ static bool checkIfClauses(Sema &S, OpenMPDirectiveKind Kind,
   return ErrorFound;
 }
 
-static std::pair<ValueDecl *, bool>
-getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc,
-               SourceRange &ERange, bool AllowArraySection = false) {
+static std::pair<ValueDecl *, bool> getPrivateItem(Sema &S, Expr *&RefExpr,
+                                                   SourceLocation &ELoc,
+                                                   SourceRange &ERange,
+                                                   bool AllowArraySection) {
   if (RefExpr->isTypeDependent() || RefExpr->isValueDependent() ||
       RefExpr->containsUnexpandedParameterPack())
     return std::make_pair(nullptr, true);
@@ -17172,8 +17203,6 @@ OMPClause *Sema::ActOnOpenMPNontemporalClause(ArrayRef<Expr *> VarList,
     if (!D)
       continue;
 
-    auto *VD = dyn_cast<VarDecl>(D);
-
     // OpenMP 5.0, 2.9.3.1 simd Construct, Restrictions.
     // A list-item cannot appear in more than one nontemporal clause.
     if (const Expr *PrevRef =
@@ -17185,12 +17214,7 @@ OMPClause *Sema::ActOnOpenMPNontemporalClause(ArrayRef<Expr *> VarList,
       continue;
     }
 
-    DeclRefExpr *Ref = nullptr;
-    if (!VD && isOpenMPCapturedDecl(D) && !CurContext->isDependentContext())
-      Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
-    Vars.push_back((VD || !Ref || CurContext->isDependentContext())
-                       ? RefExpr->IgnoreParens()
-                       : Ref);
+    Vars.push_back(RefExpr);
   }
 
   if (Vars.empty())

diff  --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 9258a6fb1265..032ce888969e 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -12466,4 +12466,9 @@ void OMPClauseReader::VisitOMPNontemporalClause(OMPNontemporalClause *C) {
   for (unsigned i = 0; i != NumVars; ++i)
     Vars.push_back(Record.readSubExpr());
   C->setVarRefs(Vars);
+  Vars.clear();
+  Vars.reserve(NumVars);
+  for (unsigned i = 0; i != NumVars; ++i)
+    Vars.push_back(Record.readSubExpr());
+  C->setPrivateRefs(Vars);
 }

diff  --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index a632072d01bd..2e2ce0e875d0 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -6544,4 +6544,6 @@ void OMPClauseWriter::VisitOMPNontemporalClause(OMPNontemporalClause *C) {
   Record.AddSourceLocation(C->getLParenLoc());
   for (auto *VE : C->varlists())
     Record.AddStmt(VE);
+  for (auto *E : C->private_refs())
+    Record.AddStmt(E);
 }

diff  --git a/clang/test/OpenMP/distribute_simd_codegen.cpp b/clang/test/OpenMP/distribute_simd_codegen.cpp
index f0128a977d68..104b1ba91b66 100644
--- a/clang/test/OpenMP/distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_codegen.cpp
@@ -143,7 +143,7 @@ void static_not_chunked(float *a, float *b, float *c, float *d) {
   #pragma omp target
   #pragma omp teams
 #ifdef OMP5
-  #pragma omp distribute simd dist_schedule(static) safelen(32) if(simd: true)
+  #pragma omp distribute simd dist_schedule(static) safelen(32) if(simd: true) nontemporal(a, b)
 #else
   #pragma omp distribute simd dist_schedule(static) safelen(32)
 #endif // OMP5
@@ -189,6 +189,11 @@ void static_not_chunked(float *a, float *b, float *c, float *d) {
 // CHECK:  [[BBINNBODY]]:
 // CHECK:  {{.+}} = load i32, i32* [[IV]]
 // ... loop body ...
+// OMP45-NOT: !nontemporal
+// OMP50:  load float*,{{.*}}!nontemporal
+// OMP50:  load float*,{{.*}}!nontemporal
+// OMP50-NOT: !nontemporal
+
 // CHECK:  br label %[[BBBODYCONT:.+]]
 // CHECK:  [[BBBODYCONT]]:
 // CHECK:  br label %[[BBINNINC:.+]]
@@ -271,7 +276,7 @@ void test_precond() {
   #pragma omp target
   #pragma omp teams
 #ifdef OMP5
-  #pragma omp distribute simd linear(i) if(a)
+  #pragma omp distribute simd linear(i) if(a) nontemporal(i)
 #else
   #pragma omp distribute simd linear(i)
 #endif // OMP5
@@ -293,6 +298,9 @@ void test_precond() {
 // CHECK:  br i1 [[ACMP]], label %[[PRECOND_THEN:.+]], label %[[PRECOND_END:.+]]
 // CHECK:  [[PRECOND_THEN]]
 // CHECK:  call void @__kmpc_for_static_init_4
+// OMP45-NOT: !nontemporal
+// OMP50:  store i8 {{.*}}!nontemporal
+// OMP50-NOT: !nontemporal
 // CHECK:  call void @__kmpc_for_static_fini
 // CHECK:  [[PRECOND_END]]
 

diff  --git a/clang/test/OpenMP/for_simd_codegen.cpp b/clang/test/OpenMP/for_simd_codegen.cpp
index 4bc6b362f027..a668cb77a068 100644
--- a/clang/test/OpenMP/for_simd_codegen.cpp
+++ b/clang/test/OpenMP/for_simd_codegen.cpp
@@ -333,7 +333,7 @@ void simple(float *a, float *b, float *c, float *d) {
 // OMP50: br i1 [[COND]], label {{%?}}[[THEN:[^,]+]], label {{%?}}[[ELSE:[^,]+]]
 // OMP50: [[THEN]]:
 #ifdef OMP5
-  #pragma omp for simd reduction(*:R) if (simd:A)
+  #pragma omp for simd reduction(*:R) if (simd:A) nontemporal(R)
 #else
   #pragma omp for simd reduction(*:R)
 #endif
@@ -366,7 +366,8 @@ void simple(float *a, float *b, float *c, float *d) {
 // CHECK-NEXT: [[LC_IT_2:%.+]] = add nsw i64 -10, [[LC_IT_1]]
 // CHECK-NEXT: store i64 [[LC_IT_2]], i64* [[LC:%[^,]+]],
 // CHECK-NEXT: [[LC_VAL:%.+]] = load i64, i64* [[LC]]
-// CHECK: store i32 %{{.+}}, i32* [[R_PRIV]],
+// OMP45: store i32 %{{.+}}, i32* [[R_PRIV]],
+// OMP50: store i32 %{{.+}}, i32* [[R_PRIV]],{{.*}}!nontemporal
     R *= i;
 // CHECK: [[IV8_2:%.+]] = load i64, i64* [[OMP_IV8]]
 // CHECK-NEXT: [[ADD8_2:%.+]] = add nsw i64 [[IV8_2]], 1

diff  --git a/clang/test/OpenMP/simd_codegen.cpp b/clang/test/OpenMP/simd_codegen.cpp
index 19ed323af426..aac76ad127dc 100644
--- a/clang/test/OpenMP/simd_codegen.cpp
+++ b/clang/test/OpenMP/simd_codegen.cpp
@@ -22,10 +22,15 @@
 long long get_val() { return 0; }
 double *g_ptr;
 
+struct S {
+  int a, b;
+};
+
 // CHECK-LABEL: define {{.*void}} @{{.*}}simple{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
 void simple(float *a, float *b, float *c, float *d) {
+  S s, *p;
 #ifdef OMP5
-  #pragma omp simd if (simd: true)
+  #pragma omp simd if (simd: true) nontemporal(a, b, c, d, s)
 #else
   #pragma omp simd
 #endif
@@ -43,8 +48,17 @@ void simple(float *a, float *b, float *c, float *d) {
 // CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]{{.*}}!llvm.access.group
 // ... loop body ...
 // End of body: store into a[i]:
+// OMP45-NOT: load float*,{{.*}}!nontemporal
+// CHECK-NOT: load float,{{.*}}!nontemporal
+// OMP50: load float*,{{.*}}!nontemporal
+// OMP50: load float*,{{.*}}!nontemporal
+// OMP50: load float*,{{.*}}!nontemporal
+// OMP50: load i32,{{.*}}!nontemporal
+// OMP50-NOT: load i32,{{.*}}!nontemporal
+// OMP50: load float*,{{.*}}!nontemporal
+// CHECK-NOT: load float,{{.*}}!nontemporal
 // CHECK: store float [[RESULT:%.+]], float* {{%.+}}{{.*}}!llvm.access.group
-    a[i] = b[i] * c[i] * d[i];
+    a[i] = b[i] * c[i] * d[i] + s.a + p->a;
 // CHECK: [[IV1_2:%.+]] = load i32, i32* [[OMP_IV]]{{.*}}!llvm.access.group
 // CHECK-NEXT: [[ADD1_2:%.+]] = add nsw i32 [[IV1_2]], 1
 // CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]{{.*}}!llvm.access.group
@@ -718,6 +732,47 @@ void linear(float *a) {
 //
 }
 
+#ifdef OMP5
+// OMP50-LABEL: inner_simd
+void inner_simd() {
+  double a, b;
+#pragma omp simd nontemporal(a)
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd nontemporal(b)
+    for (int k = 0; k < 10; ++k) {
+      // OMP50: load double,{{.*}}!nontemporal
+      // OMP50: store double{{.*}}!nontemporal
+      a = b;
+    }
+    // OMP50-NOT: load double,{{.*}}!nontemporal
+    // OMP50: load double,
+    // OMP50: store double{{.*}}!nontemporal
+    a = b;
+  }
+}
+
+extern struct T t;
+struct Base {
+  float a;
+};
+struct T : public Base {
+  void foo() {
+#pragma omp simd nontemporal(Base::a)
+    for (int i = 0; i < 10; ++i) {
+    // OMP50: store float{{.*}}!nontemporal
+    // OMP50-NOT: nontemporal
+    // OMP50-NEXT: store float
+      Base::a = 0;
+      t.a = 0;
+    }
+  }
+} t;
+
+void bartfoo() {
+  t.foo();
+}
+
+#endif // OMP5
 // TERM_DEBUG-LABEL: bar
 int bar() {return 0;};
 

diff  --git a/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
index 5e1ed2d6275b..055d5dce28bb 100644
--- a/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
@@ -539,7 +539,7 @@ struct S1 {
     short int c[2][n];
 
 #ifdef OMP5
-    #pragma omp target parallel for simd if(n>60)
+    #pragma omp target parallel for simd if(n>60) nontemporal(a)
 #else
     #pragma omp target parallel for simd if(target: n>60)
 #endif // OMP5
@@ -837,6 +837,9 @@ int bar(int n){
 // OMP45:       define internal {{.*}}void [[OMP_OUTLINED5]](i32* noalias %.global_tid., i32* noalias %.bound_tid., [[S1]]* %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i16* {{.+}})
 // OMP50:       define internal {{.*}}void [[OMP_OUTLINED5]](i32* noalias %.global_tid., i32* noalias %.bound_tid., [[S1]]* %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i16* {{.+}}, i[[SZ]] %{{.+}})
 // To reduce complexity, we're only going as far as validating the signature of the outlined parallel function.
+// OMP45-NOT:   !nontemporal
+// OMP50:       store double{{.*}}!nontemporal
+// OMP50:       load double{{.*}}!nontemporal
 
 
 // CHECK:       define internal void [[HVT6]]

diff  --git a/clang/test/OpenMP/target_simd_codegen.cpp b/clang/test/OpenMP/target_simd_codegen.cpp
index 6755b7e657e3..597cff7815a3 100644
--- a/clang/test/OpenMP/target_simd_codegen.cpp
+++ b/clang/test/OpenMP/target_simd_codegen.cpp
@@ -85,8 +85,8 @@
 // CHECK-DAG: [[MAPT5:@.+]] = private unnamed_addr constant [3 x i64] [i64 800, i64 800, i64 547]
 // CHECK-DAG: [[SIZET6:@.+]] = private unnamed_addr constant [4 x i64] [i64 4, i64 2, i64 1, i64 40]
 // CHECK-DAG: [[MAPT6:@.+]] = private unnamed_addr constant [4 x i64] [i64 800, i64 800, i64 800, i64 547]
-// OMP45-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [6 x i64] [i64 32, i64 281474976711171, i64 800, i64 800, i64 800, i64 547]
-// OMP50-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [7 x i64] [i64 32, i64 281474976711171, i64 800, i64 800, i64 800, i64 547, i64 800]
+// OMP45-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [5 x i64] [i64 547, i64 800, i64 800, i64 800, i64 547]
+// OMP50-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [6 x i64] [i64 547, i64 800, i64 800, i64 800, i64 547, i64 800]
 // CHECK-DAG: @{{.*}} = weak constant i8 0
 // CHECK-DAG: @{{.*}} = weak constant i8 0
 // CHECK-DAG: @{{.*}} = weak constant i8 0
@@ -461,7 +461,11 @@ struct S1 {
     int b = n+1;
     short int c[2][n];
 
-    #pragma omp target simd if(n>60)
+#ifdef OMP5
+    #pragma omp target simd if(n>60) nontemporal(a) private(a)
+#else
+    #pragma omp target simd if(n>60) private(a)
+#endif // OMP5
     for (unsigned long long it = 2000; it >= 600; it -= 400) {
       this->a = (double)b + 1.5;
       c[1][1] = ++a;
@@ -519,96 +523,84 @@ int bar(int n){
 // CHECK-32:    [[CSZSIZE:%.+]] = mul nuw i32 [[CELEMSIZE2]], 2
 // CHECK-32:    [[CSIZE:%.+]] = sext i32 [[CSZSIZE]] to i64
 
-// OMP45-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 6, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SR:%[^,]+]], i64* getelementptr inbounds ([6 x i64], [6 x i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 1)
-// OMP50-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 7, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SR:%[^,]+]], i64* getelementptr inbounds ([7 x  i64], [7 x  i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 1)
-// OMP45-DAG:   [[BPR]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP:%.+]], i32 0, i32 0
-// OMP45-DAG:   [[PR]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P:%.+]], i32 0, i32 0
-// OMP45-DAG:   [[SR]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S:%.+]], i32 0, i32 0
-// OMP45-DAG:   [[SADDR0:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX0:[0-9]+]]
-// OMP45-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX0]]
-// OMP45-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX0]]
-// OMP45-DAG:   [[SADDR1:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX1:[0-9]+]]
-// OMP45-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX1]]
-// OMP45-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX1]]
-// OMP45-DAG:   [[SADDR2:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX2:[0-9]+]]
-// OMP45-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX2]]
-// OMP45-DAG:   [[PADDR2:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX2]]
-// OMP45-DAG:   [[SADDR3:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX3:[0-9]+]]
-// OMP45-DAG:   [[BPADDR3:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX3]]
-// OMP45-DAG:   [[PADDR3:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX3]]
-// OMP45-DAG:   [[SADDR4:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX4:[0-9]+]]
-// OMP45-DAG:   [[BPADDR4:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX4]]
-// OMP45-DAG:   [[PADDR4:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX4]]
-// OMP45-DAG:   [[SADDR5:%.+]] = getelementptr inbounds [6 x i64], [6 x i64]* [[S]], i32 [[IDX5:[0-9]+]]
-// OMP45-DAG:   [[BPADDR5:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[BP]], i32 [[IDX5]]
-// OMP45-DAG:   [[PADDR5:%.+]] = getelementptr inbounds [6 x i8*], [6 x i8*]* [[P]], i32 [[IDX5]]
-// OMP50-DAG:   [[BPR]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[BP:%.+]], i32 0, i32 0
-// OMP50-DAG:   [[PR]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[P:%.+]], i32 0, i32 0
-// OMP50-DAG:   [[SR]] = getelementptr inbounds [7 x  i64], [7 x  i64]* [[S:%.+]], i32 0, i32 0
-// OMP50-DAG:   [[SADDR0:%.+]] = getelementptr inbounds [7 x  i64], [7 x  i64]* [[S]], i32 [[IDX0:[0-9]+]]
-// OMP50-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[BP]], i32 [[IDX0]]
-// OMP50-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[P]], i32 [[IDX0]]
-// OMP50-DAG:   [[SADDR1:%.+]] = getelementptr inbounds [7 x  i64], [7 x  i64]* [[S]], i32 [[IDX1:[0-9]+]]
-// OMP50-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[BP]], i32 [[IDX1]]
-// OMP50-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[P]], i32 [[IDX1]]
-// OMP50-DAG:   [[SADDR2:%.+]] = getelementptr inbounds [7 x  i64], [7 x  i64]* [[S]], i32 [[IDX2:[0-9]+]]
-// OMP50-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[BP]], i32 [[IDX2]]
-// OMP50-DAG:   [[PADDR2:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[P]], i32 [[IDX2]]
-// OMP50-DAG:   [[SADDR3:%.+]] = getelementptr inbounds [7 x  i64], [7 x  i64]* [[S]], i32 [[IDX3:[0-9]+]]
-// OMP50-DAG:   [[BPADDR3:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[BP]], i32 [[IDX3]]
-// OMP50-DAG:   [[PADDR3:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[P]], i32 [[IDX3]]
-// OMP50-DAG:   [[SADDR4:%.+]] = getelementptr inbounds [7 x  i64], [7 x  i64]* [[S]], i32 [[IDX4:[0-9]+]]
-// OMP50-DAG:   [[BPADDR4:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[BP]], i32 [[IDX4]]
-// OMP50-DAG:   [[PADDR4:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[P]], i32 [[IDX4]]
-// OMP50-DAG:   [[SADDR5:%.+]] = getelementptr inbounds [7 x  i64], [7 x  i64]* [[S]], i32 [[IDX5:[0-9]+]]
-// OMP50-DAG:   [[BPADDR5:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[BP]], i32 [[IDX5]]
-// OMP50-DAG:   [[PADDR5:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[P]], i32 [[IDX5]]
-// OMP50-DAG:   [[SADDR6:%.+]] = getelementptr inbounds [7 x  i64], [7 x  i64]* [[S]], i32 [[IDX6:[0-9]+]]
-// OMP50-DAG:   [[BPADDR6:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[BP]], i32 [[IDX6]]
-// OMP50-DAG:   [[PADDR6:%.+]] = getelementptr inbounds [7 x  i8*], [7 x  i8*]* [[P]], i32 [[IDX6]]
+// OMP45-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 5, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SR:%[^,]+]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 1)
+// OMP50-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 6, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i64* [[SR:%[^,]+]], i64* getelementptr inbounds ([6 x  i64], [6 x  i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 1)
+// OMP45-DAG:   [[BPR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP:%.+]], i32 0, i32 0
+// OMP45-DAG:   [[PR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P:%.+]], i32 0, i32 0
+// OMP45-DAG:   [[SR]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S:%.+]], i32 0, i32 0
+// OMP45-DAG:   [[SADDR0:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX0:[0-9]+]]
+// OMP45-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX0]]
+// OMP45-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX0]]
+// OMP45-DAG:   [[SADDR1:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX1:[0-9]+]]
+// OMP45-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX1]]
+// OMP45-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX1]]
+// OMP45-DAG:   [[SADDR2:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX2:[0-9]+]]
+// OMP45-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX2]]
+// OMP45-DAG:   [[PADDR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX2]]
+// OMP45-DAG:   [[SADDR3:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX3:[0-9]+]]
+// OMP45-DAG:   [[BPADDR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX3]]
+// OMP45-DAG:   [[PADDR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX3]]
+// OMP45-DAG:   [[SADDR4:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[S]], i32 [[IDX4:[0-9]+]]
+// OMP45-DAG:   [[BPADDR4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX4]]
+// OMP45-DAG:   [[PADDR4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX4]]
+// OMP50-DAG:   [[BPR]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[BP:%.+]], i32 0, i32 0
+// OMP50-DAG:   [[PR]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[P:%.+]], i32 0, i32 0
+// OMP50-DAG:   [[SR]] = getelementptr inbounds [6 x  i64], [6 x  i64]* [[S:%.+]], i32 0, i32 0
+// OMP50-DAG:   [[SADDR0:%.+]] = getelementptr inbounds [6 x  i64], [6 x  i64]* [[S]], i32 [[IDX0:[0-9]+]]
+// OMP50-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[BP]], i32 [[IDX0]]
+// OMP50-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[P]], i32 [[IDX0]]
+// OMP50-DAG:   [[SADDR1:%.+]] = getelementptr inbounds [6 x  i64], [6 x  i64]* [[S]], i32 [[IDX1:[0-9]+]]
+// OMP50-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[BP]], i32 [[IDX1]]
+// OMP50-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[P]], i32 [[IDX1]]
+// OMP50-DAG:   [[SADDR2:%.+]] = getelementptr inbounds [6 x  i64], [6 x  i64]* [[S]], i32 [[IDX2:[0-9]+]]
+// OMP50-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[BP]], i32 [[IDX2]]
+// OMP50-DAG:   [[PADDR2:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[P]], i32 [[IDX2]]
+// OMP50-DAG:   [[SADDR3:%.+]] = getelementptr inbounds [6 x  i64], [6 x  i64]* [[S]], i32 [[IDX3:[0-9]+]]
+// OMP50-DAG:   [[BPADDR3:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[BP]], i32 [[IDX3]]
+// OMP50-DAG:   [[PADDR3:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[P]], i32 [[IDX3]]
+// OMP50-DAG:   [[SADDR4:%.+]] = getelementptr inbounds [6 x  i64], [6 x  i64]* [[S]], i32 [[IDX4:[0-9]+]]
+// OMP50-DAG:   [[BPADDR4:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[BP]], i32 [[IDX4]]
+// OMP50-DAG:   [[PADDR4:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[P]], i32 [[IDX4]]
+// OMP50-DAG:   [[SADDR5:%.+]] = getelementptr inbounds [6 x  i64], [6 x  i64]* [[S]], i32 [[IDX5:[0-9]+]]
+// OMP50-DAG:   [[BPADDR5:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[BP]], i32 [[IDX5]]
+// OMP50-DAG:   [[PADDR5:%.+]] = getelementptr inbounds [6 x  i8*], [6 x  i8*]* [[P]], i32 [[IDX5]]
 
 // The names below are not necessarily consistent with the names used for the
 // addresses above as some are repeated.
 // CHECK-DAG:   store [[S1]]* %{{.+}}, [[S1]]** [[CBPADDR0:%.+]],
-// CHECK-DAG:   store double* %{{.+}}, double** [[CPADDR0:%.+]],
+// CHECK-DAG:   store [[S1]]* %{{.+}}, [[S1]]** [[CPADDR0:%.+]],
 // CHECK-DAG:   [[CBPADDR0]] = bitcast i8** {{%[^,]+}} to [[S1]]**
-// CHECK-DAG:   [[CPADDR0]] = bitcast i8** {{%[^,]+}} to double**
-// CHECK-DAG:   store i64 %{{.+}}, i64* {{%[^,]+}}
-
-// CHECK-DAG:   store [[S1]]* %{{.+}}, [[S1]]** [[CBPADDR1:%.+]],
-// CHECK-DAG:   store double* %{{.+}}, double** [[CPADDR1:%.+]],
-// CHECK-DAG:   [[CBPADDR1]] = bitcast i8** {{%[^,]+}} to [[S1]]**
-// CHECK-DAG:   [[CPADDR1]] = bitcast i8** {{%[^,]+}} to double**
+// CHECK-DAG:   [[CPADDR0]] = bitcast i8** {{%[^,]+}} to [[S1]]**
 // CHECK-DAG:   store i64 {{4|8}}, i64* {{%[^,]+}}
 
-// CHECK-DAG:   store i[[SZ]] [[B_CVAL]], i[[SZ]]* [[CBPADDR2:%.+]],
-// CHECK-DAG:   store i[[SZ]] [[B_CVAL]], i[[SZ]]* [[CPADDR2:%.+]],
+// CHECK-DAG:   store i[[SZ]] [[B_CVAL]], i[[SZ]]* [[CBPADDR1:%.+]],
+// CHECK-DAG:   store i[[SZ]] [[B_CVAL]], i[[SZ]]* [[CPADDR1:%.+]],
+// CHECK-DAG:   [[CBPADDR1]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
+// CHECK-DAG:   [[CPADDR1]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
+// CHECK-DAG:   store i64 4, i64* {{%[^,]+}}
+
+// CHECK-DAG:   store i[[SZ]] 2, i[[SZ]]* [[CBPADDR2:%.+]],
+// CHECK-DAG:   store i[[SZ]] 2, i[[SZ]]* [[CPADDR2:%.+]],
 // CHECK-DAG:   [[CBPADDR2]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
 // CHECK-DAG:   [[CPADDR2]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
-// CHECK-DAG:   store i64 4, i64* {{%[^,]+}}
+// CHECK-DAG:   store i64 {{4|8}}, i64* {{%[^,]+}}
 
-// CHECK-DAG:   store i[[SZ]] 2, i[[SZ]]* [[CBPADDR3:%.+]],
-// CHECK-DAG:   store i[[SZ]] 2, i[[SZ]]* [[CPADDR3:%.+]],
+// CHECK-DAG:   store i[[SZ]] %{{.+}}, i[[SZ]]* [[CBPADDR3:%.+]],
+// CHECK-DAG:   store i[[SZ]] %{{.+}}, i[[SZ]]* [[CPADDR3:%.+]],
 // CHECK-DAG:   [[CBPADDR3]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
 // CHECK-DAG:   [[CPADDR3]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
 // CHECK-DAG:   store i64 {{4|8}}, i64* {{%[^,]+}}
 
-// CHECK-DAG:   store i[[SZ]] %{{.+}}, i[[SZ]]* [[CBPADDR4:%.+]],
-// CHECK-DAG:   store i[[SZ]] %{{.+}}, i[[SZ]]* [[CPADDR4:%.+]],
-// CHECK-DAG:   [[CBPADDR4]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
-// CHECK-DAG:   [[CPADDR4]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
-// CHECK-DAG:   store i64 {{4|8}}, i64* {{%[^,]+}}
-
-// CHECK-DAG:   store i16* %{{.+}}, i16** [[CBPADDR5:%.+]],
-// CHECK-DAG:   store i16* %{{.+}}, i16** [[CPADDR5:%.+]],
-// CHECK-DAG:   [[CBPADDR5]] = bitcast i8** {{%[^,]+}} to i16**
-// CHECK-DAG:   [[CPADDR5]] = bitcast i8** {{%[^,]+}} to i16**
+// CHECK-DAG:   store i16* %{{.+}}, i16** [[CBPADDR4:%.+]],
+// CHECK-DAG:   store i16* %{{.+}}, i16** [[CPADDR4:%.+]],
+// CHECK-DAG:   [[CBPADDR4]] = bitcast i8** {{%[^,]+}} to i16**
+// CHECK-DAG:   [[CPADDR4]] = bitcast i8** {{%[^,]+}} to i16**
 // CHECK-DAG:   store i64 [[CSIZE]], i64* {{%[^,]+}}
 
-// OMP50-DAG:   store i[[SZ]] [[SIMD_COND]], i[[SZ]]* [[CBPADDR6:%.+]],
-// OMP50-DAG:   store i[[SZ]] [[SIMD_COND]], i[[SZ]]* [[CPADDR6:%.+]],
-// OMP50-DAG:   [[CBPADDR6]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
-// OMP50-DAG:   [[CPADDR6]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
+// OMP50-DAG:   store i[[SZ]] [[SIMD_COND]], i[[SZ]]* [[CBPADDR5:%.+]],
+// OMP50-DAG:   store i[[SZ]] [[SIMD_COND]], i[[SZ]]* [[CPADDR5:%.+]],
+// OMP50-DAG:   [[CBPADDR5]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
+// OMP50-DAG:   [[CPADDR5]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
 // OMP50-DAG:   store i64 1, i64* {{%[^,]+}}
 
 // CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
@@ -738,6 +730,10 @@ int bar(int n){
 // OMP50-DAG:   [[CONV_COND:%.+]] = bitcast i[[SZ]]* [[LOCAL_SIMD_COND_CASTED]] to i8*
 // OMP50-DAG:   [[SIMD_COND:%.+]] = load i8, i8* [[CONV_COND]],
 // OMP50-DAG:   trunc i8 [[SIMD_COND]] to i1
+// OMP45-NOT:   !nontemporal
+// OMP50:       store double {{.*}}!nontemporal
+// OMP50:       load double, {{.*}}!nontemporal
+// OMP50:       store double {{.*}}!nontemporal
 
 // CHECK:       define internal void [[HVT6]]
 // Create local storage for each capture.

diff  --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
index 8a8d64e36f8d..dda468e604eb 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
@@ -35,20 +35,25 @@ void gtid_test() {
 // CHECK: call i{{[0-9]+}} @__tgt_target_teams(
 // CHECK: call void [[OFFLOADING_FUN_1:@.+]](
 #ifdef OMP5
-#pragma omp target teams distribute parallel for simd if(simd: true)
+#pragma omp target teams distribute parallel for simd if(simd: true) nontemporal(Arg)
 #else
 #pragma omp target teams distribute parallel for simd
 #endif // OMP5
-  for(int i = 0 ; i < 100; i++) {}
+  for (int i = 0; i < 100; i++) {
+    Arg = 0;
+  }
   // CHECK: define internal void [[OFFLOADING_FUN_0]](
-  // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_0:@.+]] to {{.+}})
+  // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}}* [[OMP_TEAMS_OUTLINED_0:@.+]] to {{.+}})
   // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_0]](
   // CHECK: call void @__kmpc_for_static_init_4(
-  // CHECK:  call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, {{.+}}* [[OMP_OUTLINED_0:@.+]] to void
+  // OMP50: load i32,{{.*}}!nontemporal
+  // CHECK: call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 3, {{.+}}* [[OMP_OUTLINED_0:@.+]] to void
   // CHECK: call void @__kmpc_for_static_fini(
 
   // CHECK: define{{.+}} void [[OMP_OUTLINED_0]](
   // CHECK: call void @__kmpc_for_static_init_4(
+  // OMP45-NOT: !nontemporal
+  // OMP50: store i32 0,{{.*}}!nontemporal
   // CHECK: call void @__kmpc_for_static_fini(
   // CHECK: ret
 #pragma omp target teams distribute parallel for simd if (parallel: false)

diff  --git a/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
index 14c3cd62ebfd..4912352e17ca 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
@@ -165,7 +165,7 @@ int foo(int n) {
 
   // CHECK:       call void [[HVT1:@.+]](i[[SZ]] {{[^,]+}})
 #ifdef OMP5
-  #pragma omp target teams distribute simd if(target: 0) safelen(32) linear(a) if(simd: 1)
+  #pragma omp target teams distribute simd if(target: 0) safelen(32) linear(a) if(simd: 1) nontemporal(a)
 #else
   #pragma omp target teams distribute simd if(target: 0) safelen(32) linear(a)
 #endif // OMP5
@@ -395,6 +395,8 @@ int foo(int n) {
 // CHECK:       [[AA_ADDR:%.+]] = alloca i[[SZ]], align
 // CHECK:       store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align
 // CHECK-64:    [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32*
+// OMP45-NOT:   !nontemporal
+// OMP50:       load i32,{{.*}}!nontemporal
 // CHECK-64:    store i32 10, i32* [[AA_CADDR]], align
 // CHECK-32:    store i32 10, i32* [[AA_ADDR]], align
 // CHECK:       ret void

diff  --git a/clang/test/OpenMP/teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_codegen.cpp
index b9ee8cc728fc..63dd237a964d 100644
--- a/clang/test/OpenMP/teams_distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_codegen.cpp
@@ -177,16 +177,16 @@ struct SS{
   // CK3: define {{.*}}i32 @{{.+}}foo{{.+}}(
   int foo(void) {
 
-  // CK3: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 2, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* %{{.+}}, i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 1)
+  // CK3: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 3, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* %{{.+}}, i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 1)
   // CK3: call void @[[OFFL1:.+]]([[SSI]]* %{{.+}})
     #pragma omp target
 #ifdef OMP5
-    #pragma omp teams distribute simd if(b)
+    #pragma omp teams distribute simd if(b) nontemporal(a, b)
 #else
     #pragma omp teams distribute simd
 #endif // OMP5
     for(int i = 0; i < X; i++) {
-      a[i] = (T)0;
+      a[i] = (T)b;
     }
 
       // outlined target region
@@ -197,6 +197,8 @@ struct SS{
 
   // CK3: define internal void @[[OUTL1]]({{.+}})
   // CK3: call void @__kmpc_for_static_init_4(
+  // OMP3_45-NOT: !nontemporal
+  // OMP3_50: load float,{{.*}}!nontemporal
   // CK3: call void @__kmpc_for_static_fini(
   // CK3: ret void
 

diff  --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 229abece37a4..94084e169faf 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2458,6 +2458,8 @@ void OMPClauseEnqueue::VisitOMPIsDevicePtrClause(const OMPIsDevicePtrClause *C)
 void OMPClauseEnqueue::VisitOMPNontemporalClause(
     const OMPNontemporalClause *C) {
   VisitOMPClauseList(C);
+  for (const auto *E : C->private_refs())
+    Visitor->AddStmt(E);
 }
 }
 


        


More information about the cfe-commits mailing list