[clang] a42380c - [OMPIRBuilder] Add ordered directive to OMPBuilder

via cfe-commits cfe-commits at lists.llvm.org
Thu Sep 2 18:40:06 PDT 2021


Author: PeixinQiao
Date: 2021-09-03T09:37:58+08:00
New Revision: a42380ce837994725dd9b22c35412e0f4bd52431

URL: https://github.com/llvm/llvm-project/commit/a42380ce837994725dd9b22c35412e0f4bd52431
DIFF: https://github.com/llvm/llvm-project/commit/a42380ce837994725dd9b22c35412e0f4bd52431.diff

LOG: [OMPIRBuilder] Add ordered directive to OMPBuilder

Add support for ordered directive in the OpenMPIRBuilder.

This patch also modidies clang to use the ordered directive when the
option -fopenmp-enable-irbuilder is enabled.

Also fix one ICE when parsing one canonical for loop with the relational
operator LE or GE in openmp region by replacing unary increment
operation of the expression of the variable "Expr A" minus the variable
"Expr B" (++(Expr A - Expr B)) with binary addition operation of the
experssion of the variable "Expr A" minus the variable "Expr B" and the
expression with constant value "1" (Expr A - Expr B + "1").

Reviewed By: Meinersbur, kiranchandramohan

Differential Revision: https://reviews.llvm.org/D107430

Added: 
    

Modified: 
    clang/lib/CodeGen/CGStmtOpenMP.cpp
    clang/lib/CodeGen/CodeGenFunction.h
    clang/lib/Sema/SemaOpenMP.cpp
    clang/test/OpenMP/ordered_codegen.cpp
    clang/test/OpenMP/ordered_doacross_codegen.c
    clang/test/OpenMP/ordered_doacross_codegen.cpp
    llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
    llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
    llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index b96515093a7f3..5ac5bb3facf23 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -5312,6 +5312,74 @@ static llvm::Function *emitOutlinedOrderedFunction(CodeGenModule &CGM,
 }
 
 void CodeGenFunction::EmitOMPOrderedDirective(const OMPOrderedDirective &S) {
+  if (CGM.getLangOpts().OpenMPIRBuilder) {
+    llvm::OpenMPIRBuilder &OMPBuilder = CGM.getOpenMPRuntime().getOMPBuilder();
+    using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+
+    if (S.hasClausesOfKind<OMPDependClause>()) {
+      // The ordered directive with depend clause.
+      assert(!S.hasAssociatedStmt() &&
+             "No associated statement must be in ordered depend construct.");
+      InsertPointTy AllocaIP(AllocaInsertPt->getParent(),
+                             AllocaInsertPt->getIterator());
+      for (const auto *DC : S.getClausesOfKind<OMPDependClause>()) {
+        unsigned NumLoops = DC->getNumLoops();
+        QualType Int64Ty = CGM.getContext().getIntTypeForBitwidth(
+            /*DestWidth=*/64, /*Signed=*/1);
+        llvm::SmallVector<llvm::Value *> StoreValues;
+        for (unsigned I = 0; I < NumLoops; I++) {
+          const Expr *CounterVal = DC->getLoopData(I);
+          assert(CounterVal);
+          llvm::Value *StoreValue = EmitScalarConversion(
+              EmitScalarExpr(CounterVal), CounterVal->getType(), Int64Ty,
+              CounterVal->getExprLoc());
+          StoreValues.emplace_back(StoreValue);
+        }
+        bool IsDependSource = false;
+        if (DC->getDependencyKind() == OMPC_DEPEND_source)
+          IsDependSource = true;
+        Builder.restoreIP(OMPBuilder.createOrderedDepend(
+            Builder, AllocaIP, NumLoops, StoreValues, ".cnt.addr",
+            IsDependSource));
+      }
+    } else {
+      // The ordered directive with threads or simd clause, or without clause.
+      // Without clause, it behaves as if the threads clause is specified.
+      const auto *C = S.getSingleClause<OMPSIMDClause>();
+
+      auto FiniCB = [this](InsertPointTy IP) {
+        OMPBuilderCBHelpers::FinalizeOMPRegion(*this, IP);
+      };
+
+      auto BodyGenCB = [&S, C, this](InsertPointTy AllocaIP,
+                                     InsertPointTy CodeGenIP,
+                                     llvm::BasicBlock &FiniBB) {
+        const CapturedStmt *CS = S.getInnermostCapturedStmt();
+        if (C) {
+          llvm::SmallVector<llvm::Value *, 16> CapturedVars;
+          GenerateOpenMPCapturedVars(*CS, CapturedVars);
+          llvm::Function *OutlinedFn =
+              emitOutlinedOrderedFunction(CGM, CS, S.getBeginLoc());
+          assert(S.getBeginLoc().isValid() &&
+                 "Outlined function call location must be valid.");
+          ApplyDebugLocation::CreateDefaultArtificial(*this, S.getBeginLoc());
+          OMPBuilderCBHelpers::EmitCaptureStmt(*this, CodeGenIP, FiniBB,
+                                               OutlinedFn, CapturedVars);
+        } else {
+          OMPBuilderCBHelpers::InlinedRegionBodyRAII IRB(*this, AllocaIP,
+                                                         FiniBB);
+          OMPBuilderCBHelpers::EmitOMPRegionBody(*this, CS->getCapturedStmt(),
+                                                 CodeGenIP, FiniBB);
+        }
+      };
+
+      OMPLexicalScope Scope(*this, S, OMPD_unknown);
+      Builder.restoreIP(
+          OMPBuilder.createOrderedThreadsSimd(Builder, BodyGenCB, FiniCB, !C));
+    }
+    return;
+  }
+
   if (S.hasClausesOfKind<OMPDependClause>()) {
     assert(!S.hasAssociatedStmt() &&
            "No associated statement must be in ordered depend construct.");

diff  --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 1f05877ed6f9d..a27f4da51bd63 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -1775,6 +1775,24 @@ class CodeGenFunction : public CodeGenTypeCache {
         CGF.Builder.CreateBr(&FiniBB);
     }
 
+    static void EmitCaptureStmt(CodeGenFunction &CGF, InsertPointTy CodeGenIP,
+                                llvm::BasicBlock &FiniBB, llvm::Function *Fn,
+                                ArrayRef<llvm::Value *> Args) {
+      llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock();
+      if (llvm::Instruction *CodeGenIPBBTI = CodeGenIPBB->getTerminator())
+        CodeGenIPBBTI->eraseFromParent();
+
+      CGF.Builder.SetInsertPoint(CodeGenIPBB);
+
+      if (Fn->doesNotThrow())
+        CGF.EmitNounwindRuntimeCall(Fn, Args);
+      else
+        CGF.EmitRuntimeCall(Fn, Args);
+
+      if (CGF.Builder.saveIP().isSet())
+        CGF.Builder.CreateBr(&FiniBB);
+    }
+
     /// RAII for preserving necessary info during Outlined region body codegen.
     class OutlinedRegionBodyRAII {
 

diff  --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index b6e43d32dc9bb..6aa5a5d978b4d 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -5320,8 +5320,9 @@ static CapturedStmt *buildDistanceFunc(Sema &Actions, QualType LogicalTy,
 
       if (Rel == BO_LE || Rel == BO_GE) {
         // Add one to the range if the relational operator is inclusive.
-        Range =
-            AssertSuccess(Actions.BuildUnaryOp(nullptr, {}, UO_PreInc, Range));
+        Range = AssertSuccess(Actions.BuildBinOp(
+            nullptr, {}, BO_Add, Range,
+            Actions.ActOnIntegerConstant(SourceLocation(), 1).get()));
       }
 
       // Divide by the absolute step amount.

diff  --git a/clang/test/OpenMP/ordered_codegen.cpp b/clang/test/OpenMP/ordered_codegen.cpp
index 85c668ccfc591..bb9e1f4580574 100644
--- a/clang/test/OpenMP/ordered_codegen.cpp
+++ b/clang/test/OpenMP/ordered_codegen.cpp
@@ -1,11 +1,19 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefixes=CHECK1,CHECK1-NORMAL
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK2,CHECK2-NORMAL
 
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -fopenmp-version=45 -o - | FileCheck %s --check-prefix=CHECK3
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefixes=CHECK1,CHECK1-IRBUILDER
+// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK2,CHECK2-IRBUILDER
+
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -fopenmp-version=45 -o - | FileCheck %s --check-prefixes=CHECK3,CHECK3-NORMAL
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -fopenmp-version=45 -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK4
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK4,CHECK4-NORMAL
+
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -fopenmp-version=45 -o - | FileCheck %s --check-prefixes=CHECK3,CHECK3-IRBUILDER
+// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -fopenmp-version=45 -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK4,CHECK4-IRBUILDER
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK5
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
@@ -128,7 +136,7 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK1-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
 // CHECK1-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK1-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK1-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -137,9 +145,12 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    store i32 4571423, i32* [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK1-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+// CHECK1-NORMAL-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK1:       omp.dispatch.cond:
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
 // CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK1-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -157,6 +168,7 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP5]], 7
 // CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]]
 // CHECK1-NEXT:    store i32 [[SUB]], i32* [[I]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK1-NEXT:    [[TMP6:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[I]], align 4
@@ -188,6 +200,7 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], 1
 // CHECK1-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK1:       omp.inner.for.end:
@@ -195,7 +208,9 @@ void foo_simd(int low, int up) {
 // CHECK1:       omp.dispatch.inc:
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK1:       omp.dispatch.end:
-// CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP0]])
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP0]])
+// CHECK1-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP0]])
 // CHECK1-NEXT:    ret void
 //
 //
@@ -213,7 +228,7 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
 // CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[I:%.*]] = alloca i64, align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK1-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK1-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -222,9 +237,11 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    store i64 16908287, i64* [[DOTOMP_UB]], align 8
 // CHECK1-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
 // CHECK1-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1073741891, i64 0, i64 16908287, i64 1, i64 1)
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK1:       omp.dispatch.cond:
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
 // CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
 // CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK1-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -243,6 +260,7 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    [[MUL:%.*]] = mul i64 [[TMP5]], 127
 // CHECK1-NEXT:    [[ADD1:%.*]] = add i64 131071, [[MUL]]
 // CHECK1-NEXT:    store i64 [[ADD1]], i64* [[I]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK1-NEXT:    [[TMP6:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP7:%.*]] = load i64, i64* [[I]], align 8
@@ -270,6 +288,7 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
 // CHECK1-NEXT:    [[ADD7:%.*]] = add i64 [[TMP17]], 1
 // CHECK1-NEXT:    store i64 [[ADD7]], i64* [[DOTOMP_IV]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_fini_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK1:       omp.inner.for.end:
@@ -277,7 +296,9 @@ void foo_simd(int low, int up) {
 // CHECK1:       omp.dispatch.inc:
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK1:       omp.dispatch.end:
-// CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
+// CHECK1-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK1-NEXT:    ret void
 //
 //
@@ -303,7 +324,7 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[I8:%.*]] = alloca i8, align 1
 // CHECK1-NEXT:    [[X9:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK1-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK1-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -336,9 +357,11 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
 // CHECK1-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8:[0-9]+]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1073741894, i64 0, i64 [[TMP6]], i64 1, i64 1)
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK1:       omp.dispatch.cond:
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
 // CHECK1-NEXT:    [[TMP7:%.*]] = call i32 @__kmpc_dispatch_next_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
 // CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP7]], 0
 // CHECK1-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -369,6 +392,7 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    [[SUB20:%.*]] = sub nsw i64 11, [[MUL19]]
 // CHECK1-NEXT:    [[CONV21:%.*]] = trunc i64 [[SUB20]] to i32
 // CHECK1-NEXT:    store i32 [[CONV21]], i32* [[X9]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK1-NEXT:    [[TMP15:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP16:%.*]] = load i8, i8* [[I8]], align 1
@@ -400,6 +424,7 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    [[TMP26:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
 // CHECK1-NEXT:    [[ADD30:%.*]] = add nsw i64 [[TMP26]], 1
 // CHECK1-NEXT:    store i64 [[ADD30]], i64* [[DOTOMP_IV]], align 8
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_fini_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK1:       omp.inner.for.end:
@@ -409,7 +434,9 @@ void foo_simd(int low, int up) {
 // CHECK1:       omp.dispatch.end:
 // CHECK1-NEXT:    br label [[OMP_PRECOND_END]]
 // CHECK1:       omp.precond.end:
-// CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
+// CHECK1-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK1-NEXT:    ret void
 //
 //
@@ -430,7 +457,7 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[I:%.*]] = alloca i8, align 1
 // CHECK1-NEXT:    [[X2:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK1-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK1-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -440,9 +467,11 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    store i32 199, i32* [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK1-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10:[0-9]+]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1073741893, i32 0, i32 199, i32 1, i32 1)
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK1:       omp.dispatch.cond:
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
 // CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK1-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -470,6 +499,7 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[SUB]], 1
 // CHECK1-NEXT:    [[ADD6:%.*]] = add nsw i32 -10, [[MUL5]]
 // CHECK1-NEXT:    store i32 [[ADD6]], i32* [[X2]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK1-NEXT:    [[TMP8:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP9:%.*]] = load i8, i8* [[I]], align 1
@@ -501,6 +531,7 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP19]], 1
 // CHECK1-NEXT:    store i32 [[ADD15]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK1:       omp.inner.for.end:
@@ -508,7 +539,9 @@ void foo_simd(int low, int up) {
 // CHECK1:       omp.dispatch.inc:
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK1:       omp.dispatch.end:
-// CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
+// CHECK1-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK1-NEXT:    ret void
 //
 //
@@ -535,7 +568,7 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[I28:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT:    store i32 [[LOW]], i32* [[LOW_ADDR]], align 4
 // CHECK1-NEXT:    store i32 [[UP]], i32* [[UP_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[LOW_ADDR]], align 4
@@ -622,9 +655,11 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK1-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK1-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_20]], align 4
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12:[0-9]+]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 [[TMP25]], i32 1, i32 1)
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK1:       omp.dispatch.cond:
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
 // CHECK1-NEXT:    [[TMP26:%.*]] = call i32 @__kmpc_dispatch_next_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK1-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP26]], 0
 // CHECK1-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -632,13 +667,15 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
 // CHECK1-NEXT:    store i32 [[TMP27]], i32* [[DOTOMP_IV16]], align 4
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND29:%.*]]
-// CHECK1:       omp.inner.for.cond29:
+// CHECK1-IRBUILDER:       omp.inner.for.cond30:
+// CHECK1-NORMAL:       omp.inner.for.cond29:
 // CHECK1-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK1-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group !7
 // CHECK1-NEXT:    [[ADD30:%.*]] = add i32 [[TMP29]], 1
 // CHECK1-NEXT:    [[CMP31:%.*]] = icmp ult i32 [[TMP28]], [[ADD30]]
 // CHECK1-NEXT:    br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END40:%.*]]
-// CHECK1:       omp.inner.for.body32:
+// CHECK1-IRBUILDER:       omp.inner.for.body33:
+// CHECK1-NORMAL:       omp.inner.for.body32:
 // CHECK1-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group !7
 // CHECK1-NEXT:    [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK1-NEXT:    [[MUL33:%.*]] = mul i32 [[TMP31]], 1
@@ -650,15 +687,19 @@ void foo_simd(int low, int up) {
 // CHECK1-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX36]], align 4, !llvm.access.group !7
 // CHECK1-NEXT:    call void @__captured_stmt.1(i32* [[I28]]), !llvm.access.group !7
 // CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE37:%.*]]
-// CHECK1:       omp.body.continue37:
+// CHECK1-IRBUILDER:       omp.body.continue38:
+// CHECK1-NORMAL:       omp.body.continue37:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC38:%.*]]
-// CHECK1:       omp.inner.for.inc38:
+// CHECK1-IRBUILDER:       omp.inner.for.inc39:
+// CHECK1-NORMAL:       omp.inner.for.inc38:
 // CHECK1-NEXT:    [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK1-NEXT:    [[ADD39:%.*]] = add i32 [[TMP33]], 1
 // CHECK1-NEXT:    store i32 [[ADD39]], i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
 // CHECK1-NEXT:    call void @__kmpc_dispatch_fini_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]), !llvm.access.group !7
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND29]], !llvm.loop [[LOOP8:![0-9]+]]
-// CHECK1:       omp.inner.for.end40:
+// CHECK1-IRBUILDER:       omp.inner.for.end42:
+// CHECK1-NORMAL:       omp.inner.for.end40:
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
 // CHECK1:       omp.dispatch.inc:
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]]
@@ -681,7 +722,9 @@ void foo_simd(int low, int up) {
 // CHECK1:       .omp.final.done:
 // CHECK1-NEXT:    br label [[OMP_PRECOND_END]]
 // CHECK1:       omp.precond.end:
-// CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK1-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
+// CHECK1-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK1-NEXT:    ret void
 //
 //
@@ -725,7 +768,7 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[I:%.*]] = alloca i32, align 4
-// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK2-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
 // CHECK2-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK2-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK2-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -734,9 +777,12 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    store i32 4571423, i32* [[DOTOMP_UB]], align 4
 // CHECK2-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK2-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK2-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+// CHECK2-NORMAL-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK2:       omp.dispatch.cond:
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
 // CHECK2-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK2-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK2-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -754,6 +800,7 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP5]], 7
 // CHECK2-NEXT:    [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]]
 // CHECK2-NEXT:    store i32 [[SUB]], i32* [[I]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK2-NEXT:    [[TMP6:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[I]], align 4
@@ -785,6 +832,7 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
 // CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], 1
 // CHECK2-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK2:       omp.inner.for.end:
@@ -792,7 +840,9 @@ void foo_simd(int low, int up) {
 // CHECK2:       omp.dispatch.inc:
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK2:       omp.dispatch.end:
-// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP0]])
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP0]])
+// CHECK2-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP0]])
 // CHECK2-NEXT:    ret void
 //
 //
@@ -810,7 +860,7 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[I:%.*]] = alloca i64, align 8
-// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK2-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK2-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -819,9 +869,11 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    store i64 16908287, i64* [[DOTOMP_UB]], align 8
 // CHECK2-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
 // CHECK2-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1073741891, i64 0, i64 16908287, i64 1, i64 1)
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK2:       omp.dispatch.cond:
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
 // CHECK2-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
 // CHECK2-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK2-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -840,6 +892,7 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    [[MUL:%.*]] = mul i64 [[TMP5]], 127
 // CHECK2-NEXT:    [[ADD1:%.*]] = add i64 131071, [[MUL]]
 // CHECK2-NEXT:    store i64 [[ADD1]], i64* [[I]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK2-NEXT:    [[TMP6:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK2-NEXT:    [[TMP7:%.*]] = load i64, i64* [[I]], align 8
@@ -867,6 +920,7 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
 // CHECK2-NEXT:    [[ADD7:%.*]] = add i64 [[TMP17]], 1
 // CHECK2-NEXT:    store i64 [[ADD7]], i64* [[DOTOMP_IV]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_fini_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK2:       omp.inner.for.end:
@@ -874,7 +928,9 @@ void foo_simd(int low, int up) {
 // CHECK2:       omp.dispatch.inc:
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK2:       omp.dispatch.end:
-// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
+// CHECK2-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK2-NEXT:    ret void
 //
 //
@@ -900,7 +956,7 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[I8:%.*]] = alloca i8, align 1
 // CHECK2-NEXT:    [[X9:%.*]] = alloca i32, align 4
-// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK2-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK2-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -933,9 +989,11 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
 // CHECK2-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK2-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8:[0-9]+]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1073741894, i64 0, i64 [[TMP6]], i64 1, i64 1)
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK2:       omp.dispatch.cond:
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
 // CHECK2-NEXT:    [[TMP7:%.*]] = call i32 @__kmpc_dispatch_next_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
 // CHECK2-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP7]], 0
 // CHECK2-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -966,6 +1024,7 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    [[SUB20:%.*]] = sub nsw i64 11, [[MUL19]]
 // CHECK2-NEXT:    [[CONV21:%.*]] = trunc i64 [[SUB20]] to i32
 // CHECK2-NEXT:    store i32 [[CONV21]], i32* [[X9]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK2-NEXT:    [[TMP15:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK2-NEXT:    [[TMP16:%.*]] = load i8, i8* [[I8]], align 1
@@ -997,6 +1056,7 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    [[TMP26:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
 // CHECK2-NEXT:    [[ADD30:%.*]] = add nsw i64 [[TMP26]], 1
 // CHECK2-NEXT:    store i64 [[ADD30]], i64* [[DOTOMP_IV]], align 8
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_fini_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK2:       omp.inner.for.end:
@@ -1006,7 +1066,9 @@ void foo_simd(int low, int up) {
 // CHECK2:       omp.dispatch.end:
 // CHECK2-NEXT:    br label [[OMP_PRECOND_END]]
 // CHECK2:       omp.precond.end:
-// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
+// CHECK2-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK2-NEXT:    ret void
 //
 //
@@ -1027,7 +1089,7 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[I:%.*]] = alloca i8, align 1
 // CHECK2-NEXT:    [[X2:%.*]] = alloca i32, align 4
-// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK2-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK2-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -1037,9 +1099,11 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    store i32 199, i32* [[DOTOMP_UB]], align 4
 // CHECK2-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK2-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10:[0-9]+]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1073741893, i32 0, i32 199, i32 1, i32 1)
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK2:       omp.dispatch.cond:
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
 // CHECK2-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK2-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK2-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -1067,6 +1131,7 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[SUB]], 1
 // CHECK2-NEXT:    [[ADD6:%.*]] = add nsw i32 -10, [[MUL5]]
 // CHECK2-NEXT:    store i32 [[ADD6]], i32* [[X2]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK2-NEXT:    [[TMP8:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK2-NEXT:    [[TMP9:%.*]] = load i8, i8* [[I]], align 1
@@ -1098,6 +1163,7 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
 // CHECK2-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP19]], 1
 // CHECK2-NEXT:    store i32 [[ADD15]], i32* [[DOTOMP_IV]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK2:       omp.inner.for.end:
@@ -1105,7 +1171,9 @@ void foo_simd(int low, int up) {
 // CHECK2:       omp.dispatch.inc:
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK2:       omp.dispatch.end:
-// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
+// CHECK2-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK2-NEXT:    ret void
 //
 //
@@ -1132,7 +1200,7 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[I28:%.*]] = alloca i32, align 4
-// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT:    store i32 [[LOW]], i32* [[LOW_ADDR]], align 4
 // CHECK2-NEXT:    store i32 [[UP]], i32* [[UP_ADDR]], align 4
 // CHECK2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[LOW_ADDR]], align 4
@@ -1219,9 +1287,11 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK2-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK2-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_20]], align 4
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12:[0-9]+]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 [[TMP25]], i32 1, i32 1)
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK2:       omp.dispatch.cond:
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
 // CHECK2-NEXT:    [[TMP26:%.*]] = call i32 @__kmpc_dispatch_next_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK2-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP26]], 0
 // CHECK2-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -1229,13 +1299,15 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
 // CHECK2-NEXT:    store i32 [[TMP27]], i32* [[DOTOMP_IV16]], align 4
 // CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND29:%.*]]
-// CHECK2:       omp.inner.for.cond29:
+// CHECK2-IRBUILDER:       omp.inner.for.cond30:
+// CHECK2-NORMAL:       omp.inner.for.cond29:
 // CHECK2-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK2-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group !7
 // CHECK2-NEXT:    [[ADD30:%.*]] = add i32 [[TMP29]], 1
 // CHECK2-NEXT:    [[CMP31:%.*]] = icmp ult i32 [[TMP28]], [[ADD30]]
 // CHECK2-NEXT:    br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END40:%.*]]
-// CHECK2:       omp.inner.for.body32:
+// CHECK2-IRBUILDER:       omp.inner.for.body33:
+// CHECK2-NORMAL:       omp.inner.for.body32:
 // CHECK2-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group !7
 // CHECK2-NEXT:    [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK2-NEXT:    [[MUL33:%.*]] = mul i32 [[TMP31]], 1
@@ -1247,15 +1319,19 @@ void foo_simd(int low, int up) {
 // CHECK2-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX36]], align 4, !llvm.access.group !7
 // CHECK2-NEXT:    call void @__captured_stmt.1(i32* [[I28]]), !llvm.access.group !7
 // CHECK2-NEXT:    br label [[OMP_BODY_CONTINUE37:%.*]]
-// CHECK2:       omp.body.continue37:
+// CHECK2-IRBUILDER:       omp.body.continue38:
+// CHECK2-NORMAL:       omp.body.continue37:
 // CHECK2-NEXT:    br label [[OMP_INNER_FOR_INC38:%.*]]
-// CHECK2:       omp.inner.for.inc38:
+// CHECK2-IRBUILDER:       omp.inner.for.inc39:
+// CHECK2-NORMAL:       omp.inner.for.inc38:
 // CHECK2-NEXT:    [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK2-NEXT:    [[ADD39:%.*]] = add i32 [[TMP33]], 1
 // CHECK2-NEXT:    store i32 [[ADD39]], i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
 // CHECK2-NEXT:    call void @__kmpc_dispatch_fini_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]), !llvm.access.group !7
 // CHECK2-NEXT:    br label [[OMP_INNER_FOR_COND29]], !llvm.loop [[LOOP8:![0-9]+]]
-// CHECK2:       omp.inner.for.end40:
+// CHECK2-IRBUILDER:       omp.inner.for.end42:
+// CHECK2-NORMAL:       omp.inner.for.end40:
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
 // CHECK2:       omp.dispatch.inc:
 // CHECK2-NEXT:    br label [[OMP_DISPATCH_COND]]
@@ -1278,7 +1354,9 @@ void foo_simd(int low, int up) {
 // CHECK2:       .omp.final.done:
 // CHECK2-NEXT:    br label [[OMP_PRECOND_END]]
 // CHECK2:       omp.precond.end:
-// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK2-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
+// CHECK2-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK2-NEXT:    ret void
 //
 //
@@ -1322,7 +1400,7 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[I:%.*]] = alloca i32, align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK3-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
 // CHECK3-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK3-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK3-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -1331,9 +1409,12 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    store i32 4571423, i32* [[DOTOMP_UB]], align 4
 // CHECK3-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK3-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK3-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+// CHECK3-NORMAL-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK3:       omp.dispatch.cond:
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
 // CHECK3-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK3-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK3-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -1351,6 +1432,7 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP5]], 7
 // CHECK3-NEXT:    [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]]
 // CHECK3-NEXT:    store i32 [[SUB]], i32* [[I]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK3-NEXT:    [[TMP6:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[I]], align 4
@@ -1382,6 +1464,7 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], 1
 // CHECK3-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK3:       omp.inner.for.end:
@@ -1389,7 +1472,9 @@ void foo_simd(int low, int up) {
 // CHECK3:       omp.dispatch.inc:
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK3:       omp.dispatch.end:
-// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP0]])
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP0]])
+// CHECK3-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP0]])
 // CHECK3-NEXT:    ret void
 //
 //
@@ -1407,7 +1492,7 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
 // CHECK3-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[I:%.*]] = alloca i64, align 8
-// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK3-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK3-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -1416,9 +1501,11 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    store i64 16908287, i64* [[DOTOMP_UB]], align 8
 // CHECK3-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
 // CHECK3-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 67, i64 0, i64 16908287, i64 1, i64 1)
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK3:       omp.dispatch.cond:
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
 // CHECK3-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
 // CHECK3-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK3-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -1437,6 +1524,7 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    [[MUL:%.*]] = mul i64 [[TMP5]], 127
 // CHECK3-NEXT:    [[ADD1:%.*]] = add i64 131071, [[MUL]]
 // CHECK3-NEXT:    store i64 [[ADD1]], i64* [[I]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK3-NEXT:    [[TMP6:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK3-NEXT:    [[TMP7:%.*]] = load i64, i64* [[I]], align 8
@@ -1464,6 +1552,7 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
 // CHECK3-NEXT:    [[ADD7:%.*]] = add i64 [[TMP17]], 1
 // CHECK3-NEXT:    store i64 [[ADD7]], i64* [[DOTOMP_IV]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_fini_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK3:       omp.inner.for.end:
@@ -1471,7 +1560,9 @@ void foo_simd(int low, int up) {
 // CHECK3:       omp.dispatch.inc:
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK3:       omp.dispatch.end:
-// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
+// CHECK3-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK3-NEXT:    ret void
 //
 //
@@ -1497,7 +1588,7 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[I8:%.*]] = alloca i8, align 1
 // CHECK3-NEXT:    [[X9:%.*]] = alloca i32, align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK3-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK3-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -1530,9 +1621,11 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
 // CHECK3-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8:[0-9]+]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 70, i64 0, i64 [[TMP6]], i64 1, i64 1)
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK3:       omp.dispatch.cond:
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
 // CHECK3-NEXT:    [[TMP7:%.*]] = call i32 @__kmpc_dispatch_next_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
 // CHECK3-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP7]], 0
 // CHECK3-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -1563,6 +1656,7 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    [[SUB20:%.*]] = sub nsw i64 11, [[MUL19]]
 // CHECK3-NEXT:    [[CONV21:%.*]] = trunc i64 [[SUB20]] to i32
 // CHECK3-NEXT:    store i32 [[CONV21]], i32* [[X9]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK3-NEXT:    [[TMP15:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK3-NEXT:    [[TMP16:%.*]] = load i8, i8* [[I8]], align 1
@@ -1594,6 +1688,7 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    [[TMP26:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
 // CHECK3-NEXT:    [[ADD30:%.*]] = add nsw i64 [[TMP26]], 1
 // CHECK3-NEXT:    store i64 [[ADD30]], i64* [[DOTOMP_IV]], align 8
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_fini_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK3:       omp.inner.for.end:
@@ -1603,7 +1698,9 @@ void foo_simd(int low, int up) {
 // CHECK3:       omp.dispatch.end:
 // CHECK3-NEXT:    br label [[OMP_PRECOND_END]]
 // CHECK3:       omp.precond.end:
-// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
+// CHECK3-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK3-NEXT:    ret void
 //
 //
@@ -1624,7 +1721,7 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[I:%.*]] = alloca i8, align 1
 // CHECK3-NEXT:    [[X2:%.*]] = alloca i32, align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK3-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK3-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -1634,9 +1731,11 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    store i32 199, i32* [[DOTOMP_UB]], align 4
 // CHECK3-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK3-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10:[0-9]+]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 69, i32 0, i32 199, i32 1, i32 1)
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK3:       omp.dispatch.cond:
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
 // CHECK3-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK3-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK3-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -1664,6 +1763,7 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[SUB]], 1
 // CHECK3-NEXT:    [[ADD6:%.*]] = add nsw i32 -10, [[MUL5]]
 // CHECK3-NEXT:    store i32 [[ADD6]], i32* [[X2]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK3-NEXT:    [[TMP8:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK3-NEXT:    [[TMP9:%.*]] = load i8, i8* [[I]], align 1
@@ -1695,6 +1795,7 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
 // CHECK3-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP19]], 1
 // CHECK3-NEXT:    store i32 [[ADD15]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK3:       omp.inner.for.end:
@@ -1702,7 +1803,9 @@ void foo_simd(int low, int up) {
 // CHECK3:       omp.dispatch.inc:
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK3:       omp.dispatch.end:
-// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
+// CHECK3-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK3-NEXT:    ret void
 //
 //
@@ -1729,7 +1832,7 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[I28:%.*]] = alloca i32, align 4
-// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT:    store i32 [[LOW]], i32* [[LOW_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[UP]], i32* [[UP_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, i32* [[LOW_ADDR]], align 4
@@ -1816,9 +1919,11 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK3-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK3-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_20]], align 4
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12:[0-9]+]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 [[TMP25]], i32 1, i32 1)
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK3:       omp.dispatch.cond:
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
 // CHECK3-NEXT:    [[TMP26:%.*]] = call i32 @__kmpc_dispatch_next_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK3-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP26]], 0
 // CHECK3-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -1826,13 +1931,15 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
 // CHECK3-NEXT:    store i32 [[TMP27]], i32* [[DOTOMP_IV16]], align 4
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND29:%.*]]
-// CHECK3:       omp.inner.for.cond29:
+// CHECK3-IRBUILDER:       omp.inner.for.cond30:
+// CHECK3-NORMAL:       omp.inner.for.cond29:
 // CHECK3-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK3-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group !7
 // CHECK3-NEXT:    [[ADD30:%.*]] = add i32 [[TMP29]], 1
 // CHECK3-NEXT:    [[CMP31:%.*]] = icmp ult i32 [[TMP28]], [[ADD30]]
 // CHECK3-NEXT:    br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END40:%.*]]
-// CHECK3:       omp.inner.for.body32:
+// CHECK3-IRBUILDER:       omp.inner.for.body33:
+// CHECK3-NORMAL:       omp.inner.for.body32:
 // CHECK3-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group !7
 // CHECK3-NEXT:    [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK3-NEXT:    [[MUL33:%.*]] = mul i32 [[TMP31]], 1
@@ -1844,15 +1951,19 @@ void foo_simd(int low, int up) {
 // CHECK3-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX36]], align 4, !llvm.access.group !7
 // CHECK3-NEXT:    call void @__captured_stmt.1(i32* [[I28]]), !llvm.access.group !7
 // CHECK3-NEXT:    br label [[OMP_BODY_CONTINUE37:%.*]]
-// CHECK3:       omp.body.continue37:
+// CHECK3-IRBUILDER:       omp.body.continue38:
+// CHECK3-NORMAL:       omp.body.continue37:
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_INC38:%.*]]
-// CHECK3:       omp.inner.for.inc38:
+// CHECK3-IRBUILDER:       omp.inner.for.inc39:
+// CHECK3-NORMAL:       omp.inner.for.inc38:
 // CHECK3-NEXT:    [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK3-NEXT:    [[ADD39:%.*]] = add i32 [[TMP33]], 1
 // CHECK3-NEXT:    store i32 [[ADD39]], i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
 // CHECK3-NEXT:    call void @__kmpc_dispatch_fini_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]), !llvm.access.group !7
 // CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND29]], !llvm.loop [[LOOP8:![0-9]+]]
-// CHECK3:       omp.inner.for.end40:
+// CHECK3-IRBUILDER:       omp.inner.for.end42:
+// CHECK3-NORMAL:       omp.inner.for.end40:
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
 // CHECK3:       omp.dispatch.inc:
 // CHECK3-NEXT:    br label [[OMP_DISPATCH_COND]]
@@ -1875,7 +1986,9 @@ void foo_simd(int low, int up) {
 // CHECK3:       .omp.final.done:
 // CHECK3-NEXT:    br label [[OMP_PRECOND_END]]
 // CHECK3:       omp.precond.end:
-// CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK3-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
+// CHECK3-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK3-NEXT:    ret void
 //
 //
@@ -1919,7 +2032,7 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[I:%.*]] = alloca i32, align 4
-// CHECK4-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK4-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
 // CHECK4-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK4-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK4-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -1928,9 +2041,12 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    store i32 4571423, i32* [[DOTOMP_UB]], align 4
 // CHECK4-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK4-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK4-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
+// CHECK4-NORMAL-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 4571423, i32 1, i32 1)
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK4:       omp.dispatch.cond:
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
 // CHECK4-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK4-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK4-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -1948,6 +2064,7 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP5]], 7
 // CHECK4-NEXT:    [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]]
 // CHECK4-NEXT:    store i32 [[SUB]], i32* [[I]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK4-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK4-NEXT:    [[TMP6:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK4-NEXT:    [[TMP7:%.*]] = load i32, i32* [[I]], align 4
@@ -1979,6 +2096,7 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
 // CHECK4-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], 1
 // CHECK4-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK4-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK4:       omp.inner.for.end:
@@ -1986,7 +2104,9 @@ void foo_simd(int low, int up) {
 // CHECK4:       omp.dispatch.inc:
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK4:       omp.dispatch.end:
-// CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP0]])
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP0]])
+// CHECK4-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP0]])
 // CHECK4-NEXT:    ret void
 //
 //
@@ -2004,7 +2124,7 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
 // CHECK4-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[I:%.*]] = alloca i64, align 8
-// CHECK4-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK4-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK4-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK4-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -2013,9 +2133,11 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    store i64 16908287, i64* [[DOTOMP_UB]], align 8
 // CHECK4-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
 // CHECK4-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 67, i64 0, i64 16908287, i64 1, i64 1)
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK4:       omp.dispatch.cond:
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
 // CHECK4-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
 // CHECK4-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK4-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -2034,6 +2156,7 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    [[MUL:%.*]] = mul i64 [[TMP5]], 127
 // CHECK4-NEXT:    [[ADD1:%.*]] = add i64 131071, [[MUL]]
 // CHECK4-NEXT:    store i64 [[ADD1]], i64* [[I]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK4-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK4-NEXT:    [[TMP6:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK4-NEXT:    [[TMP7:%.*]] = load i64, i64* [[I]], align 8
@@ -2061,6 +2184,7 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
 // CHECK4-NEXT:    [[ADD7:%.*]] = add i64 [[TMP17]], 1
 // CHECK4-NEXT:    store i64 [[ADD7]], i64* [[DOTOMP_IV]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_fini_8u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK4-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK4:       omp.inner.for.end:
@@ -2068,7 +2192,9 @@ void foo_simd(int low, int up) {
 // CHECK4:       omp.dispatch.inc:
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK4:       omp.dispatch.end:
-// CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
+// CHECK4-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK4-NEXT:    ret void
 //
 //
@@ -2094,7 +2220,7 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[I8:%.*]] = alloca i8, align 1
 // CHECK4-NEXT:    [[X9:%.*]] = alloca i32, align 4
-// CHECK4-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK4-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK4-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK4-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -2127,9 +2253,11 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    store i64 1, i64* [[DOTOMP_STRIDE]], align 8
 // CHECK4-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK4-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8:[0-9]+]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 70, i64 0, i64 [[TMP6]], i64 1, i64 1)
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK4:       omp.dispatch.cond:
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
 // CHECK4-NEXT:    [[TMP7:%.*]] = call i32 @__kmpc_dispatch_next_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]])
 // CHECK4-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP7]], 0
 // CHECK4-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -2160,6 +2288,7 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    [[SUB20:%.*]] = sub nsw i64 11, [[MUL19]]
 // CHECK4-NEXT:    [[CONV21:%.*]] = trunc i64 [[SUB20]] to i32
 // CHECK4-NEXT:    store i32 [[CONV21]], i32* [[X9]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK4-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK4-NEXT:    [[TMP15:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK4-NEXT:    [[TMP16:%.*]] = load i8, i8* [[I8]], align 1
@@ -2191,6 +2320,7 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    [[TMP26:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
 // CHECK4-NEXT:    [[ADD30:%.*]] = add nsw i64 [[TMP26]], 1
 // CHECK4-NEXT:    store i64 [[ADD30]], i64* [[DOTOMP_IV]], align 8
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB8]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_fini_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK4-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK4:       omp.inner.for.end:
@@ -2200,7 +2330,9 @@ void foo_simd(int low, int up) {
 // CHECK4:       omp.dispatch.end:
 // CHECK4-NEXT:    br label [[OMP_PRECOND_END]]
 // CHECK4:       omp.precond.end:
-// CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
+// CHECK4-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK4-NEXT:    ret void
 //
 //
@@ -2221,7 +2353,7 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[I:%.*]] = alloca i8, align 1
 // CHECK4-NEXT:    [[X2:%.*]] = alloca i32, align 4
-// CHECK4-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK4-NEXT:    store float* [[A]], float** [[A_ADDR]], align 8
 // CHECK4-NEXT:    store float* [[B]], float** [[B_ADDR]], align 8
 // CHECK4-NEXT:    store float* [[C]], float** [[C_ADDR]], align 8
@@ -2231,9 +2363,11 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    store i32 199, i32* [[DOTOMP_UB]], align 4
 // CHECK4-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK4-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10:[0-9]+]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 69, i32 0, i32 199, i32 1, i32 1)
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK4:       omp.dispatch.cond:
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
 // CHECK4-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK4-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
 // CHECK4-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -2261,6 +2395,7 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[SUB]], 1
 // CHECK4-NEXT:    [[ADD6:%.*]] = add nsw i32 -10, [[MUL5]]
 // CHECK4-NEXT:    store i32 [[ADD6]], i32* [[X2]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK4-NEXT:    call void @__kmpc_ordered(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK4-NEXT:    [[TMP8:%.*]] = load float*, float** [[B_ADDR]], align 8
 // CHECK4-NEXT:    [[TMP9:%.*]] = load i8, i8* [[I]], align 1
@@ -2292,6 +2427,7 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
 // CHECK4-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP19]], 1
 // CHECK4-NEXT:    store i32 [[ADD15]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB10]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
 // CHECK4-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK4:       omp.inner.for.end:
@@ -2299,7 +2435,9 @@ void foo_simd(int low, int up) {
 // CHECK4:       omp.dispatch.inc:
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK4:       omp.dispatch.end:
-// CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
+// CHECK4-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK4-NEXT:    ret void
 //
 //
@@ -2326,7 +2464,7 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    [[I28:%.*]] = alloca i32, align 4
-// CHECK4-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NORMAL-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK4-NEXT:    store i32 [[LOW]], i32* [[LOW_ADDR]], align 4
 // CHECK4-NEXT:    store i32 [[UP]], i32* [[UP_ADDR]], align 4
 // CHECK4-NEXT:    [[TMP1:%.*]] = load i32, i32* [[LOW_ADDR]], align 4
@@ -2413,9 +2551,11 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 // CHECK4-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 // CHECK4-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_20]], align 4
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12:[0-9]+]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 66, i32 0, i32 [[TMP25]], i32 1, i32 1)
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK4:       omp.dispatch.cond:
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
 // CHECK4-NEXT:    [[TMP26:%.*]] = call i32 @__kmpc_dispatch_next_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
 // CHECK4-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP26]], 0
 // CHECK4-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
@@ -2423,13 +2563,15 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
 // CHECK4-NEXT:    store i32 [[TMP27]], i32* [[DOTOMP_IV16]], align 4
 // CHECK4-NEXT:    br label [[OMP_INNER_FOR_COND29:%.*]]
-// CHECK4:       omp.inner.for.cond29:
+// CHECK4-IRBUILDER:       omp.inner.for.cond30:
+// CHECK4-NORMAL:       omp.inner.for.cond29:
 // CHECK4-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK4-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group !7
 // CHECK4-NEXT:    [[ADD30:%.*]] = add i32 [[TMP29]], 1
 // CHECK4-NEXT:    [[CMP31:%.*]] = icmp ult i32 [[TMP28]], [[ADD30]]
 // CHECK4-NEXT:    br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END40:%.*]]
-// CHECK4:       omp.inner.for.body32:
+// CHECK4-IRBUILDER:       omp.inner.for.body33:
+// CHECK4-NORMAL:       omp.inner.for.body32:
 // CHECK4-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group !7
 // CHECK4-NEXT:    [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK4-NEXT:    [[MUL33:%.*]] = mul i32 [[TMP31]], 1
@@ -2441,15 +2583,19 @@ void foo_simd(int low, int up) {
 // CHECK4-NEXT:    store float 0.000000e+00, float* [[ARRAYIDX36]], align 4, !llvm.access.group !7
 // CHECK4-NEXT:    call void @__captured_stmt.1(i32* [[I28]]), !llvm.access.group !7
 // CHECK4-NEXT:    br label [[OMP_BODY_CONTINUE37:%.*]]
-// CHECK4:       omp.body.continue37:
+// CHECK4-IRBUILDER:       omp.body.continue38:
+// CHECK4-NORMAL:       omp.body.continue37:
 // CHECK4-NEXT:    br label [[OMP_INNER_FOR_INC38:%.*]]
-// CHECK4:       omp.inner.for.inc38:
+// CHECK4-IRBUILDER:       omp.inner.for.inc39:
+// CHECK4-NORMAL:       omp.inner.for.inc38:
 // CHECK4-NEXT:    [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
 // CHECK4-NEXT:    [[ADD39:%.*]] = add i32 [[TMP33]], 1
 // CHECK4-NEXT:    store i32 [[ADD39]], i32* [[DOTOMP_IV16]], align 4, !llvm.access.group !7
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12]])
 // CHECK4-NEXT:    call void @__kmpc_dispatch_fini_4u(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]), !llvm.access.group !7
 // CHECK4-NEXT:    br label [[OMP_INNER_FOR_COND29]], !llvm.loop [[LOOP8:![0-9]+]]
-// CHECK4:       omp.inner.for.end40:
+// CHECK4-IRBUILDER:       omp.inner.for.end42:
+// CHECK4-NORMAL:       omp.inner.for.end40:
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
 // CHECK4:       omp.dispatch.inc:
 // CHECK4-NEXT:    br label [[OMP_DISPATCH_COND]]
@@ -2472,7 +2618,9 @@ void foo_simd(int low, int up) {
 // CHECK4:       .omp.final.done:
 // CHECK4-NEXT:    br label [[OMP_PRECOND_END]]
 // CHECK4:       omp.precond.end:
-// CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
+// CHECK4-IRBUILDER-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-IRBUILDER-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP0]])
+// CHECK4-NORMAL-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
 // CHECK4-NEXT:    ret void
 //
 //

diff  --git a/clang/test/OpenMP/ordered_doacross_codegen.c b/clang/test/OpenMP/ordered_doacross_codegen.c
index 734d97a4398ab..5e82a34db906e 100644
--- a/clang/test/OpenMP/ordered_doacross_codegen.c
+++ b/clang/test/OpenMP/ordered_doacross_codegen.c
@@ -1,6 +1,10 @@
-// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NORMAL
 // RUN: %clang_cc1 -fopenmp -triple x86_64-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -triple x86_64-unknown-unknown -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -triple x86_64-unknown-unknown -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NORMAL
+
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-enable-irbuilder -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-IRBUILDER
+// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -triple x86_64-unknown-unknown -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-IRBUILDER
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
 // RUN: %clang_cc1 -fopenmp-simd -triple x86_64-unknown-unknown -emit-pch -o %t %s
@@ -20,7 +24,7 @@ void foo();
 int main() {
   int i;
 // CHECK: [[DIMS:%.+]] = alloca [1 x [[KMP_DIM]]],
-// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK-NORMAL: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
 // CHECK: icmp
 // CHECK-NEXT: br i1 %
 // CHECK: [[CAST:%.+]] = bitcast [1 x [[KMP_DIM]]]* [[DIMS]] to i8*
@@ -32,13 +36,13 @@ int main() {
 // CHECK: store i64 1, i64* %
 // CHECK: [[DIM:%.+]] = getelementptr inbounds [1 x [[KMP_DIM]]], [1 x [[KMP_DIM]]]* [[DIMS]], i64 0, i64 0
 // CHECK: [[CAST:%.+]] = bitcast [[KMP_DIM]]* [[DIM]] to i8*
-// CHECK: call void @__kmpc_doacross_init([[IDENT]], i32 [[GTID]], i32 1, i8* [[CAST]])
-// CHECK: call void @__kmpc_for_static_init_4(%struct.ident_t* @{{.+}}, i32 [[GTID]], i32 33, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
+// CHECK-NORMAL: call void @__kmpc_doacross_init([[IDENT]], i32 [[GTID]], i32 1, i8* [[CAST]])
+// CHECK-NORMAL: call void @__kmpc_for_static_init_4(%struct.ident_t* @{{.+}}, i32 [[GTID]], i32 33, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
 #pragma omp for ordered(1)
   for (i = 0; i < n; ++i) {
     a[i] = b[i] + 1;
     foo();
-// CHECK: call void [[FOO:.+]](
+// CHECK: call void (...) [[FOO:.+]](
 // CHECK: load i32, i32* [[I:%.+]],
 // CHECK-NEXT: sub nsw i32 %{{.+}}, 0
 // CHECK-NEXT: sdiv i32 %{{.+}}, 1
@@ -46,11 +50,13 @@ int main() {
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [1 x i64], [1 x i64]* [[CNT:%.+]], i64 0, i64 0
 // CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP]],
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [1 x i64], [1 x i64]* [[CNT]], i64 0, i64 0
-// CHECK-NEXT: call void @__kmpc_doacross_post([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-NORMAL-NEXT: call void @__kmpc_doacross_post([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-IRBUILDER-NEXT: [[GTID1:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK-IRBUILDER-NEXT: call void @__kmpc_doacross_post([[IDENT]], i32 [[GTID1]], i64* [[TMP]])
 #pragma omp ordered depend(source)
     c[i] = c[i] + 1;
     foo();
-// CHECK: call void [[FOO]]
+// CHECK: call void (...) [[FOO]]
 // CHECK: load i32, i32* [[I]],
 // CHECK-NEXT: sub nsw i32 %{{.+}}, 2
 // CHECK-NEXT: sub nsw i32 %{{.+}}, 0
@@ -59,12 +65,14 @@ int main() {
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [1 x i64], [1 x i64]* [[CNT:%.+]], i64 0, i64 0
 // CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP]],
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [1 x i64], [1 x i64]* [[CNT]], i64 0, i64 0
-// CHECK-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-NORMAL-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-IRBUILDER-NEXT: [[GTID2:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK-IRBUILDER-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID2]], i64* [[TMP]])
 #pragma omp ordered depend(sink : i - 2)
     d[i] = a[i - 2];
   }
   // CHECK: call void @__kmpc_for_static_fini(
-  // CHECK: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
+  // CHECK-NORMAL: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
   // CHECK: ret i32 0
   return 0;
 }

diff  --git a/clang/test/OpenMP/ordered_doacross_codegen.cpp b/clang/test/OpenMP/ordered_doacross_codegen.cpp
index 00deb6b192a60..1893ed07d13c4 100644
--- a/clang/test/OpenMP/ordered_doacross_codegen.cpp
+++ b/clang/test/OpenMP/ordered_doacross_codegen.cpp
@@ -1,6 +1,10 @@
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NORMAL
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NORMAL
+
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefixes=CHECK,CHECK-IRBUILDER
+// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-IRBUILDER
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
@@ -37,7 +41,7 @@ void bar() {
 int main() {
   int i;
 // CHECK: [[DIMS:%.+]] = alloca [1 x [[KMP_DIM]]],
-// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK-NORMAL: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
 // CHECK: icmp
 // CHECK-NEXT: br i1 %
 // CHECK: [[CAST:%.+]] = bitcast [1 x [[KMP_DIM]]]* [[DIMS]] to i8*
@@ -49,8 +53,8 @@ int main() {
 // CHECK: store i64 1, i64* %
 // CHECK: [[DIM:%.+]] = getelementptr inbounds [1 x [[KMP_DIM]]], [1 x [[KMP_DIM]]]* [[DIMS]], i64 0, i64 0
 // CHECK: [[CAST:%.+]] = bitcast [[KMP_DIM]]* [[DIM]] to i8*
-// CHECK: call void @__kmpc_doacross_init([[IDENT]], i32 [[GTID]], i32 1, i8* [[CAST]])
-// CHECK: call void @__kmpc_for_static_init_4(%struct.ident_t* @{{.+}}, i32 [[GTID]], i32 33, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
+// CHECK-NORMAL: call void @__kmpc_doacross_init([[IDENT]], i32 [[GTID]], i32 1, i8* [[CAST]])
+// CHECK-NORMAL: call void @__kmpc_for_static_init_4(%struct.ident_t* @{{.+}}, i32 [[GTID]], i32 33, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
 #pragma omp for ordered(1)
   for (int i = 0; i < n; ++i) {
     a[i] = b[i] + 1;
@@ -63,7 +67,9 @@ int main() {
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [1 x i64], [1 x i64]* [[CNT:%.+]], i64 0, i64 0
 // CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP]],
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [1 x i64], [1 x i64]* [[CNT]], i64 0, i64 0
-// CHECK-NEXT: call void @__kmpc_doacross_post([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-NORMAL-NEXT: call void @__kmpc_doacross_post([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-IRBUILDER-NEXT: [[GTID18:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK-IRBUILDER-NEXT: call void @__kmpc_doacross_post([[IDENT]], i32 [[GTID18]], i64* [[TMP]])
 #pragma omp ordered depend(source)
     c[i] = c[i] + 1;
     foo();
@@ -76,16 +82,18 @@ int main() {
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [1 x i64], [1 x i64]* [[CNT:%.+]], i64 0, i64 0
 // CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP]],
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [1 x i64], [1 x i64]* [[CNT]], i64 0, i64 0
-// CHECK-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-NORMAL-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-IRBUILDER-NEXT: [[GTID30:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK-IRBUILDER-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID30]], i64* [[TMP]])
 #pragma omp ordered depend(sink : i - 2)
     d[i] = a[i - 2];
   }
   // CHECK: landingpad
-  // CHECK: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
+  // CHECK-NORMAL: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
   // CHECK: br label %
 
   // CHECK: call void @__kmpc_for_static_fini(
-  // CHECK: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
+  // CHECK-NORMAL: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
   // CHECK: ret i32 0
   return 0;
 }
@@ -93,7 +101,7 @@ int main() {
 // CHECK-LABEL: main1
 int main1() {
 // CHECK: [[DIMS:%.+]] = alloca [1 x [[KMP_DIM]]],
-// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK-NORMAL: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
 // CHECK: icmp
 // CHECK-NEXT: br i1 %
 // CHECK: [[CAST:%.+]] = bitcast [1 x [[KMP_DIM]]]* [[DIMS]] to i8*
@@ -105,8 +113,8 @@ int main1() {
 // CHECK: store i64 1, i64* %
 // CHECK: [[DIM:%.+]] = getelementptr inbounds [1 x [[KMP_DIM]]], [1 x [[KMP_DIM]]]* [[DIMS]], i64 0, i64 0
 // CHECK: [[CAST:%.+]] = bitcast [[KMP_DIM]]* [[DIM]] to i8*
-// CHECK: call void @__kmpc_doacross_init([[IDENT]], i32 [[GTID]], i32 1, i8* [[CAST]])
-// CHECK: call void @__kmpc_for_static_init_4(%struct.ident_t* @{{.+}}, i32 [[GTID]], i32 33, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
+// CHECK-NORMAL: call void @__kmpc_doacross_init([[IDENT]], i32 [[GTID]], i32 1, i8* [[CAST]])
+// CHECK-NORMAL: call void @__kmpc_for_static_init_4(%struct.ident_t* @{{.+}}, i32 [[GTID]], i32 33, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
 #pragma omp for ordered(1)
   for (int i = n; i > 0; --i) {
     a[i] = b[i] + 1;
@@ -120,7 +128,9 @@ int main1() {
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [1 x i64], [1 x i64]* [[CNT:%.+]], i64 0, i64 0
 // CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP]],
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [1 x i64], [1 x i64]* [[CNT]], i64 0, i64 0
-// CHECK-NEXT: call void @__kmpc_doacross_post([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-NORMAL-NEXT: call void @__kmpc_doacross_post([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-IRBUILDER-NEXT: [[GTID17:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK-IRBUILDER-NEXT: call void @__kmpc_doacross_post([[IDENT]], i32 [[GTID17]], i64* [[TMP]])
 #pragma omp ordered depend(source)
     c[i] = c[i] + 1;
     foo();
@@ -134,16 +144,18 @@ int main1() {
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [1 x i64], [1 x i64]* [[CNT:%.+]], i64 0, i64 0
 // CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP]],
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [1 x i64], [1 x i64]* [[CNT]], i64 0, i64 0
-// CHECK-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-NORMAL-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-IRBUILDER-NEXT: [[GTID29:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK-IRBUILDER-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID29]], i64* [[TMP]])
 #pragma omp ordered depend(sink : i - 2)
     d[i] = a[i - 2];
   }
   // CHECK: landingpad
-  // CHECK: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
+  // CHECK-NORMAL: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
   // CHECK: br label %
 
   // CHECK: call void @__kmpc_for_static_fini(
-  // CHECK: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
+  // CHECK-NORMAL: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
   // CHECK: ret i32 0
   return 0;
 }
@@ -161,7 +173,7 @@ struct TestStruct {
   void baz(T, T);
   TestStruct() {
 // CHECK: [[DIMS:%.+]] = alloca [2 x [[KMP_DIM]]],
-// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK-NORMAL: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
 // CHECK: [[CAST:%.+]] = bitcast [2 x [[KMP_DIM]]]* [[DIMS]] to i8*
 // CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 [[CAST]], i8 0, i64 48, i1 false)
 // CHECK: [[DIM:%.+]] = getelementptr inbounds [2 x [[KMP_DIM]]], [2 x [[KMP_DIM]]]* [[DIMS]], i64 0, i64 0
@@ -176,8 +188,8 @@ struct TestStruct {
 // CHECK: store i64 1, i64* %
 // CHECK: [[DIM:%.+]] = getelementptr inbounds [2 x [[KMP_DIM]]], [2 x [[KMP_DIM]]]* [[DIMS]], i64 0, i64 0
 // CHECK: [[CAST:%.+]] = bitcast [[KMP_DIM]]* [[DIM]] to i8*
-// CHECK: call void @__kmpc_doacross_init([[IDENT]], i32 [[GTID]], i32 2, i8* [[CAST]])
-// CHECK: call void @__kmpc_for_static_init_4(%struct.ident_t* @{{.+}}, i32 [[GTID]], i32 33, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
+// CHECK-NORMAL: call void @__kmpc_doacross_init([[IDENT]], i32 [[GTID]], i32 2, i8* [[CAST]])
+// CHECK-NORMAL: call void @__kmpc_for_static_init_4(%struct.ident_t* @{{.+}}, i32 [[GTID]], i32 33, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32 1, i32 1)
 #pragma omp for ordered(2)
     for (T j = 0; j < M; j++)
       for (i = 0; i < n; i += 2) {
@@ -190,34 +202,42 @@ struct TestStruct {
 // CHECK-NEXT: sub nsw i32 %{{.+}}, 0
 // CHECK-NEXT: sdiv i32 %{{.+}}, 1
 // CHECK-NEXT: sext i32 %{{.+}} to i64
-// CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [2 x i64], [2 x i64]* [[CNT:%.+]], i64 0, i64 0
-// CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP:%.+]],
+// CHECK-NORMAL-NEXT: [[TMP:%.+]] = getelementptr inbounds [2 x i64], [2 x i64]* [[CNT:%.+]], i64 0, i64 0
+// CHECK-NORMAL-NEXT: store i64 %{{.+}}, i64* [[TMP:%.+]],
 // CHECK-NEXT: [[I:%.+]] = load i32*, i32** [[I_REF:%.+]],
 // CHECK-NEXT: load i32, i32* [[I]],
 // CHECK-NEXT: sub nsw i32 %{{.+}}, 2
 // CHECK-NEXT: sub nsw i32 %{{.+}}, 0
 // CHECK-NEXT: sdiv i32 %{{.+}}, 2
 // CHECK-NEXT: sext i32 %{{.+}} to i64
+// CHECK-IRBUILDER-NEXT: [[TMP:%.+]] = getelementptr inbounds [2 x i64], [2 x i64]* [[CNT:%.+]], i64 0, i64 0
+// CHECK-IRBUILDER-NEXT: store i64 %{{.+}}, i64* [[TMP:%.+]],
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [2 x i64], [2 x i64]* [[CNT]], i64 0, i64 1
 // CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP]],
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [2 x i64], [2 x i64]* [[CNT]], i64 0, i64 0
-// CHECK-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-NORMAL-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-IRBUILDER-NEXT: [[GTID18:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK-IRBUILDER-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID18]], i64* [[TMP]])
 // CHECK-NEXT: load i32, i32* [[J:%.+]],
 // CHECK-NEXT: sub nsw i32 %{{.+}}, 1
 // CHECK-NEXT: sub nsw i32 %{{.+}}, 0
 // CHECK-NEXT: sdiv i32 %{{.+}}, 1
 // CHECK-NEXT: sext i32 %{{.+}} to i64
-// CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [2 x i64], [2 x i64]* [[CNT:%.+]], i64 0, i64 0
-// CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP:%.+]],
+// CHECK-NORMAL-NEXT: [[TMP:%.+]] = getelementptr inbounds [2 x i64], [2 x i64]* [[CNT:%.+]], i64 0, i64 0
+// CHECK-NORMAL-NEXT: store i64 %{{.+}}, i64* [[TMP]],
 // CHECK-NEXT: [[I:%.+]] = load i32*, i32** [[I_REF]],
 // CHECK-NEXT: load i32, i32* [[I]],
 // CHECK-NEXT: sub nsw i32 %{{.+}}, 0
 // CHECK-NEXT: sdiv i32 %{{.+}}, 2
 // CHECK-NEXT: sext i32 %{{.+}} to i64
+// CHECK-IRBUILDER-NEXT: [[TMP:%.+]] = getelementptr inbounds [2 x i64], [2 x i64]* [[CNT:%.+]], i64 0, i64 0
+// CHECK-IRBUILDER-NEXT: store i64 %{{.+}}, i64* [[TMP]],
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [2 x i64], [2 x i64]* [[CNT]], i64 0, i64 1
 // CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP]],
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [2 x i64], [2 x i64]* [[CNT]], i64 0, i64 0
-// CHECK-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-NORMAL-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-IRBUILDER-NEXT: [[GTID27:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK-IRBUILDER-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID27]], i64* [[TMP]])
 #pragma omp ordered depend(sink : j, i - 2) depend(sink : j - 1, i)
         b[i][j] = bar(a[i][j], b[i - 1][j], b[i][j - 1]);
 // CHECK: invoke {{.+TestStruct.+bar}}
@@ -228,27 +248,31 @@ struct TestStruct {
 // CHECK-NEXT: sub nsw i32 %{{.+}}, 0
 // CHECK-NEXT: sdiv i32 %{{.+}}, 1
 // CHECK-NEXT: sext i32 %{{.+}} to i64
-// CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [2 x i64], [2 x i64]* [[CNT:%.+]], i64 0, i64 0
-// CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP:%.+]],
+// CHECK-NORMAL-NEXT: [[TMP:%.+]] = getelementptr inbounds [2 x i64], [2 x i64]* [[CNT:%.+]], i64 0, i64 0
+// CHECK-NORMAL-NEXT: store i64 %{{.+}}, i64* [[TMP]],
 // CHECK-NEXT: [[I:%.+]] = load i32*, i32** [[I_REF]],
 // CHECK-NEXT: load i32, i32* [[I]],
 // CHECK-NEXT: sub nsw i32 %{{.+}}, 0
 // CHECK-NEXT: sdiv i32 %{{.+}}, 2
 // CHECK-NEXT: sext i32 %{{.+}} to i64
+// CHECK-IRBUILDER-NEXT: [[TMP:%.+]] = getelementptr inbounds [2 x i64], [2 x i64]* [[CNT:%.+]], i64 0, i64 0
+// CHECK-IRBUILDER-NEXT: store i64 %{{.+}}, i64* [[TMP]],
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [2 x i64], [2 x i64]* [[CNT]], i64 0, i64 1
 // CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP]],
 // CHECK-NEXT: [[TMP:%.+]] = getelementptr inbounds [2 x i64], [2 x i64]* [[CNT]], i64 0, i64 0
-// CHECK-NEXT: call void @__kmpc_doacross_post([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-NORMAL-NEXT: call void @__kmpc_doacross_post([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-IRBUILDER-NEXT: [[GTID58:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK-IRBUILDER-NEXT: call void @__kmpc_doacross_post([[IDENT]], i32 [[GTID58]], i64* [[TMP]])
 #pragma omp ordered depend(source)
         baz(a[i][j], b[i][j]);
       }
   }
   // CHECK: landingpad
-  // CHECK: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
+  // CHECK-NORMAL: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
   // CHECK: br label %
 
   // CHECK: call void @__kmpc_for_static_fini(
-  // CHECK: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
+  // CHECK-NORMAL: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
   // CHECK: ret
 };
 

diff  --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index b982e5b7ed5fe..4cdde2a559bac 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -838,6 +838,35 @@ class OpenMPIRBuilder {
                                FinalizeCallbackTy FiniCB,
                                StringRef CriticalName, Value *HintInst);
 
+  /// Generator for '#omp ordered depend (source | sink)'
+  ///
+  /// \param Loc The insert and source location description.
+  /// \param AllocaIP The insertion point to be used for alloca instructions.
+  /// \param NumLoops The number of loops in depend clause.
+  /// \param StoreValues The value will be stored in vector address.
+  /// \param Name The name of alloca instruction.
+  /// \param IsDependSource If true, depend source; otherwise, depend sink.
+  ///
+  /// \return The insertion position *after* the ordered.
+  InsertPointTy createOrderedDepend(const LocationDescription &Loc,
+                                    InsertPointTy AllocaIP, unsigned NumLoops,
+                                    ArrayRef<llvm::Value *> StoreValues,
+                                    const Twine &Name, bool IsDependSource);
+
+  /// Generator for '#omp ordered [threads | simd]'
+  ///
+  /// \param Loc The insert and source location description.
+  /// \param BodyGenCB Callback that will generate the region code.
+  /// \param FiniCB Callback to finalize variable copies.
+  /// \param IsThreads If true, with threads clause or without clause;
+  /// otherwise, with simd clause;
+  ///
+  /// \returns The insertion position *after* the ordered.
+  InsertPointTy createOrderedThreadsSimd(const LocationDescription &Loc,
+                                         BodyGenCallbackTy BodyGenCB,
+                                         FinalizeCallbackTy FiniCB,
+                                         bool IsThreads);
+
   /// Generator for '#omp sections'
   ///
   /// \param Loc The insert and source location description.

diff  --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 29fe2a8f0b391..484fe924a6d53 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2145,6 +2145,74 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCritical(
                               /*Conditional*/ false, /*hasFinalize*/ true);
 }
 
+OpenMPIRBuilder::InsertPointTy
+OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
+                                     InsertPointTy AllocaIP, unsigned NumLoops,
+                                     ArrayRef<llvm::Value *> StoreValues,
+                                     const Twine &Name, bool IsDependSource) {
+  if (!updateToLocation(Loc))
+    return Loc.IP;
+
+  // Allocate space for vector and generate alloc instruction.
+  auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
+  Builder.restoreIP(AllocaIP);
+  AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
+  ArgsBase->setAlignment(Align(8));
+  Builder.restoreIP(Loc.IP);
+
+  // Store the index value with offset in depend vector.
+  for (unsigned I = 0; I < NumLoops; ++I) {
+    Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
+        ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
+    Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
+  }
+
+  Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
+      ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
+
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+  Value *Ident = getOrCreateIdent(SrcLocStr);
+  Value *ThreadId = getOrCreateThreadID(Ident);
+  Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
+
+  Function *RTLFn = nullptr;
+  if (IsDependSource)
+    RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
+  else
+    RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
+  Builder.CreateCall(RTLFn, Args);
+
+  return Builder.saveIP();
+}
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createOrderedThreadsSimd(
+    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
+    FinalizeCallbackTy FiniCB, bool IsThreads) {
+  if (!updateToLocation(Loc))
+    return Loc.IP;
+
+  Directive OMPD = Directive::OMPD_ordered;
+  Instruction *EntryCall = nullptr;
+  Instruction *ExitCall = nullptr;
+
+  if (IsThreads) {
+    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+    Value *Ident = getOrCreateIdent(SrcLocStr);
+    Value *ThreadId = getOrCreateThreadID(Ident);
+    Value *Args[] = {Ident, ThreadId};
+
+    Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
+    EntryCall = Builder.CreateCall(EntryRTLFn, Args);
+
+    Function *ExitRTLFn =
+        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
+    ExitCall = Builder.CreateCall(ExitRTLFn, Args);
+  }
+
+  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
+                              /*Conditional*/ false, /*hasFinalize*/ true);
+}
+
 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::EmitOMPInlinedRegion(
     Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
     BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,

diff  --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 9c9893c3b5bc9..07a062b6d414a 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -2120,6 +2120,320 @@ TEST_F(OpenMPIRBuilderTest, CriticalDirective) {
   EXPECT_EQ(CriticalEndCI->getArgOperand(2)->getType(), CriticalNamePtrTy);
 }
 
+TEST_F(OpenMPIRBuilderTest, OrderedDirectiveDependSource) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  F->setName("func");
+  IRBuilder<> Builder(BB);
+  LLVMContext &Ctx = M->getContext();
+
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+
+  InsertPointTy AllocaIP(&F->getEntryBlock(),
+                         F->getEntryBlock().getFirstInsertionPt());
+
+  unsigned NumLoops = 2;
+  SmallVector<Value *, 2> StoreValues;
+  Type *LCTy = Type::getInt64Ty(Ctx);
+  StoreValues.emplace_back(ConstantInt::get(LCTy, 1));
+  StoreValues.emplace_back(ConstantInt::get(LCTy, 2));
+
+  // Test for "#omp ordered depend(source)"
+  Builder.restoreIP(OMPBuilder.createOrderedDepend(Builder, AllocaIP, NumLoops,
+                                                   StoreValues, ".cnt.addr",
+                                                   /*IsDependSource=*/true));
+
+  Builder.CreateRetVoid();
+  OMPBuilder.finalize();
+  EXPECT_FALSE(verifyModule(*M, &errs()));
+
+  AllocaInst *AllocInst = dyn_cast<AllocaInst>(&BB->front());
+  ASSERT_NE(AllocInst, nullptr);
+  ArrayType *ArrType = dyn_cast<ArrayType>(AllocInst->getAllocatedType());
+  EXPECT_EQ(ArrType->getNumElements(), NumLoops);
+  EXPECT_TRUE(
+      AllocInst->getAllocatedType()->getArrayElementType()->isIntegerTy(64));
+
+  Instruction *IterInst = dyn_cast<Instruction>(AllocInst);
+  for (unsigned Iter = 0; Iter < NumLoops; Iter++) {
+    GetElementPtrInst *DependAddrGEPIter =
+        dyn_cast<GetElementPtrInst>(IterInst->getNextNode());
+    ASSERT_NE(DependAddrGEPIter, nullptr);
+    EXPECT_EQ(DependAddrGEPIter->getPointerOperand(), AllocInst);
+    EXPECT_EQ(DependAddrGEPIter->getNumIndices(), (unsigned)2);
+    auto *FirstIdx = dyn_cast<ConstantInt>(DependAddrGEPIter->getOperand(1));
+    auto *SecondIdx = dyn_cast<ConstantInt>(DependAddrGEPIter->getOperand(2));
+    ASSERT_NE(FirstIdx, nullptr);
+    ASSERT_NE(SecondIdx, nullptr);
+    EXPECT_EQ(FirstIdx->getValue(), 0);
+    EXPECT_EQ(SecondIdx->getValue(), Iter);
+    StoreInst *StoreValue =
+        dyn_cast<StoreInst>(DependAddrGEPIter->getNextNode());
+    ASSERT_NE(StoreValue, nullptr);
+    EXPECT_EQ(StoreValue->getValueOperand(), StoreValues[Iter]);
+    EXPECT_EQ(StoreValue->getPointerOperand(), DependAddrGEPIter);
+    IterInst = dyn_cast<Instruction>(StoreValue);
+  }
+
+  GetElementPtrInst *DependBaseAddrGEP =
+      dyn_cast<GetElementPtrInst>(IterInst->getNextNode());
+  ASSERT_NE(DependBaseAddrGEP, nullptr);
+  EXPECT_EQ(DependBaseAddrGEP->getPointerOperand(), AllocInst);
+  EXPECT_EQ(DependBaseAddrGEP->getNumIndices(), (unsigned)2);
+  auto *FirstIdx = dyn_cast<ConstantInt>(DependBaseAddrGEP->getOperand(1));
+  auto *SecondIdx = dyn_cast<ConstantInt>(DependBaseAddrGEP->getOperand(2));
+  ASSERT_NE(FirstIdx, nullptr);
+  ASSERT_NE(SecondIdx, nullptr);
+  EXPECT_EQ(FirstIdx->getValue(), 0);
+  EXPECT_EQ(SecondIdx->getValue(), 0);
+
+  CallInst *GTID = dyn_cast<CallInst>(DependBaseAddrGEP->getNextNode());
+  ASSERT_NE(GTID, nullptr);
+  EXPECT_EQ(GTID->getNumArgOperands(), 1U);
+  EXPECT_EQ(GTID->getCalledFunction()->getName(), "__kmpc_global_thread_num");
+  EXPECT_FALSE(GTID->getCalledFunction()->doesNotAccessMemory());
+  EXPECT_FALSE(GTID->getCalledFunction()->doesNotFreeMemory());
+
+  CallInst *Depend = dyn_cast<CallInst>(GTID->getNextNode());
+  ASSERT_NE(Depend, nullptr);
+  EXPECT_EQ(Depend->getNumArgOperands(), 3U);
+  EXPECT_EQ(Depend->getCalledFunction()->getName(), "__kmpc_doacross_post");
+  EXPECT_TRUE(isa<GlobalVariable>(Depend->getArgOperand(0)));
+  EXPECT_EQ(Depend->getArgOperand(1), GTID);
+  EXPECT_EQ(Depend->getArgOperand(2), DependBaseAddrGEP);
+}
+
+TEST_F(OpenMPIRBuilderTest, OrderedDirectiveDependSink) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  F->setName("func");
+  IRBuilder<> Builder(BB);
+  LLVMContext &Ctx = M->getContext();
+
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+
+  InsertPointTy AllocaIP(&F->getEntryBlock(),
+                         F->getEntryBlock().getFirstInsertionPt());
+
+  unsigned NumLoops = 2;
+  SmallVector<Value *, 2> StoreValues;
+  Type *LCTy = Type::getInt64Ty(Ctx);
+  StoreValues.emplace_back(ConstantInt::get(LCTy, 1));
+  StoreValues.emplace_back(ConstantInt::get(LCTy, 2));
+
+  // Test for "#omp ordered depend(sink: vec)"
+  Builder.restoreIP(OMPBuilder.createOrderedDepend(Builder, AllocaIP, NumLoops,
+                                                   StoreValues, ".cnt.addr",
+                                                   /*IsDependSource=*/false));
+
+  Builder.CreateRetVoid();
+  OMPBuilder.finalize();
+  EXPECT_FALSE(verifyModule(*M, &errs()));
+
+  AllocaInst *AllocInst = dyn_cast<AllocaInst>(&BB->front());
+  ASSERT_NE(AllocInst, nullptr);
+  ArrayType *ArrType = dyn_cast<ArrayType>(AllocInst->getAllocatedType());
+  EXPECT_EQ(ArrType->getNumElements(), NumLoops);
+  EXPECT_TRUE(
+      AllocInst->getAllocatedType()->getArrayElementType()->isIntegerTy(64));
+
+  Instruction *IterInst = dyn_cast<Instruction>(AllocInst);
+  for (unsigned Iter = 0; Iter < NumLoops; Iter++) {
+    GetElementPtrInst *DependAddrGEPIter =
+        dyn_cast<GetElementPtrInst>(IterInst->getNextNode());
+    ASSERT_NE(DependAddrGEPIter, nullptr);
+    EXPECT_EQ(DependAddrGEPIter->getPointerOperand(), AllocInst);
+    EXPECT_EQ(DependAddrGEPIter->getNumIndices(), (unsigned)2);
+    auto *FirstIdx = dyn_cast<ConstantInt>(DependAddrGEPIter->getOperand(1));
+    auto *SecondIdx = dyn_cast<ConstantInt>(DependAddrGEPIter->getOperand(2));
+    ASSERT_NE(FirstIdx, nullptr);
+    ASSERT_NE(SecondIdx, nullptr);
+    EXPECT_EQ(FirstIdx->getValue(), 0);
+    EXPECT_EQ(SecondIdx->getValue(), Iter);
+    StoreInst *StoreValue =
+        dyn_cast<StoreInst>(DependAddrGEPIter->getNextNode());
+    ASSERT_NE(StoreValue, nullptr);
+    EXPECT_EQ(StoreValue->getValueOperand(), StoreValues[Iter]);
+    EXPECT_EQ(StoreValue->getPointerOperand(), DependAddrGEPIter);
+    IterInst = dyn_cast<Instruction>(StoreValue);
+  }
+
+  GetElementPtrInst *DependBaseAddrGEP =
+      dyn_cast<GetElementPtrInst>(IterInst->getNextNode());
+  ASSERT_NE(DependBaseAddrGEP, nullptr);
+  EXPECT_EQ(DependBaseAddrGEP->getPointerOperand(), AllocInst);
+  EXPECT_EQ(DependBaseAddrGEP->getNumIndices(), (unsigned)2);
+  auto *FirstIdx = dyn_cast<ConstantInt>(DependBaseAddrGEP->getOperand(1));
+  auto *SecondIdx = dyn_cast<ConstantInt>(DependBaseAddrGEP->getOperand(2));
+  ASSERT_NE(FirstIdx, nullptr);
+  ASSERT_NE(SecondIdx, nullptr);
+  EXPECT_EQ(FirstIdx->getValue(), 0);
+  EXPECT_EQ(SecondIdx->getValue(), 0);
+
+  CallInst *GTID = dyn_cast<CallInst>(DependBaseAddrGEP->getNextNode());
+  ASSERT_NE(GTID, nullptr);
+  EXPECT_EQ(GTID->getNumArgOperands(), 1U);
+  EXPECT_EQ(GTID->getCalledFunction()->getName(), "__kmpc_global_thread_num");
+  EXPECT_FALSE(GTID->getCalledFunction()->doesNotAccessMemory());
+  EXPECT_FALSE(GTID->getCalledFunction()->doesNotFreeMemory());
+
+  CallInst *Depend = dyn_cast<CallInst>(GTID->getNextNode());
+  ASSERT_NE(Depend, nullptr);
+  EXPECT_EQ(Depend->getNumArgOperands(), 3U);
+  EXPECT_EQ(Depend->getCalledFunction()->getName(), "__kmpc_doacross_wait");
+  EXPECT_TRUE(isa<GlobalVariable>(Depend->getArgOperand(0)));
+  EXPECT_EQ(Depend->getArgOperand(1), GTID);
+  EXPECT_EQ(Depend->getArgOperand(2), DependBaseAddrGEP);
+}
+
+TEST_F(OpenMPIRBuilderTest, OrderedDirectiveThreads) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  F->setName("func");
+  IRBuilder<> Builder(BB);
+
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+
+  AllocaInst *PrivAI =
+      Builder.CreateAlloca(F->arg_begin()->getType(), nullptr, "priv.inst");
+
+  BasicBlock *EntryBB = nullptr;
+
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                       BasicBlock &FiniBB) {
+    EntryBB = FiniBB.getUniquePredecessor();
+
+    llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock();
+    llvm::Instruction *CodeGenIPInst = &*CodeGenIP.getPoint();
+    EXPECT_EQ(CodeGenIPBB->getTerminator(), CodeGenIPInst);
+    EXPECT_EQ(EntryBB, CodeGenIPBB);
+
+    Builder.restoreIP(CodeGenIP);
+    Builder.CreateStore(F->arg_begin(), PrivAI);
+    Value *PrivLoad =
+        Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI, "local.use");
+    Builder.CreateICmpNE(F->arg_begin(), PrivLoad);
+  };
+
+  auto FiniCB = [&](InsertPointTy IP) {
+    BasicBlock *IPBB = IP.getBlock();
+    EXPECT_NE(IPBB->end(), IP.getPoint());
+  };
+
+  // Test for "#omp ordered [threads]"
+  Builder.restoreIP(
+      OMPBuilder.createOrderedThreadsSimd(Builder, BodyGenCB, FiniCB, true));
+
+  Builder.CreateRetVoid();
+  OMPBuilder.finalize();
+  EXPECT_FALSE(verifyModule(*M, &errs()));
+
+  EXPECT_NE(EntryBB->getTerminator(), nullptr);
+
+  CallInst *OrderedEntryCI = nullptr;
+  for (auto &EI : *EntryBB) {
+    Instruction *Cur = &EI;
+    if (isa<CallInst>(Cur)) {
+      OrderedEntryCI = cast<CallInst>(Cur);
+      if (OrderedEntryCI->getCalledFunction()->getName() == "__kmpc_ordered")
+        break;
+      OrderedEntryCI = nullptr;
+    }
+  }
+  EXPECT_NE(OrderedEntryCI, nullptr);
+  EXPECT_EQ(OrderedEntryCI->getNumArgOperands(), 2U);
+  EXPECT_EQ(OrderedEntryCI->getCalledFunction()->getName(), "__kmpc_ordered");
+  EXPECT_TRUE(isa<GlobalVariable>(OrderedEntryCI->getArgOperand(0)));
+
+  CallInst *OrderedEndCI = nullptr;
+  for (auto &FI : *EntryBB) {
+    Instruction *Cur = &FI;
+    if (isa<CallInst>(Cur)) {
+      OrderedEndCI = cast<CallInst>(Cur);
+      if (OrderedEndCI->getCalledFunction()->getName() == "__kmpc_end_ordered")
+        break;
+      OrderedEndCI = nullptr;
+    }
+  }
+  EXPECT_NE(OrderedEndCI, nullptr);
+  EXPECT_EQ(OrderedEndCI->getNumArgOperands(), 2U);
+  EXPECT_TRUE(isa<GlobalVariable>(OrderedEndCI->getArgOperand(0)));
+  EXPECT_EQ(OrderedEndCI->getArgOperand(1), OrderedEntryCI->getArgOperand(1));
+}
+
+TEST_F(OpenMPIRBuilderTest, OrderedDirectiveSimd) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  F->setName("func");
+  IRBuilder<> Builder(BB);
+
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+
+  AllocaInst *PrivAI =
+      Builder.CreateAlloca(F->arg_begin()->getType(), nullptr, "priv.inst");
+
+  BasicBlock *EntryBB = nullptr;
+
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                       BasicBlock &FiniBB) {
+    EntryBB = FiniBB.getUniquePredecessor();
+
+    llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock();
+    llvm::Instruction *CodeGenIPInst = &*CodeGenIP.getPoint();
+    EXPECT_EQ(CodeGenIPBB->getTerminator(), CodeGenIPInst);
+    EXPECT_EQ(EntryBB, CodeGenIPBB);
+
+    Builder.restoreIP(CodeGenIP);
+    Builder.CreateStore(F->arg_begin(), PrivAI);
+    Value *PrivLoad =
+        Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI, "local.use");
+    Builder.CreateICmpNE(F->arg_begin(), PrivLoad);
+  };
+
+  auto FiniCB = [&](InsertPointTy IP) {
+    BasicBlock *IPBB = IP.getBlock();
+    EXPECT_NE(IPBB->end(), IP.getPoint());
+  };
+
+  // Test for "#omp ordered simd"
+  Builder.restoreIP(
+      OMPBuilder.createOrderedThreadsSimd(Builder, BodyGenCB, FiniCB, false));
+
+  Builder.CreateRetVoid();
+  OMPBuilder.finalize();
+  EXPECT_FALSE(verifyModule(*M, &errs()));
+
+  EXPECT_NE(EntryBB->getTerminator(), nullptr);
+
+  CallInst *OrderedEntryCI = nullptr;
+  for (auto &EI : *EntryBB) {
+    Instruction *Cur = &EI;
+    if (isa<CallInst>(Cur)) {
+      OrderedEntryCI = cast<CallInst>(Cur);
+      if (OrderedEntryCI->getCalledFunction()->getName() == "__kmpc_ordered")
+        break;
+      OrderedEntryCI = nullptr;
+    }
+  }
+  EXPECT_EQ(OrderedEntryCI, nullptr);
+
+  CallInst *OrderedEndCI = nullptr;
+  for (auto &FI : *EntryBB) {
+    Instruction *Cur = &FI;
+    if (isa<CallInst>(Cur)) {
+      OrderedEndCI = cast<CallInst>(Cur);
+      if (OrderedEndCI->getCalledFunction()->getName() == "__kmpc_end_ordered")
+        break;
+      OrderedEndCI = nullptr;
+    }
+  }
+  EXPECT_EQ(OrderedEndCI, nullptr);
+}
+
 TEST_F(OpenMPIRBuilderTest, CopyinBlocks) {
   OpenMPIRBuilder OMPBuilder(*M);
   OMPBuilder.initialize();


        


More information about the cfe-commits mailing list