[Mlir-commits] [mlir] a66f776 - [OpenMPIRBuilder] Implement static-chunked workshare-loop schedules.
Michael Kruse
llvmlistbot at llvm.org
Mon Feb 28 16:18:50 PST 2022
Author: Michael Kruse
Date: 2022-02-28T18:18:33-06:00
New Revision: a66f7769a3df711ff96f3832f5c71899ac671218
URL: https://github.com/llvm/llvm-project/commit/a66f7769a3df711ff96f3832f5c71899ac671218
DIFF: https://github.com/llvm/llvm-project/commit/a66f7769a3df711ff96f3832f5c71899ac671218.diff
LOG: [OpenMPIRBuilder] Implement static-chunked workshare-loop schedules.
Add applyStaticChunkedWorkshareLoop method implementing static schedule when chunk-size is specified. Unlike a static schedule without chunk-size (where chunk-size is chosen by the runtime such that each thread receives one chunk), we need two nested loops: one for looping over the iterations of a chunk, and a second for looping over all chunks assigned to the threads.
This patch includes the following related changes:
* Adapt applyWorkshareLoop to triage between the schedule types, now possible since all schedules have been implemented. The default schedule is assumed to be non-chunked static, as without OpenMPIRBuilder.
* Remove the chunk parameter from applyStaticWorkshareLoop, it is ignored by the runtime. Change the value for the value passed to the init function to 0, as without OpenMPIRBuilder.
* Refactor CanonicalLoopInfo::setTripCount and CanonicalLoopInfo::mapIndVar as used by both, applyStaticWorkshareLoop and applyStaticChunkedWorkshareLoop.
* Enable Clang to use the OpenMPIRBuilder in the presence of the schedule clause.
Differential Revision: https://reviews.llvm.org/D114413
Added:
clang/test/OpenMP/irbuilder_for_unsigned_auto.c
clang/test/OpenMP/irbuilder_for_unsigned_dynamic.c
clang/test/OpenMP/irbuilder_for_unsigned_dynamic_chunked.c
clang/test/OpenMP/irbuilder_for_unsigned_runtime.c
clang/test/OpenMP/irbuilder_for_unsigned_static_chunked.c
Modified:
clang/lib/CodeGen/CGStmtOpenMP.cpp
clang/test/OpenMP/cancel_codegen.cpp
clang/test/OpenMP/irbuilder_for_iterator.cpp
clang/test/OpenMP/irbuilder_for_rangefor.cpp
clang/test/OpenMP/irbuilder_for_unsigned.c
clang/test/OpenMP/irbuilder_for_unsigned_down.c
clang/test/OpenMP/irbuilder_nested_parallel_for.c
clang/test/OpenMP/irbuilder_unroll_partial_factor_for.c
clang/test/OpenMP/irbuilder_unroll_partial_heuristic_constant_for.c
clang/test/OpenMP/irbuilder_unroll_partial_heuristic_for_collapse.c
clang/test/OpenMP/irbuilder_unroll_partial_heuristic_runtime_for.c
clang/test/OpenMP/irbuilder_unroll_unroll_partial_factor.c
llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
mlir/test/Target/LLVMIR/openmp-llvm.mlir
Removed:
################################################################################
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 4b839b0ed266c..e98b0c7996708 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -3717,13 +3717,52 @@ static bool emitWorksharingDirective(CodeGenFunction &CGF,
static bool isSupportedByOpenMPIRBuilder(const OMPForDirective &S) {
if (S.hasCancel())
return false;
- for (OMPClause *C : S.clauses())
- if (!isa<OMPNowaitClause>(C))
- return false;
+ for (OMPClause *C : S.clauses()) {
+ if (isa<OMPNowaitClause>(C))
+ continue;
+
+ if (auto *SC = dyn_cast<OMPScheduleClause>(C)) {
+ if (SC->getFirstScheduleModifier() != OMPC_SCHEDULE_MODIFIER_unknown)
+ return false;
+ if (SC->getSecondScheduleModifier() != OMPC_SCHEDULE_MODIFIER_unknown)
+ return false;
+ switch (SC->getScheduleKind()) {
+ case OMPC_SCHEDULE_auto:
+ case OMPC_SCHEDULE_dynamic:
+ case OMPC_SCHEDULE_runtime:
+ case OMPC_SCHEDULE_guided:
+ case OMPC_SCHEDULE_static:
+ continue;
+ case OMPC_SCHEDULE_unknown:
+ return false;
+ }
+ }
+
+ return false;
+ }
return true;
}
+static llvm::omp::ScheduleKind
+convertClauseKindToSchedKind(OpenMPScheduleClauseKind ScheduleClauseKind) {
+ switch (ScheduleClauseKind) {
+ case OMPC_SCHEDULE_unknown:
+ return llvm::omp::OMP_SCHEDULE_Default;
+ case OMPC_SCHEDULE_auto:
+ return llvm::omp::OMP_SCHEDULE_Auto;
+ case OMPC_SCHEDULE_dynamic:
+ return llvm::omp::OMP_SCHEDULE_Dynamic;
+ case OMPC_SCHEDULE_guided:
+ return llvm::omp::OMP_SCHEDULE_Guided;
+ case OMPC_SCHEDULE_runtime:
+ return llvm::omp::OMP_SCHEDULE_Runtime;
+ case OMPC_SCHEDULE_static:
+ return llvm::omp::OMP_SCHEDULE_Static;
+ }
+ llvm_unreachable("Unhandled schedule kind");
+}
+
void CodeGenFunction::EmitOMPForDirective(const OMPForDirective &S) {
bool HasLastprivates = false;
bool UseOMPIRBuilder =
@@ -3732,18 +3771,29 @@ void CodeGenFunction::EmitOMPForDirective(const OMPForDirective &S) {
UseOMPIRBuilder](CodeGenFunction &CGF, PrePostActionTy &) {
// Use the OpenMPIRBuilder if enabled.
if (UseOMPIRBuilder) {
+ bool NeedsBarrier = !S.getSingleClause<OMPNowaitClause>();
+
+ llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default;
+ llvm::Value *ChunkSize = nullptr;
+ if (auto *SchedClause = S.getSingleClause<OMPScheduleClause>()) {
+ SchedKind =
+ convertClauseKindToSchedKind(SchedClause->getScheduleKind());
+ if (const Expr *ChunkSizeExpr = SchedClause->getChunkSize())
+ ChunkSize = EmitScalarExpr(ChunkSizeExpr);
+ }
+
// Emit the associated statement and get its loop representation.
const Stmt *Inner = S.getRawStmt();
llvm::CanonicalLoopInfo *CLI =
EmitOMPCollapsedCanonicalLoopNest(Inner, 1);
- bool NeedsBarrier = !S.getSingleClause<OMPNowaitClause>();
llvm::OpenMPIRBuilder &OMPBuilder =
CGM.getOpenMPRuntime().getOMPBuilder();
llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
AllocaInsertPt->getParent(), AllocaInsertPt->getIterator());
OMPBuilder.applyWorkshareLoop(Builder.getCurrentDebugLocation(), CLI,
- AllocaIP, NeedsBarrier);
+ AllocaIP, NeedsBarrier, SchedKind,
+ ChunkSize);
return;
}
diff --git a/clang/test/OpenMP/cancel_codegen.cpp b/clang/test/OpenMP/cancel_codegen.cpp
index 1cee2ff7e33e4..e68eb4ec600e8 100644
--- a/clang/test/OpenMP/cancel_codegen.cpp
+++ b/clang/test/OpenMP/cancel_codegen.cpp
@@ -1366,7 +1366,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK3-NEXT: store i32 0, i32* [[P_UPPERBOUND]], align 4
// CHECK3-NEXT: store i32 1, i32* [[P_STRIDE]], align 4
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK3-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 0)
// CHECK3-NEXT: [[TMP0:%.*]] = load i32, i32* [[P_LOWERBOUND]], align 4
// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[P_UPPERBOUND]], align 4
// CHECK3-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], [[TMP0]]
@@ -1402,7 +1402,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK3-NEXT: store i32 1, i32* [[P_UPPERBOUND29]], align 4
// CHECK3-NEXT: store i32 1, i32* [[P_STRIDE30]], align 4
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM31:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK3-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 1)
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 0)
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[P_LOWERBOUND28]], align 4
// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[P_UPPERBOUND29]], align 4
// CHECK3-NEXT: [[TMP9:%.*]] = sub i32 [[TMP8]], [[TMP7]]
@@ -2002,7 +2002,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK4-NEXT: store i32 0, i32* [[P_UPPERBOUND]], align 4
// CHECK4-NEXT: store i32 1, i32* [[P_STRIDE]], align 4
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK4-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 1)
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 0)
// CHECK4-NEXT: [[TMP0:%.*]] = load i32, i32* [[P_LOWERBOUND]], align 4
// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[P_UPPERBOUND]], align 4
// CHECK4-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], [[TMP0]]
@@ -2038,7 +2038,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK4-NEXT: store i32 1, i32* [[P_UPPERBOUND29]], align 4
// CHECK4-NEXT: store i32 1, i32* [[P_STRIDE30]], align 4
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM31:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK4-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 1)
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 0)
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[P_LOWERBOUND28]], align 4
// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[P_UPPERBOUND29]], align 4
// CHECK4-NEXT: [[TMP9:%.*]] = sub i32 [[TMP8]], [[TMP7]]
@@ -3878,7 +3878,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK9-NEXT: store i32 0, i32* [[P_UPPERBOUND]], align 4
// CHECK9-NEXT: store i32 1, i32* [[P_STRIDE]], align 4
// CHECK9-NEXT: [[OMP_GLOBAL_THREAD_NUM11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK9-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 0)
// CHECK9-NEXT: [[TMP0:%.*]] = load i32, i32* [[P_LOWERBOUND]], align 4
// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[P_UPPERBOUND]], align 4
// CHECK9-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], [[TMP0]]
@@ -3914,7 +3914,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK9-NEXT: store i32 1, i32* [[P_UPPERBOUND29]], align 4
// CHECK9-NEXT: store i32 1, i32* [[P_STRIDE30]], align 4
// CHECK9-NEXT: [[OMP_GLOBAL_THREAD_NUM31:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK9-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 1)
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 0)
// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[P_LOWERBOUND28]], align 4
// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[P_UPPERBOUND29]], align 4
// CHECK9-NEXT: [[TMP9:%.*]] = sub i32 [[TMP8]], [[TMP7]]
@@ -4514,7 +4514,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK10-NEXT: store i32 0, i32* [[P_UPPERBOUND]], align 4
// CHECK10-NEXT: store i32 1, i32* [[P_STRIDE]], align 4
// CHECK10-NEXT: [[OMP_GLOBAL_THREAD_NUM11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK10-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 1)
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 0)
// CHECK10-NEXT: [[TMP0:%.*]] = load i32, i32* [[P_LOWERBOUND]], align 4
// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[P_UPPERBOUND]], align 4
// CHECK10-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], [[TMP0]]
@@ -4550,7 +4550,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK10-NEXT: store i32 1, i32* [[P_UPPERBOUND29]], align 4
// CHECK10-NEXT: store i32 1, i32* [[P_STRIDE30]], align 4
// CHECK10-NEXT: [[OMP_GLOBAL_THREAD_NUM31:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK10-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 1)
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 0)
// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[P_LOWERBOUND28]], align 4
// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[P_UPPERBOUND29]], align 4
// CHECK10-NEXT: [[TMP9:%.*]] = sub i32 [[TMP8]], [[TMP7]]
diff --git a/clang/test/OpenMP/irbuilder_for_iterator.cpp b/clang/test/OpenMP/irbuilder_for_iterator.cpp
index 2e204c1da9da9..d0aec8616f341 100644
--- a/clang/test/OpenMP/irbuilder_for_iterator.cpp
+++ b/clang/test/OpenMP/irbuilder_for_iterator.cpp
@@ -59,7 +59,7 @@ extern "C" void workshareloop_iterator(float *a, float *b, float *c) {
// CHECK-NEXT: store i64 [[TMP2]], i64* [[P_UPPERBOUND]], align 8
// CHECK-NEXT: store i64 1, i64* [[P_STRIDE]], align 8
// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
-// CHECK-NEXT: call void @__kmpc_for_static_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i64* [[P_LOWERBOUND]], i64* [[P_UPPERBOUND]], i64* [[P_STRIDE]], i64 1, i64 1)
+// CHECK-NEXT: call void @__kmpc_for_static_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i64* [[P_LOWERBOUND]], i64* [[P_UPPERBOUND]], i64* [[P_STRIDE]], i64 1, i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[P_LOWERBOUND]], align 8
// CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[P_UPPERBOUND]], align 8
// CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], [[TMP3]]
diff --git a/clang/test/OpenMP/irbuilder_for_rangefor.cpp b/clang/test/OpenMP/irbuilder_for_rangefor.cpp
index 522cf0ab6d4e2..d2fe33fcc67cc 100644
--- a/clang/test/OpenMP/irbuilder_for_rangefor.cpp
+++ b/clang/test/OpenMP/irbuilder_for_rangefor.cpp
@@ -77,7 +77,7 @@ extern "C" void workshareloop_rangefor(float *a, float *b, float *c) {
// CHECK-NEXT: store i64 [[TMP5]], i64* [[P_UPPERBOUND]], align 8
// CHECK-NEXT: store i64 1, i64* [[P_STRIDE]], align 8
// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
-// CHECK-NEXT: call void @__kmpc_for_static_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i64* [[P_LOWERBOUND]], i64* [[P_UPPERBOUND]], i64* [[P_STRIDE]], i64 1, i64 1)
+// CHECK-NEXT: call void @__kmpc_for_static_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i64* [[P_LOWERBOUND]], i64* [[P_UPPERBOUND]], i64* [[P_STRIDE]], i64 1, i64 0)
// CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[P_LOWERBOUND]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[P_UPPERBOUND]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], [[TMP6]]
diff --git a/clang/test/OpenMP/irbuilder_for_unsigned.c b/clang/test/OpenMP/irbuilder_for_unsigned.c
index 928d99b9bfc72..c7cf86fe7a26c 100644
--- a/clang/test/OpenMP/irbuilder_for_unsigned.c
+++ b/clang/test/OpenMP/irbuilder_for_unsigned.c
@@ -47,7 +47,7 @@ extern "C" void workshareloop_unsigned(float *a, float *b, float *c, float *d) {
// CHECK-NEXT: store i32 [[TMP3]], i32* [[P_UPPERBOUND]], align 4
// CHECK-NEXT: store i32 1, i32* [[P_STRIDE]], align 4
// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
-// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 0)
// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[P_LOWERBOUND]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[P_UPPERBOUND]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]]
diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_auto.c b/clang/test/OpenMP/irbuilder_for_unsigned_auto.c
new file mode 100644
index 0000000000000..837e9190a16fd
--- /dev/null
+++ b/clang/test/OpenMP/irbuilder_for_unsigned_auto.c
@@ -0,0 +1,173 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+// CHECK-LABEL: define {{.*}}@workshareloop_unsigned_auto(
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: %[[A_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[B_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[C_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[D_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[I:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[AGG_CAPTURED:.+]] = alloca %struct.anon, align 8
+// CHECK-NEXT: %[[AGG_CAPTURED1:.+]] = alloca %struct.anon.0, align 4
+// CHECK-NEXT: %[[DOTCOUNT_ADDR:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_LASTITER:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_LOWERBOUND:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_UPPERBOUND:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_STRIDE:.+]] = alloca i32, align 4
+// CHECK-NEXT: store float* %[[A:.+]], float** %[[A_ADDR]], align 8
+// CHECK-NEXT: store float* %[[B:.+]], float** %[[B_ADDR]], align 8
+// CHECK-NEXT: store float* %[[C:.+]], float** %[[C_ADDR]], align 8
+// CHECK-NEXT: store float* %[[D:.+]], float** %[[D_ADDR]], align 8
+// CHECK-NEXT: store i32 33, i32* %[[I]], align 4
+// CHECK-NEXT: %[[TMP0:.+]] = getelementptr inbounds %struct.anon, %struct.anon* %[[AGG_CAPTURED]], i32 0, i32 0
+// CHECK-NEXT: store i32* %[[I]], i32** %[[TMP0]], align 8
+// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* %[[AGG_CAPTURED1]], i32 0, i32 0
+// CHECK-NEXT: %[[TMP2:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: store i32 %[[TMP2]], i32* %[[TMP1]], align 4
+// CHECK-NEXT: call void @__captured_stmt(i32* %[[DOTCOUNT_ADDR]], %struct.anon* %[[AGG_CAPTURED]])
+// CHECK-NEXT: %[[DOTCOUNT:.+]] = load i32, i32* %[[DOTCOUNT_ADDR]], align 4
+// CHECK-NEXT: br label %[[OMP_LOOP_PREHEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_PREHEADER]]:
+// CHECK-NEXT: store i32 1, i32* %[[P_LOWERBOUND]], align 4
+// CHECK-NEXT: store i32 %[[DOTCOUNT]], i32* %[[P_UPPERBOUND]], align 4
+// CHECK-NEXT: store i32 1, i32* %[[P_STRIDE]], align 4
+// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+// CHECK-NEXT: call void @__kmpc_dispatch_init_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 38, i32 1, i32 %[[DOTCOUNT]], i32 1, i32 1)
+// CHECK-NEXT: br label %[[OMP_LOOP_PREHEADER_OUTER_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_HEADER:.*]]:
+// CHECK-NEXT: %[[OMP_LOOP_IV:.+]] = phi i32 [ %[[LB:.+]], %[[OMP_LOOP_PREHEADER_OUTER_COND]] ], [ %[[OMP_LOOP_NEXT:.+]], %[[OMP_LOOP_INC:.+]] ]
+// CHECK-NEXT: br label %[[OMP_LOOP_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_COND]]:
+// CHECK-NEXT: %[[UB:.+]] = load i32, i32* %[[P_UPPERBOUND]], align 4
+// CHECK-NEXT: %[[OMP_LOOP_CMP:.+]] = icmp ult i32 %[[OMP_LOOP_IV]], %[[UB]]
+// CHECK-NEXT: br i1 %[[OMP_LOOP_CMP]], label %[[OMP_LOOP_BODY:.+]], label %[[OMP_LOOP_PREHEADER_OUTER_COND]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_BODY]]:
+// CHECK-NEXT: call void @__captured_stmt.1(i32* %[[I]], i32 %[[OMP_LOOP_IV]], %struct.anon.0* %[[AGG_CAPTURED1]])
+// CHECK-NEXT: %[[TMP3:.+]] = load float*, float** %[[B_ADDR]], align 8
+// CHECK-NEXT: %[[TMP4:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM:.+]] = zext i32 %[[TMP4]] to i64
+// CHECK-NEXT: %[[ARRAYIDX:.+]] = getelementptr inbounds float, float* %[[TMP3]], i64 %[[IDXPROM]]
+// CHECK-NEXT: %[[TMP5:.+]] = load float, float* %[[ARRAYIDX]], align 4
+// CHECK-NEXT: %[[TMP6:.+]] = load float*, float** %[[C_ADDR]], align 8
+// CHECK-NEXT: %[[TMP7:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM2:.+]] = zext i32 %[[TMP7]] to i64
+// CHECK-NEXT: %[[ARRAYIDX3:.+]] = getelementptr inbounds float, float* %[[TMP6]], i64 %[[IDXPROM2]]
+// CHECK-NEXT: %[[TMP8:.+]] = load float, float* %[[ARRAYIDX3]], align 4
+// CHECK-NEXT: %[[MUL:.+]] = fmul float %[[TMP5]], %[[TMP8]]
+// CHECK-NEXT: %[[TMP9:.+]] = load float*, float** %[[D_ADDR]], align 8
+// CHECK-NEXT: %[[TMP10:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM4:.+]] = zext i32 %[[TMP10]] to i64
+// CHECK-NEXT: %[[ARRAYIDX5:.+]] = getelementptr inbounds float, float* %[[TMP9]], i64 %[[IDXPROM4]]
+// CHECK-NEXT: %[[TMP11:.+]] = load float, float* %[[ARRAYIDX5]], align 4
+// CHECK-NEXT: %[[MUL6:.+]] = fmul float %[[MUL]], %[[TMP11]]
+// CHECK-NEXT: %[[TMP12:.+]] = load float*, float** %[[A_ADDR]], align 8
+// CHECK-NEXT: %[[TMP13:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM7:.+]] = zext i32 %[[TMP13]] to i64
+// CHECK-NEXT: %[[ARRAYIDX8:.+]] = getelementptr inbounds float, float* %[[TMP12]], i64 %[[IDXPROM7]]
+// CHECK-NEXT: store float %[[MUL6]], float* %[[ARRAYIDX8]], align 4
+// CHECK-NEXT: br label %[[OMP_LOOP_INC]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_INC]]:
+// CHECK-NEXT: %[[OMP_LOOP_NEXT]] = add nuw i32 %[[OMP_LOOP_IV]], 1
+// CHECK-NEXT: br label %[[OMP_LOOP_HEADER]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_EXIT:.*]]:
+// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM9:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+// CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* @2, i32 %[[OMP_GLOBAL_THREAD_NUM9]])
+// CHECK-NEXT: br label %[[OMP_LOOP_AFTER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_AFTER]]:
+// CHECK-NEXT: ret void
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_PREHEADER_OUTER_COND]]:
+// CHECK-NEXT: %[[TMP14:.+]] = call i32 @__kmpc_dispatch_next_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32* %[[P_LASTITER]], i32* %[[P_LOWERBOUND]], i32* %[[P_UPPERBOUND]], i32* %[[P_STRIDE]])
+// CHECK-NEXT: %[[TMP15:.+]] = icmp ne i32 %[[TMP14]], 0
+// CHECK-NEXT: %[[TMP16:.+]] = load i32, i32* %[[P_LOWERBOUND]], align 4
+// CHECK-NEXT: %[[LB]] = sub i32 %[[TMP16]], 1
+// CHECK-NEXT: br i1 %[[TMP15]], label %[[OMP_LOOP_HEADER]], label %[[OMP_LOOP_EXIT]]
+// CHECK-NEXT: }
+
+extern "C" void workshareloop_unsigned_auto(float *a, float *b, float *c, float *d) {
+#pragma omp for schedule(auto)
+ for (unsigned i = 33; i < 32000000; i += 7) {
+ a[i] = b[i] * c[i] * d[i];
+ }
+}
+
+#endif // HEADER
+
+// CHECK-LABEL: define {{.*}}@__captured_stmt(
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: %[[DISTANCE_ADDR:.+]] = alloca i32*, align 8
+// CHECK-NEXT: %[[__CONTEXT_ADDR:.+]] = alloca %struct.anon*, align 8
+// CHECK-NEXT: %[[DOTSTART:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[DOTSTOP:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[DOTSTEP:.+]] = alloca i32, align 4
+// CHECK-NEXT: store i32* %[[DISTANCE:.+]], i32** %[[DISTANCE_ADDR]], align 8
+// CHECK-NEXT: store %struct.anon* %[[__CONTEXT:.+]], %struct.anon** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP0:.+]] = load %struct.anon*, %struct.anon** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon, %struct.anon* %[[TMP0]], i32 0, i32 0
+// CHECK-NEXT: %[[TMP2:.+]] = load i32*, i32** %[[TMP1]], align 8
+// CHECK-NEXT: %[[TMP3:.+]] = load i32, i32* %[[TMP2]], align 4
+// CHECK-NEXT: store i32 %[[TMP3]], i32* %[[DOTSTART]], align 4
+// CHECK-NEXT: store i32 32000000, i32* %[[DOTSTOP]], align 4
+// CHECK-NEXT: store i32 7, i32* %[[DOTSTEP]], align 4
+// CHECK-NEXT: %[[TMP4:.+]] = load i32, i32* %[[DOTSTART]], align 4
+// CHECK-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTSTOP]], align 4
+// CHECK-NEXT: %[[CMP:.+]] = icmp ult i32 %[[TMP4]], %[[TMP5]]
+// CHECK-NEXT: br i1 %[[CMP]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_TRUE]]:
+// CHECK-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTSTOP]], align 4
+// CHECK-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTSTART]], align 4
+// CHECK-NEXT: %[[SUB:.+]] = sub i32 %[[TMP6]], %[[TMP7]]
+// CHECK-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTSTEP]], align 4
+// CHECK-NEXT: %[[SUB1:.+]] = sub i32 %[[TMP8]], 1
+// CHECK-NEXT: %[[ADD:.+]] = add i32 %[[SUB]], %[[SUB1]]
+// CHECK-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTSTEP]], align 4
+// CHECK-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP9]]
+// CHECK-NEXT: br label %[[COND_END:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_FALSE]]:
+// CHECK-NEXT: br label %[[COND_END]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_END]]:
+// CHECK-NEXT: %[[COND:.+]] = phi i32 [ %[[DIV]], %[[COND_TRUE]] ], [ 0, %[[COND_FALSE]] ]
+// CHECK-NEXT: %[[TMP10:.+]] = load i32*, i32** %[[DISTANCE_ADDR]], align 8
+// CHECK-NEXT: store i32 %[[COND]], i32* %[[TMP10]], align 4
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+
+// CHECK-LABEL: define {{.*}}@__captured_stmt.1(
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: %[[LOOPVAR_ADDR:.+]] = alloca i32*, align 8
+// CHECK-NEXT: %[[LOGICAL_ADDR:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[__CONTEXT_ADDR:.+]] = alloca %struct.anon.0*, align 8
+// CHECK-NEXT: store i32* %[[LOOPVAR:.+]], i32** %[[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT: store i32 %[[LOGICAL:.+]], i32* %[[LOGICAL_ADDR]], align 4
+// CHECK-NEXT: store %struct.anon.0* %[[__CONTEXT:.+]], %struct.anon.0** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP0:.+]] = load %struct.anon.0*, %struct.anon.0** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* %[[TMP0]], i32 0, i32 0
+// CHECK-NEXT: %[[TMP2:.+]] = load i32, i32* %[[TMP1]], align 4
+// CHECK-NEXT: %[[TMP3:.+]] = load i32, i32* %[[LOGICAL_ADDR]], align 4
+// CHECK-NEXT: %[[MUL:.+]] = mul i32 7, %[[TMP3]]
+// CHECK-NEXT: %[[ADD:.+]] = add i32 %[[TMP2]], %[[MUL]]
+// CHECK-NEXT: %[[TMP4:.+]] = load i32*, i32** %[[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT: store i32 %[[ADD]], i32* %[[TMP4]], align 4
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+
+// CHECK: ![[META0:[0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: ![[META1:[0-9]+]] = !{i32 7, !"openmp", i32 45}
+// CHECK: ![[META2:[0-9]+]] =
diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_down.c b/clang/test/OpenMP/irbuilder_for_unsigned_down.c
index 7ef48b9ec900d..4914be0a6171f 100644
--- a/clang/test/OpenMP/irbuilder_for_unsigned_down.c
+++ b/clang/test/OpenMP/irbuilder_for_unsigned_down.c
@@ -33,7 +33,7 @@
// CHECK-NEXT: store i32 %[[TMP3]], i32* %[[P_UPPERBOUND]], align 4
// CHECK-NEXT: store i32 1, i32* %[[P_STRIDE]], align 4
// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
-// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* %[[P_LASTITER]], i32* %[[P_LOWERBOUND]], i32* %[[P_UPPERBOUND]], i32* %[[P_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* %[[P_LASTITER]], i32* %[[P_LOWERBOUND]], i32* %[[P_UPPERBOUND]], i32* %[[P_STRIDE]], i32 1, i32 0)
// CHECK-NEXT: %[[TMP4:.+]] = load i32, i32* %[[P_LOWERBOUND]], align 4
// CHECK-NEXT: %[[TMP5:.+]] = load i32, i32* %[[P_UPPERBOUND]], align 4
// CHECK-NEXT: %[[TMP6:.+]] = sub i32 %[[TMP5]], %[[TMP4]]
diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_dynamic.c b/clang/test/OpenMP/irbuilder_for_unsigned_dynamic.c
new file mode 100644
index 0000000000000..653e952dad8e9
--- /dev/null
+++ b/clang/test/OpenMP/irbuilder_for_unsigned_dynamic.c
@@ -0,0 +1,173 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+// CHECK-LABEL: define {{.*}}@workshareloop_unsigned_dynamic(
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: %[[A_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[B_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[C_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[D_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[I:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[AGG_CAPTURED:.+]] = alloca %struct.anon, align 8
+// CHECK-NEXT: %[[AGG_CAPTURED1:.+]] = alloca %struct.anon.0, align 4
+// CHECK-NEXT: %[[DOTCOUNT_ADDR:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_LASTITER:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_LOWERBOUND:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_UPPERBOUND:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_STRIDE:.+]] = alloca i32, align 4
+// CHECK-NEXT: store float* %[[A:.+]], float** %[[A_ADDR]], align 8
+// CHECK-NEXT: store float* %[[B:.+]], float** %[[B_ADDR]], align 8
+// CHECK-NEXT: store float* %[[C:.+]], float** %[[C_ADDR]], align 8
+// CHECK-NEXT: store float* %[[D:.+]], float** %[[D_ADDR]], align 8
+// CHECK-NEXT: store i32 33, i32* %[[I]], align 4
+// CHECK-NEXT: %[[TMP0:.+]] = getelementptr inbounds %struct.anon, %struct.anon* %[[AGG_CAPTURED]], i32 0, i32 0
+// CHECK-NEXT: store i32* %[[I]], i32** %[[TMP0]], align 8
+// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* %[[AGG_CAPTURED1]], i32 0, i32 0
+// CHECK-NEXT: %[[TMP2:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: store i32 %[[TMP2]], i32* %[[TMP1]], align 4
+// CHECK-NEXT: call void @__captured_stmt(i32* %[[DOTCOUNT_ADDR]], %struct.anon* %[[AGG_CAPTURED]])
+// CHECK-NEXT: %[[DOTCOUNT:.+]] = load i32, i32* %[[DOTCOUNT_ADDR]], align 4
+// CHECK-NEXT: br label %[[OMP_LOOP_PREHEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_PREHEADER]]:
+// CHECK-NEXT: store i32 1, i32* %[[P_LOWERBOUND]], align 4
+// CHECK-NEXT: store i32 %[[DOTCOUNT]], i32* %[[P_UPPERBOUND]], align 4
+// CHECK-NEXT: store i32 1, i32* %[[P_STRIDE]], align 4
+// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+// CHECK-NEXT: call void @__kmpc_dispatch_init_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 35, i32 1, i32 %[[DOTCOUNT]], i32 1, i32 1)
+// CHECK-NEXT: br label %[[OMP_LOOP_PREHEADER_OUTER_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_HEADER:.*]]:
+// CHECK-NEXT: %[[OMP_LOOP_IV:.+]] = phi i32 [ %[[LB:.+]], %[[OMP_LOOP_PREHEADER_OUTER_COND]] ], [ %[[OMP_LOOP_NEXT:.+]], %[[OMP_LOOP_INC:.+]] ]
+// CHECK-NEXT: br label %[[OMP_LOOP_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_COND]]:
+// CHECK-NEXT: %[[UB:.+]] = load i32, i32* %[[P_UPPERBOUND]], align 4
+// CHECK-NEXT: %[[OMP_LOOP_CMP:.+]] = icmp ult i32 %[[OMP_LOOP_IV]], %[[UB]]
+// CHECK-NEXT: br i1 %[[OMP_LOOP_CMP]], label %[[OMP_LOOP_BODY:.+]], label %[[OMP_LOOP_PREHEADER_OUTER_COND]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_BODY]]:
+// CHECK-NEXT: call void @__captured_stmt.1(i32* %[[I]], i32 %[[OMP_LOOP_IV]], %struct.anon.0* %[[AGG_CAPTURED1]])
+// CHECK-NEXT: %[[TMP3:.+]] = load float*, float** %[[B_ADDR]], align 8
+// CHECK-NEXT: %[[TMP4:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM:.+]] = zext i32 %[[TMP4]] to i64
+// CHECK-NEXT: %[[ARRAYIDX:.+]] = getelementptr inbounds float, float* %[[TMP3]], i64 %[[IDXPROM]]
+// CHECK-NEXT: %[[TMP5:.+]] = load float, float* %[[ARRAYIDX]], align 4
+// CHECK-NEXT: %[[TMP6:.+]] = load float*, float** %[[C_ADDR]], align 8
+// CHECK-NEXT: %[[TMP7:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM2:.+]] = zext i32 %[[TMP7]] to i64
+// CHECK-NEXT: %[[ARRAYIDX3:.+]] = getelementptr inbounds float, float* %[[TMP6]], i64 %[[IDXPROM2]]
+// CHECK-NEXT: %[[TMP8:.+]] = load float, float* %[[ARRAYIDX3]], align 4
+// CHECK-NEXT: %[[MUL:.+]] = fmul float %[[TMP5]], %[[TMP8]]
+// CHECK-NEXT: %[[TMP9:.+]] = load float*, float** %[[D_ADDR]], align 8
+// CHECK-NEXT: %[[TMP10:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM4:.+]] = zext i32 %[[TMP10]] to i64
+// CHECK-NEXT: %[[ARRAYIDX5:.+]] = getelementptr inbounds float, float* %[[TMP9]], i64 %[[IDXPROM4]]
+// CHECK-NEXT: %[[TMP11:.+]] = load float, float* %[[ARRAYIDX5]], align 4
+// CHECK-NEXT: %[[MUL6:.+]] = fmul float %[[MUL]], %[[TMP11]]
+// CHECK-NEXT: %[[TMP12:.+]] = load float*, float** %[[A_ADDR]], align 8
+// CHECK-NEXT: %[[TMP13:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM7:.+]] = zext i32 %[[TMP13]] to i64
+// CHECK-NEXT: %[[ARRAYIDX8:.+]] = getelementptr inbounds float, float* %[[TMP12]], i64 %[[IDXPROM7]]
+// CHECK-NEXT: store float %[[MUL6]], float* %[[ARRAYIDX8]], align 4
+// CHECK-NEXT: br label %[[OMP_LOOP_INC]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_INC]]:
+// CHECK-NEXT: %[[OMP_LOOP_NEXT]] = add nuw i32 %[[OMP_LOOP_IV]], 1
+// CHECK-NEXT: br label %[[OMP_LOOP_HEADER]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_EXIT:.*]]:
+// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM9:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+// CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* @2, i32 %[[OMP_GLOBAL_THREAD_NUM9]])
+// CHECK-NEXT: br label %[[OMP_LOOP_AFTER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_AFTER]]:
+// CHECK-NEXT: ret void
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_PREHEADER_OUTER_COND]]:
+// CHECK-NEXT: %[[TMP14:.+]] = call i32 @__kmpc_dispatch_next_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32* %[[P_LASTITER]], i32* %[[P_LOWERBOUND]], i32* %[[P_UPPERBOUND]], i32* %[[P_STRIDE]])
+// CHECK-NEXT: %[[TMP15:.+]] = icmp ne i32 %[[TMP14]], 0
+// CHECK-NEXT: %[[TMP16:.+]] = load i32, i32* %[[P_LOWERBOUND]], align 4
+// CHECK-NEXT: %[[LB]] = sub i32 %[[TMP16]], 1
+// CHECK-NEXT: br i1 %[[TMP15]], label %[[OMP_LOOP_HEADER]], label %[[OMP_LOOP_EXIT]]
+// CHECK-NEXT: }
+
+extern "C" void workshareloop_unsigned_dynamic(float *a, float *b, float *c, float *d) {
+#pragma omp for schedule(dynamic)
+ for (unsigned i = 33; i < 32000000; i += 7) {
+ a[i] = b[i] * c[i] * d[i];
+ }
+}
+
+#endif // HEADER
+
+// CHECK-LABEL: define {{.*}}@__captured_stmt(
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: %[[DISTANCE_ADDR:.+]] = alloca i32*, align 8
+// CHECK-NEXT: %[[__CONTEXT_ADDR:.+]] = alloca %struct.anon*, align 8
+// CHECK-NEXT: %[[DOTSTART:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[DOTSTOP:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[DOTSTEP:.+]] = alloca i32, align 4
+// CHECK-NEXT: store i32* %[[DISTANCE:.+]], i32** %[[DISTANCE_ADDR]], align 8
+// CHECK-NEXT: store %struct.anon* %[[__CONTEXT:.+]], %struct.anon** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP0:.+]] = load %struct.anon*, %struct.anon** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon, %struct.anon* %[[TMP0]], i32 0, i32 0
+// CHECK-NEXT: %[[TMP2:.+]] = load i32*, i32** %[[TMP1]], align 8
+// CHECK-NEXT: %[[TMP3:.+]] = load i32, i32* %[[TMP2]], align 4
+// CHECK-NEXT: store i32 %[[TMP3]], i32* %[[DOTSTART]], align 4
+// CHECK-NEXT: store i32 32000000, i32* %[[DOTSTOP]], align 4
+// CHECK-NEXT: store i32 7, i32* %[[DOTSTEP]], align 4
+// CHECK-NEXT: %[[TMP4:.+]] = load i32, i32* %[[DOTSTART]], align 4
+// CHECK-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTSTOP]], align 4
+// CHECK-NEXT: %[[CMP:.+]] = icmp ult i32 %[[TMP4]], %[[TMP5]]
+// CHECK-NEXT: br i1 %[[CMP]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_TRUE]]:
+// CHECK-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTSTOP]], align 4
+// CHECK-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTSTART]], align 4
+// CHECK-NEXT: %[[SUB:.+]] = sub i32 %[[TMP6]], %[[TMP7]]
+// CHECK-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTSTEP]], align 4
+// CHECK-NEXT: %[[SUB1:.+]] = sub i32 %[[TMP8]], 1
+// CHECK-NEXT: %[[ADD:.+]] = add i32 %[[SUB]], %[[SUB1]]
+// CHECK-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTSTEP]], align 4
+// CHECK-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP9]]
+// CHECK-NEXT: br label %[[COND_END:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_FALSE]]:
+// CHECK-NEXT: br label %[[COND_END]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_END]]:
+// CHECK-NEXT: %[[COND:.+]] = phi i32 [ %[[DIV]], %[[COND_TRUE]] ], [ 0, %[[COND_FALSE]] ]
+// CHECK-NEXT: %[[TMP10:.+]] = load i32*, i32** %[[DISTANCE_ADDR]], align 8
+// CHECK-NEXT: store i32 %[[COND]], i32* %[[TMP10]], align 4
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+
+// CHECK-LABEL: define {{.*}}@__captured_stmt.1(
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: %[[LOOPVAR_ADDR:.+]] = alloca i32*, align 8
+// CHECK-NEXT: %[[LOGICAL_ADDR:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[__CONTEXT_ADDR:.+]] = alloca %struct.anon.0*, align 8
+// CHECK-NEXT: store i32* %[[LOOPVAR:.+]], i32** %[[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT: store i32 %[[LOGICAL:.+]], i32* %[[LOGICAL_ADDR]], align 4
+// CHECK-NEXT: store %struct.anon.0* %[[__CONTEXT:.+]], %struct.anon.0** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP0:.+]] = load %struct.anon.0*, %struct.anon.0** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* %[[TMP0]], i32 0, i32 0
+// CHECK-NEXT: %[[TMP2:.+]] = load i32, i32* %[[TMP1]], align 4
+// CHECK-NEXT: %[[TMP3:.+]] = load i32, i32* %[[LOGICAL_ADDR]], align 4
+// CHECK-NEXT: %[[MUL:.+]] = mul i32 7, %[[TMP3]]
+// CHECK-NEXT: %[[ADD:.+]] = add i32 %[[TMP2]], %[[MUL]]
+// CHECK-NEXT: %[[TMP4:.+]] = load i32*, i32** %[[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT: store i32 %[[ADD]], i32* %[[TMP4]], align 4
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+
+// CHECK: ![[META0:[0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: ![[META1:[0-9]+]] = !{i32 7, !"openmp", i32 45}
+// CHECK: ![[META2:[0-9]+]] =
diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_dynamic_chunked.c b/clang/test/OpenMP/irbuilder_for_unsigned_dynamic_chunked.c
new file mode 100644
index 0000000000000..2e23b538b4b2f
--- /dev/null
+++ b/clang/test/OpenMP/irbuilder_for_unsigned_dynamic_chunked.c
@@ -0,0 +1,173 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+// CHECK-LABEL: define {{.*}}@workshareloop_unsigned_dynamic_chunked(
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: %[[A_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[B_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[C_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[D_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[I:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[AGG_CAPTURED:.+]] = alloca %struct.anon, align 8
+// CHECK-NEXT: %[[AGG_CAPTURED1:.+]] = alloca %struct.anon.0, align 4
+// CHECK-NEXT: %[[DOTCOUNT_ADDR:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_LASTITER:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_LOWERBOUND:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_UPPERBOUND:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_STRIDE:.+]] = alloca i32, align 4
+// CHECK-NEXT: store float* %[[A:.+]], float** %[[A_ADDR]], align 8
+// CHECK-NEXT: store float* %[[B:.+]], float** %[[B_ADDR]], align 8
+// CHECK-NEXT: store float* %[[C:.+]], float** %[[C_ADDR]], align 8
+// CHECK-NEXT: store float* %[[D:.+]], float** %[[D_ADDR]], align 8
+// CHECK-NEXT: store i32 33, i32* %[[I]], align 4
+// CHECK-NEXT: %[[TMP0:.+]] = getelementptr inbounds %struct.anon, %struct.anon* %[[AGG_CAPTURED]], i32 0, i32 0
+// CHECK-NEXT: store i32* %[[I]], i32** %[[TMP0]], align 8
+// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* %[[AGG_CAPTURED1]], i32 0, i32 0
+// CHECK-NEXT: %[[TMP2:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: store i32 %[[TMP2]], i32* %[[TMP1]], align 4
+// CHECK-NEXT: call void @__captured_stmt(i32* %[[DOTCOUNT_ADDR]], %struct.anon* %[[AGG_CAPTURED]])
+// CHECK-NEXT: %[[DOTCOUNT:.+]] = load i32, i32* %[[DOTCOUNT_ADDR]], align 4
+// CHECK-NEXT: br label %[[OMP_LOOP_PREHEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_PREHEADER]]:
+// CHECK-NEXT: store i32 1, i32* %[[P_LOWERBOUND]], align 4
+// CHECK-NEXT: store i32 %[[DOTCOUNT]], i32* %[[P_UPPERBOUND]], align 4
+// CHECK-NEXT: store i32 1, i32* %[[P_STRIDE]], align 4
+// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+// CHECK-NEXT: call void @__kmpc_dispatch_init_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 35, i32 1, i32 %[[DOTCOUNT]], i32 1, i32 5)
+// CHECK-NEXT: br label %[[OMP_LOOP_PREHEADER_OUTER_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_HEADER:.*]]:
+// CHECK-NEXT: %[[OMP_LOOP_IV:.+]] = phi i32 [ %[[LB:.+]], %[[OMP_LOOP_PREHEADER_OUTER_COND]] ], [ %[[OMP_LOOP_NEXT:.+]], %[[OMP_LOOP_INC:.+]] ]
+// CHECK-NEXT: br label %[[OMP_LOOP_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_COND]]:
+// CHECK-NEXT: %[[UB:.+]] = load i32, i32* %[[P_UPPERBOUND]], align 4
+// CHECK-NEXT: %[[OMP_LOOP_CMP:.+]] = icmp ult i32 %[[OMP_LOOP_IV]], %[[UB]]
+// CHECK-NEXT: br i1 %[[OMP_LOOP_CMP]], label %[[OMP_LOOP_BODY:.+]], label %[[OMP_LOOP_PREHEADER_OUTER_COND]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_BODY]]:
+// CHECK-NEXT: call void @__captured_stmt.1(i32* %[[I]], i32 %[[OMP_LOOP_IV]], %struct.anon.0* %[[AGG_CAPTURED1]])
+// CHECK-NEXT: %[[TMP3:.+]] = load float*, float** %[[B_ADDR]], align 8
+// CHECK-NEXT: %[[TMP4:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM:.+]] = zext i32 %[[TMP4]] to i64
+// CHECK-NEXT: %[[ARRAYIDX:.+]] = getelementptr inbounds float, float* %[[TMP3]], i64 %[[IDXPROM]]
+// CHECK-NEXT: %[[TMP5:.+]] = load float, float* %[[ARRAYIDX]], align 4
+// CHECK-NEXT: %[[TMP6:.+]] = load float*, float** %[[C_ADDR]], align 8
+// CHECK-NEXT: %[[TMP7:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM2:.+]] = zext i32 %[[TMP7]] to i64
+// CHECK-NEXT: %[[ARRAYIDX3:.+]] = getelementptr inbounds float, float* %[[TMP6]], i64 %[[IDXPROM2]]
+// CHECK-NEXT: %[[TMP8:.+]] = load float, float* %[[ARRAYIDX3]], align 4
+// CHECK-NEXT: %[[MUL:.+]] = fmul float %[[TMP5]], %[[TMP8]]
+// CHECK-NEXT: %[[TMP9:.+]] = load float*, float** %[[D_ADDR]], align 8
+// CHECK-NEXT: %[[TMP10:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM4:.+]] = zext i32 %[[TMP10]] to i64
+// CHECK-NEXT: %[[ARRAYIDX5:.+]] = getelementptr inbounds float, float* %[[TMP9]], i64 %[[IDXPROM4]]
+// CHECK-NEXT: %[[TMP11:.+]] = load float, float* %[[ARRAYIDX5]], align 4
+// CHECK-NEXT: %[[MUL6:.+]] = fmul float %[[MUL]], %[[TMP11]]
+// CHECK-NEXT: %[[TMP12:.+]] = load float*, float** %[[A_ADDR]], align 8
+// CHECK-NEXT: %[[TMP13:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM7:.+]] = zext i32 %[[TMP13]] to i64
+// CHECK-NEXT: %[[ARRAYIDX8:.+]] = getelementptr inbounds float, float* %[[TMP12]], i64 %[[IDXPROM7]]
+// CHECK-NEXT: store float %[[MUL6]], float* %[[ARRAYIDX8]], align 4
+// CHECK-NEXT: br label %[[OMP_LOOP_INC]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_INC]]:
+// CHECK-NEXT: %[[OMP_LOOP_NEXT]] = add nuw i32 %[[OMP_LOOP_IV]], 1
+// CHECK-NEXT: br label %[[OMP_LOOP_HEADER]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_EXIT:.*]]:
+// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM9:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+// CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* @2, i32 %[[OMP_GLOBAL_THREAD_NUM9]])
+// CHECK-NEXT: br label %[[OMP_LOOP_AFTER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_AFTER]]:
+// CHECK-NEXT: ret void
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_PREHEADER_OUTER_COND]]:
+// CHECK-NEXT: %[[TMP14:.+]] = call i32 @__kmpc_dispatch_next_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32* %[[P_LASTITER]], i32* %[[P_LOWERBOUND]], i32* %[[P_UPPERBOUND]], i32* %[[P_STRIDE]])
+// CHECK-NEXT: %[[TMP15:.+]] = icmp ne i32 %[[TMP14]], 0
+// CHECK-NEXT: %[[TMP16:.+]] = load i32, i32* %[[P_LOWERBOUND]], align 4
+// CHECK-NEXT: %[[LB]] = sub i32 %[[TMP16]], 1
+// CHECK-NEXT: br i1 %[[TMP15]], label %[[OMP_LOOP_HEADER]], label %[[OMP_LOOP_EXIT]]
+// CHECK-NEXT: }
+
+extern "C" void workshareloop_unsigned_dynamic_chunked(float *a, float *b, float *c, float *d) {
+#pragma omp for schedule(dynamic, 5)
+ for (unsigned i = 33; i < 32000000; i += 7) {
+ a[i] = b[i] * c[i] * d[i];
+ }
+}
+
+#endif // HEADER
+
+// CHECK-LABEL: define {{.*}}@__captured_stmt(
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: %[[DISTANCE_ADDR:.+]] = alloca i32*, align 8
+// CHECK-NEXT: %[[__CONTEXT_ADDR:.+]] = alloca %struct.anon*, align 8
+// CHECK-NEXT: %[[DOTSTART:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[DOTSTOP:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[DOTSTEP:.+]] = alloca i32, align 4
+// CHECK-NEXT: store i32* %[[DISTANCE:.+]], i32** %[[DISTANCE_ADDR]], align 8
+// CHECK-NEXT: store %struct.anon* %[[__CONTEXT:.+]], %struct.anon** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP0:.+]] = load %struct.anon*, %struct.anon** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon, %struct.anon* %[[TMP0]], i32 0, i32 0
+// CHECK-NEXT: %[[TMP2:.+]] = load i32*, i32** %[[TMP1]], align 8
+// CHECK-NEXT: %[[TMP3:.+]] = load i32, i32* %[[TMP2]], align 4
+// CHECK-NEXT: store i32 %[[TMP3]], i32* %[[DOTSTART]], align 4
+// CHECK-NEXT: store i32 32000000, i32* %[[DOTSTOP]], align 4
+// CHECK-NEXT: store i32 7, i32* %[[DOTSTEP]], align 4
+// CHECK-NEXT: %[[TMP4:.+]] = load i32, i32* %[[DOTSTART]], align 4
+// CHECK-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTSTOP]], align 4
+// CHECK-NEXT: %[[CMP:.+]] = icmp ult i32 %[[TMP4]], %[[TMP5]]
+// CHECK-NEXT: br i1 %[[CMP]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_TRUE]]:
+// CHECK-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTSTOP]], align 4
+// CHECK-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTSTART]], align 4
+// CHECK-NEXT: %[[SUB:.+]] = sub i32 %[[TMP6]], %[[TMP7]]
+// CHECK-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTSTEP]], align 4
+// CHECK-NEXT: %[[SUB1:.+]] = sub i32 %[[TMP8]], 1
+// CHECK-NEXT: %[[ADD:.+]] = add i32 %[[SUB]], %[[SUB1]]
+// CHECK-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTSTEP]], align 4
+// CHECK-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP9]]
+// CHECK-NEXT: br label %[[COND_END:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_FALSE]]:
+// CHECK-NEXT: br label %[[COND_END]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_END]]:
+// CHECK-NEXT: %[[COND:.+]] = phi i32 [ %[[DIV]], %[[COND_TRUE]] ], [ 0, %[[COND_FALSE]] ]
+// CHECK-NEXT: %[[TMP10:.+]] = load i32*, i32** %[[DISTANCE_ADDR]], align 8
+// CHECK-NEXT: store i32 %[[COND]], i32* %[[TMP10]], align 4
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+
+// CHECK-LABEL: define {{.*}}@__captured_stmt.1(
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: %[[LOOPVAR_ADDR:.+]] = alloca i32*, align 8
+// CHECK-NEXT: %[[LOGICAL_ADDR:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[__CONTEXT_ADDR:.+]] = alloca %struct.anon.0*, align 8
+// CHECK-NEXT: store i32* %[[LOOPVAR:.+]], i32** %[[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT: store i32 %[[LOGICAL:.+]], i32* %[[LOGICAL_ADDR]], align 4
+// CHECK-NEXT: store %struct.anon.0* %[[__CONTEXT:.+]], %struct.anon.0** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP0:.+]] = load %struct.anon.0*, %struct.anon.0** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* %[[TMP0]], i32 0, i32 0
+// CHECK-NEXT: %[[TMP2:.+]] = load i32, i32* %[[TMP1]], align 4
+// CHECK-NEXT: %[[TMP3:.+]] = load i32, i32* %[[LOGICAL_ADDR]], align 4
+// CHECK-NEXT: %[[MUL:.+]] = mul i32 7, %[[TMP3]]
+// CHECK-NEXT: %[[ADD:.+]] = add i32 %[[TMP2]], %[[MUL]]
+// CHECK-NEXT: %[[TMP4:.+]] = load i32*, i32** %[[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT: store i32 %[[ADD]], i32* %[[TMP4]], align 4
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+
+// CHECK: ![[META0:[0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: ![[META1:[0-9]+]] = !{i32 7, !"openmp", i32 45}
+// CHECK: ![[META2:[0-9]+]] =
diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_runtime.c b/clang/test/OpenMP/irbuilder_for_unsigned_runtime.c
new file mode 100644
index 0000000000000..33483ca49cbca
--- /dev/null
+++ b/clang/test/OpenMP/irbuilder_for_unsigned_runtime.c
@@ -0,0 +1,173 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+// CHECK-LABEL: define {{.*}}@workshareloop_unsigned_runtime(
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: %[[A_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[B_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[C_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[D_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[I:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[AGG_CAPTURED:.+]] = alloca %struct.anon, align 8
+// CHECK-NEXT: %[[AGG_CAPTURED1:.+]] = alloca %struct.anon.0, align 4
+// CHECK-NEXT: %[[DOTCOUNT_ADDR:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_LASTITER:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_LOWERBOUND:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_UPPERBOUND:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_STRIDE:.+]] = alloca i32, align 4
+// CHECK-NEXT: store float* %[[A:.+]], float** %[[A_ADDR]], align 8
+// CHECK-NEXT: store float* %[[B:.+]], float** %[[B_ADDR]], align 8
+// CHECK-NEXT: store float* %[[C:.+]], float** %[[C_ADDR]], align 8
+// CHECK-NEXT: store float* %[[D:.+]], float** %[[D_ADDR]], align 8
+// CHECK-NEXT: store i32 33, i32* %[[I]], align 4
+// CHECK-NEXT: %[[TMP0:.+]] = getelementptr inbounds %struct.anon, %struct.anon* %[[AGG_CAPTURED]], i32 0, i32 0
+// CHECK-NEXT: store i32* %[[I]], i32** %[[TMP0]], align 8
+// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* %[[AGG_CAPTURED1]], i32 0, i32 0
+// CHECK-NEXT: %[[TMP2:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: store i32 %[[TMP2]], i32* %[[TMP1]], align 4
+// CHECK-NEXT: call void @__captured_stmt(i32* %[[DOTCOUNT_ADDR]], %struct.anon* %[[AGG_CAPTURED]])
+// CHECK-NEXT: %[[DOTCOUNT:.+]] = load i32, i32* %[[DOTCOUNT_ADDR]], align 4
+// CHECK-NEXT: br label %[[OMP_LOOP_PREHEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_PREHEADER]]:
+// CHECK-NEXT: store i32 1, i32* %[[P_LOWERBOUND]], align 4
+// CHECK-NEXT: store i32 %[[DOTCOUNT]], i32* %[[P_UPPERBOUND]], align 4
+// CHECK-NEXT: store i32 1, i32* %[[P_STRIDE]], align 4
+// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+// CHECK-NEXT: call void @__kmpc_dispatch_init_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 37, i32 1, i32 %[[DOTCOUNT]], i32 1, i32 1)
+// CHECK-NEXT: br label %[[OMP_LOOP_PREHEADER_OUTER_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_HEADER:.*]]:
+// CHECK-NEXT: %[[OMP_LOOP_IV:.+]] = phi i32 [ %[[LB:.+]], %[[OMP_LOOP_PREHEADER_OUTER_COND]] ], [ %[[OMP_LOOP_NEXT:.+]], %[[OMP_LOOP_INC:.+]] ]
+// CHECK-NEXT: br label %[[OMP_LOOP_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_COND]]:
+// CHECK-NEXT: %[[UB:.+]] = load i32, i32* %[[P_UPPERBOUND]], align 4
+// CHECK-NEXT: %[[OMP_LOOP_CMP:.+]] = icmp ult i32 %[[OMP_LOOP_IV]], %[[UB]]
+// CHECK-NEXT: br i1 %[[OMP_LOOP_CMP]], label %[[OMP_LOOP_BODY:.+]], label %[[OMP_LOOP_PREHEADER_OUTER_COND]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_BODY]]:
+// CHECK-NEXT: call void @__captured_stmt.1(i32* %[[I]], i32 %[[OMP_LOOP_IV]], %struct.anon.0* %[[AGG_CAPTURED1]])
+// CHECK-NEXT: %[[TMP3:.+]] = load float*, float** %[[B_ADDR]], align 8
+// CHECK-NEXT: %[[TMP4:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM:.+]] = zext i32 %[[TMP4]] to i64
+// CHECK-NEXT: %[[ARRAYIDX:.+]] = getelementptr inbounds float, float* %[[TMP3]], i64 %[[IDXPROM]]
+// CHECK-NEXT: %[[TMP5:.+]] = load float, float* %[[ARRAYIDX]], align 4
+// CHECK-NEXT: %[[TMP6:.+]] = load float*, float** %[[C_ADDR]], align 8
+// CHECK-NEXT: %[[TMP7:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM2:.+]] = zext i32 %[[TMP7]] to i64
+// CHECK-NEXT: %[[ARRAYIDX3:.+]] = getelementptr inbounds float, float* %[[TMP6]], i64 %[[IDXPROM2]]
+// CHECK-NEXT: %[[TMP8:.+]] = load float, float* %[[ARRAYIDX3]], align 4
+// CHECK-NEXT: %[[MUL:.+]] = fmul float %[[TMP5]], %[[TMP8]]
+// CHECK-NEXT: %[[TMP9:.+]] = load float*, float** %[[D_ADDR]], align 8
+// CHECK-NEXT: %[[TMP10:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM4:.+]] = zext i32 %[[TMP10]] to i64
+// CHECK-NEXT: %[[ARRAYIDX5:.+]] = getelementptr inbounds float, float* %[[TMP9]], i64 %[[IDXPROM4]]
+// CHECK-NEXT: %[[TMP11:.+]] = load float, float* %[[ARRAYIDX5]], align 4
+// CHECK-NEXT: %[[MUL6:.+]] = fmul float %[[MUL]], %[[TMP11]]
+// CHECK-NEXT: %[[TMP12:.+]] = load float*, float** %[[A_ADDR]], align 8
+// CHECK-NEXT: %[[TMP13:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM7:.+]] = zext i32 %[[TMP13]] to i64
+// CHECK-NEXT: %[[ARRAYIDX8:.+]] = getelementptr inbounds float, float* %[[TMP12]], i64 %[[IDXPROM7]]
+// CHECK-NEXT: store float %[[MUL6]], float* %[[ARRAYIDX8]], align 4
+// CHECK-NEXT: br label %[[OMP_LOOP_INC]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_INC]]:
+// CHECK-NEXT: %[[OMP_LOOP_NEXT]] = add nuw i32 %[[OMP_LOOP_IV]], 1
+// CHECK-NEXT: br label %[[OMP_LOOP_HEADER]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_EXIT:.*]]:
+// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM9:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+// CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* @2, i32 %[[OMP_GLOBAL_THREAD_NUM9]])
+// CHECK-NEXT: br label %[[OMP_LOOP_AFTER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_AFTER]]:
+// CHECK-NEXT: ret void
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_PREHEADER_OUTER_COND]]:
+// CHECK-NEXT: %[[TMP14:.+]] = call i32 @__kmpc_dispatch_next_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32* %[[P_LASTITER]], i32* %[[P_LOWERBOUND]], i32* %[[P_UPPERBOUND]], i32* %[[P_STRIDE]])
+// CHECK-NEXT: %[[TMP15:.+]] = icmp ne i32 %[[TMP14]], 0
+// CHECK-NEXT: %[[TMP16:.+]] = load i32, i32* %[[P_LOWERBOUND]], align 4
+// CHECK-NEXT: %[[LB]] = sub i32 %[[TMP16]], 1
+// CHECK-NEXT: br i1 %[[TMP15]], label %[[OMP_LOOP_HEADER]], label %[[OMP_LOOP_EXIT]]
+// CHECK-NEXT: }
+
+extern "C" void workshareloop_unsigned_runtime(float *a, float *b, float *c, float *d) {
+#pragma omp for schedule(runtime)
+ for (unsigned i = 33; i < 32000000; i += 7) {
+ a[i] = b[i] * c[i] * d[i];
+ }
+}
+
+#endif // HEADER
+
+// CHECK-LABEL: define {{.*}}@__captured_stmt(
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: %[[DISTANCE_ADDR:.+]] = alloca i32*, align 8
+// CHECK-NEXT: %[[__CONTEXT_ADDR:.+]] = alloca %struct.anon*, align 8
+// CHECK-NEXT: %[[DOTSTART:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[DOTSTOP:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[DOTSTEP:.+]] = alloca i32, align 4
+// CHECK-NEXT: store i32* %[[DISTANCE:.+]], i32** %[[DISTANCE_ADDR]], align 8
+// CHECK-NEXT: store %struct.anon* %[[__CONTEXT:.+]], %struct.anon** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP0:.+]] = load %struct.anon*, %struct.anon** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon, %struct.anon* %[[TMP0]], i32 0, i32 0
+// CHECK-NEXT: %[[TMP2:.+]] = load i32*, i32** %[[TMP1]], align 8
+// CHECK-NEXT: %[[TMP3:.+]] = load i32, i32* %[[TMP2]], align 4
+// CHECK-NEXT: store i32 %[[TMP3]], i32* %[[DOTSTART]], align 4
+// CHECK-NEXT: store i32 32000000, i32* %[[DOTSTOP]], align 4
+// CHECK-NEXT: store i32 7, i32* %[[DOTSTEP]], align 4
+// CHECK-NEXT: %[[TMP4:.+]] = load i32, i32* %[[DOTSTART]], align 4
+// CHECK-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTSTOP]], align 4
+// CHECK-NEXT: %[[CMP:.+]] = icmp ult i32 %[[TMP4]], %[[TMP5]]
+// CHECK-NEXT: br i1 %[[CMP]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_TRUE]]:
+// CHECK-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTSTOP]], align 4
+// CHECK-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTSTART]], align 4
+// CHECK-NEXT: %[[SUB:.+]] = sub i32 %[[TMP6]], %[[TMP7]]
+// CHECK-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTSTEP]], align 4
+// CHECK-NEXT: %[[SUB1:.+]] = sub i32 %[[TMP8]], 1
+// CHECK-NEXT: %[[ADD:.+]] = add i32 %[[SUB]], %[[SUB1]]
+// CHECK-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTSTEP]], align 4
+// CHECK-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP9]]
+// CHECK-NEXT: br label %[[COND_END:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_FALSE]]:
+// CHECK-NEXT: br label %[[COND_END]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_END]]:
+// CHECK-NEXT: %[[COND:.+]] = phi i32 [ %[[DIV]], %[[COND_TRUE]] ], [ 0, %[[COND_FALSE]] ]
+// CHECK-NEXT: %[[TMP10:.+]] = load i32*, i32** %[[DISTANCE_ADDR]], align 8
+// CHECK-NEXT: store i32 %[[COND]], i32* %[[TMP10]], align 4
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+
+// CHECK-LABEL: define {{.*}}@__captured_stmt.1(
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: %[[LOOPVAR_ADDR:.+]] = alloca i32*, align 8
+// CHECK-NEXT: %[[LOGICAL_ADDR:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[__CONTEXT_ADDR:.+]] = alloca %struct.anon.0*, align 8
+// CHECK-NEXT: store i32* %[[LOOPVAR:.+]], i32** %[[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT: store i32 %[[LOGICAL:.+]], i32* %[[LOGICAL_ADDR]], align 4
+// CHECK-NEXT: store %struct.anon.0* %[[__CONTEXT:.+]], %struct.anon.0** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP0:.+]] = load %struct.anon.0*, %struct.anon.0** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* %[[TMP0]], i32 0, i32 0
+// CHECK-NEXT: %[[TMP2:.+]] = load i32, i32* %[[TMP1]], align 4
+// CHECK-NEXT: %[[TMP3:.+]] = load i32, i32* %[[LOGICAL_ADDR]], align 4
+// CHECK-NEXT: %[[MUL:.+]] = mul i32 7, %[[TMP3]]
+// CHECK-NEXT: %[[ADD:.+]] = add i32 %[[TMP2]], %[[MUL]]
+// CHECK-NEXT: %[[TMP4:.+]] = load i32*, i32** %[[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT: store i32 %[[ADD]], i32* %[[TMP4]], align 4
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+
+// CHECK: ![[META0:[0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: ![[META1:[0-9]+]] = !{i32 7, !"openmp", i32 45}
+// CHECK: ![[META2:[0-9]+]] =
diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_static_chunked.c b/clang/test/OpenMP/irbuilder_for_unsigned_static_chunked.c
new file mode 100644
index 0000000000000..7e92120d43c8c
--- /dev/null
+++ b/clang/test/OpenMP/irbuilder_for_unsigned_static_chunked.c
@@ -0,0 +1,214 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+// CHECK-LABEL: define {{.*}}@workshareloop_unsigned_static_chunked(
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: %[[A_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[B_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[C_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[D_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[I:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[AGG_CAPTURED:.+]] = alloca %struct.anon, align 8
+// CHECK-NEXT: %[[AGG_CAPTURED1:.+]] = alloca %struct.anon.0, align 4
+// CHECK-NEXT: %[[DOTCOUNT_ADDR:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_LASTITER:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_LOWERBOUND:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_UPPERBOUND:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[P_STRIDE:.+]] = alloca i32, align 4
+// CHECK-NEXT: store float* %[[A:.+]], float** %[[A_ADDR]], align 8
+// CHECK-NEXT: store float* %[[B:.+]], float** %[[B_ADDR]], align 8
+// CHECK-NEXT: store float* %[[C:.+]], float** %[[C_ADDR]], align 8
+// CHECK-NEXT: store float* %[[D:.+]], float** %[[D_ADDR]], align 8
+// CHECK-NEXT: store i32 33, i32* %[[I]], align 4
+// CHECK-NEXT: %[[TMP0:.+]] = getelementptr inbounds %struct.anon, %struct.anon* %[[AGG_CAPTURED]], i32 0, i32 0
+// CHECK-NEXT: store i32* %[[I]], i32** %[[TMP0]], align 8
+// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* %[[AGG_CAPTURED1]], i32 0, i32 0
+// CHECK-NEXT: %[[TMP2:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: store i32 %[[TMP2]], i32* %[[TMP1]], align 4
+// CHECK-NEXT: call void @__captured_stmt(i32* %[[DOTCOUNT_ADDR]], %struct.anon* %[[AGG_CAPTURED]])
+// CHECK-NEXT: %[[DOTCOUNT:.+]] = load i32, i32* %[[DOTCOUNT_ADDR]], align 4
+// CHECK-NEXT: br label %[[OMP_LOOP_PREHEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_PREHEADER]]:
+// CHECK-NEXT: store i32 0, i32* %[[P_LOWERBOUND]], align 4
+// CHECK-NEXT: %[[TMP3:.+]] = sub i32 %[[DOTCOUNT]], 1
+// CHECK-NEXT: store i32 %[[TMP3]], i32* %[[P_UPPERBOUND]], align 4
+// CHECK-NEXT: store i32 1, i32* %[[P_STRIDE]], align 4
+// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 33, i32* %[[P_LASTITER]], i32* %[[P_LOWERBOUND]], i32* %[[P_UPPERBOUND]], i32* %[[P_STRIDE]], i32 1, i32 5)
+// CHECK-NEXT: %[[OMP_FIRSTCHUNK_LB:.+]] = load i32, i32* %[[P_LOWERBOUND]], align 4
+// CHECK-NEXT: %[[OMP_FIRSTCHUNK_UB:.+]] = load i32, i32* %[[P_UPPERBOUND]], align 4
+// CHECK-NEXT: %[[TMP4:.+]] = add i32 %[[OMP_FIRSTCHUNK_UB]], 1
+// CHECK-NEXT: %[[OMP_CHUNK_RANGE:.+]] = sub i32 %[[TMP4]], %[[OMP_FIRSTCHUNK_LB]]
+// CHECK-NEXT: %[[OMP_DISPATCH_STRIDE:.+]] = load i32, i32* %[[P_STRIDE]], align 4
+// CHECK-NEXT: %[[TMP5:.+]] = sub nuw i32 %[[DOTCOUNT]], %[[OMP_FIRSTCHUNK_LB]]
+// CHECK-NEXT: %[[TMP6:.+]] = icmp ule i32 %[[DOTCOUNT]], %[[OMP_FIRSTCHUNK_LB]]
+// CHECK-NEXT: %[[TMP7:.+]] = sub i32 %[[TMP5]], 1
+// CHECK-NEXT: %[[TMP8:.+]] = udiv i32 %[[TMP7]], %[[OMP_DISPATCH_STRIDE]]
+// CHECK-NEXT: %[[TMP9:.+]] = add i32 %[[TMP8]], 1
+// CHECK-NEXT: %[[TMP10:.+]] = icmp ule i32 %[[TMP5]], %[[OMP_DISPATCH_STRIDE]]
+// CHECK-NEXT: %[[TMP11:.+]] = select i1 %[[TMP10]], i32 1, i32 %[[TMP9]]
+// CHECK-NEXT: %[[OMP_DISPATCH_TRIPCOUNT:.+]] = select i1 %[[TMP6]], i32 0, i32 %[[TMP11]]
+// CHECK-NEXT: br label %[[OMP_DISPATCH_PREHEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_DISPATCH_PREHEADER]]:
+// CHECK-NEXT: br label %[[OMP_DISPATCH_HEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_DISPATCH_HEADER]]:
+// CHECK-NEXT: %[[OMP_DISPATCH_IV:.+]] = phi i32 [ 0, %[[OMP_DISPATCH_PREHEADER]] ], [ %[[OMP_DISPATCH_NEXT:.+]], %[[OMP_DISPATCH_INC:.+]] ]
+// CHECK-NEXT: br label %[[OMP_DISPATCH_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_DISPATCH_COND]]:
+// CHECK-NEXT: %[[OMP_DISPATCH_CMP:.+]] = icmp ult i32 %[[OMP_DISPATCH_IV]], %[[OMP_DISPATCH_TRIPCOUNT]]
+// CHECK-NEXT: br i1 %[[OMP_DISPATCH_CMP]], label %[[OMP_DISPATCH_BODY:.+]], label %[[OMP_DISPATCH_EXIT:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_DISPATCH_BODY]]:
+// CHECK-NEXT: %[[TMP12:.+]] = mul i32 %[[OMP_DISPATCH_IV]], %[[OMP_DISPATCH_STRIDE]]
+// CHECK-NEXT: %[[TMP13:.+]] = add i32 %[[TMP12]], %[[OMP_FIRSTCHUNK_LB]]
+// CHECK-NEXT: br label %[[OMP_LOOP_PREHEADER9:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_DISPATCH_INC]]:
+// CHECK-NEXT: %[[OMP_DISPATCH_NEXT]] = add nuw i32 %[[OMP_DISPATCH_IV]], 1
+// CHECK-NEXT: br label %[[OMP_DISPATCH_HEADER]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_DISPATCH_EXIT]]:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]])
+// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM10:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+// CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* @2, i32 %[[OMP_GLOBAL_THREAD_NUM10]])
+// CHECK-NEXT: br label %[[OMP_DISPATCH_AFTER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_DISPATCH_AFTER]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_AFTER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_PREHEADER9]]:
+// CHECK-NEXT: %[[TMP14:.+]] = add i32 %[[TMP13]], %[[OMP_CHUNK_RANGE]]
+// CHECK-NEXT: %[[OMP_CHUNK_IS_LAST:.+]] = icmp uge i32 %[[TMP14]], %[[DOTCOUNT]]
+// CHECK-NEXT: %[[TMP15:.+]] = sub i32 %[[DOTCOUNT]], %[[TMP13]]
+// CHECK-NEXT: %[[OMP_CHUNK_TRIPCOUNT:.+]] = select i1 %[[OMP_CHUNK_IS_LAST]], i32 %[[TMP15]], i32 %[[OMP_CHUNK_RANGE]]
+// CHECK-NEXT: br label %[[OMP_LOOP_HEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_HEADER]]:
+// CHECK-NEXT: %[[OMP_LOOP_IV:.+]] = phi i32 [ 0, %[[OMP_LOOP_PREHEADER9]] ], [ %[[OMP_LOOP_NEXT:.+]], %[[OMP_LOOP_INC:.+]] ]
+// CHECK-NEXT: br label %[[OMP_LOOP_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_COND]]:
+// CHECK-NEXT: %[[OMP_LOOP_CMP:.+]] = icmp ult i32 %[[OMP_LOOP_IV]], %[[OMP_CHUNK_TRIPCOUNT]]
+// CHECK-NEXT: br i1 %[[OMP_LOOP_CMP]], label %[[OMP_LOOP_BODY:.+]], label %[[OMP_LOOP_EXIT:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_BODY]]:
+// CHECK-NEXT: %[[TMP16:.+]] = add i32 %[[OMP_LOOP_IV]], %[[TMP13]]
+// CHECK-NEXT: call void @__captured_stmt.1(i32* %[[I]], i32 %[[TMP16]], %struct.anon.0* %[[AGG_CAPTURED1]])
+// CHECK-NEXT: %[[TMP17:.+]] = load float*, float** %[[B_ADDR]], align 8
+// CHECK-NEXT: %[[TMP18:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM:.+]] = zext i32 %[[TMP18]] to i64
+// CHECK-NEXT: %[[ARRAYIDX:.+]] = getelementptr inbounds float, float* %[[TMP17]], i64 %[[IDXPROM]]
+// CHECK-NEXT: %[[TMP19:.+]] = load float, float* %[[ARRAYIDX]], align 4
+// CHECK-NEXT: %[[TMP20:.+]] = load float*, float** %[[C_ADDR]], align 8
+// CHECK-NEXT: %[[TMP21:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM2:.+]] = zext i32 %[[TMP21]] to i64
+// CHECK-NEXT: %[[ARRAYIDX3:.+]] = getelementptr inbounds float, float* %[[TMP20]], i64 %[[IDXPROM2]]
+// CHECK-NEXT: %[[TMP22:.+]] = load float, float* %[[ARRAYIDX3]], align 4
+// CHECK-NEXT: %[[MUL:.+]] = fmul float %[[TMP19]], %[[TMP22]]
+// CHECK-NEXT: %[[TMP23:.+]] = load float*, float** %[[D_ADDR]], align 8
+// CHECK-NEXT: %[[TMP24:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM4:.+]] = zext i32 %[[TMP24]] to i64
+// CHECK-NEXT: %[[ARRAYIDX5:.+]] = getelementptr inbounds float, float* %[[TMP23]], i64 %[[IDXPROM4]]
+// CHECK-NEXT: %[[TMP25:.+]] = load float, float* %[[ARRAYIDX5]], align 4
+// CHECK-NEXT: %[[MUL6:.+]] = fmul float %[[MUL]], %[[TMP25]]
+// CHECK-NEXT: %[[TMP26:.+]] = load float*, float** %[[A_ADDR]], align 8
+// CHECK-NEXT: %[[TMP27:.+]] = load i32, i32* %[[I]], align 4
+// CHECK-NEXT: %[[IDXPROM7:.+]] = zext i32 %[[TMP27]] to i64
+// CHECK-NEXT: %[[ARRAYIDX8:.+]] = getelementptr inbounds float, float* %[[TMP26]], i64 %[[IDXPROM7]]
+// CHECK-NEXT: store float %[[MUL6]], float* %[[ARRAYIDX8]], align 4
+// CHECK-NEXT: br label %[[OMP_LOOP_INC]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_INC]]:
+// CHECK-NEXT: %[[OMP_LOOP_NEXT]] = add nuw i32 %[[OMP_LOOP_IV]], 1
+// CHECK-NEXT: br label %[[OMP_LOOP_HEADER]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_EXIT]]:
+// CHECK-NEXT: br label %[[OMP_DISPATCH_INC]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_AFTER]]:
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+extern "C" void workshareloop_unsigned_static_chunked(float *a, float *b, float *c, float *d) {
+#pragma omp for schedule(static, 5)
+ for (unsigned i = 33; i < 32000000; i += 7) {
+ a[i] = b[i] * c[i] * d[i];
+ }
+}
+
+#endif // HEADER
+
+// CHECK-LABEL: define {{.*}}@__captured_stmt(
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: %[[DISTANCE_ADDR:.+]] = alloca i32*, align 8
+// CHECK-NEXT: %[[__CONTEXT_ADDR:.+]] = alloca %struct.anon*, align 8
+// CHECK-NEXT: %[[DOTSTART:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[DOTSTOP:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[DOTSTEP:.+]] = alloca i32, align 4
+// CHECK-NEXT: store i32* %[[DISTANCE:.+]], i32** %[[DISTANCE_ADDR]], align 8
+// CHECK-NEXT: store %struct.anon* %[[__CONTEXT:.+]], %struct.anon** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP0:.+]] = load %struct.anon*, %struct.anon** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon, %struct.anon* %[[TMP0]], i32 0, i32 0
+// CHECK-NEXT: %[[TMP2:.+]] = load i32*, i32** %[[TMP1]], align 8
+// CHECK-NEXT: %[[TMP3:.+]] = load i32, i32* %[[TMP2]], align 4
+// CHECK-NEXT: store i32 %[[TMP3]], i32* %[[DOTSTART]], align 4
+// CHECK-NEXT: store i32 32000000, i32* %[[DOTSTOP]], align 4
+// CHECK-NEXT: store i32 7, i32* %[[DOTSTEP]], align 4
+// CHECK-NEXT: %[[TMP4:.+]] = load i32, i32* %[[DOTSTART]], align 4
+// CHECK-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTSTOP]], align 4
+// CHECK-NEXT: %[[CMP:.+]] = icmp ult i32 %[[TMP4]], %[[TMP5]]
+// CHECK-NEXT: br i1 %[[CMP]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_TRUE]]:
+// CHECK-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTSTOP]], align 4
+// CHECK-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTSTART]], align 4
+// CHECK-NEXT: %[[SUB:.+]] = sub i32 %[[TMP6]], %[[TMP7]]
+// CHECK-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTSTEP]], align 4
+// CHECK-NEXT: %[[SUB1:.+]] = sub i32 %[[TMP8]], 1
+// CHECK-NEXT: %[[ADD:.+]] = add i32 %[[SUB]], %[[SUB1]]
+// CHECK-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTSTEP]], align 4
+// CHECK-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP9]]
+// CHECK-NEXT: br label %[[COND_END:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_FALSE]]:
+// CHECK-NEXT: br label %[[COND_END]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_END]]:
+// CHECK-NEXT: %[[COND:.+]] = phi i32 [ %[[DIV]], %[[COND_TRUE]] ], [ 0, %[[COND_FALSE]] ]
+// CHECK-NEXT: %[[TMP10:.+]] = load i32*, i32** %[[DISTANCE_ADDR]], align 8
+// CHECK-NEXT: store i32 %[[COND]], i32* %[[TMP10]], align 4
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+
+// CHECK-LABEL: define {{.*}}@__captured_stmt.1(
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: %[[LOOPVAR_ADDR:.+]] = alloca i32*, align 8
+// CHECK-NEXT: %[[LOGICAL_ADDR:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[__CONTEXT_ADDR:.+]] = alloca %struct.anon.0*, align 8
+// CHECK-NEXT: store i32* %[[LOOPVAR:.+]], i32** %[[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT: store i32 %[[LOGICAL:.+]], i32* %[[LOGICAL_ADDR]], align 4
+// CHECK-NEXT: store %struct.anon.0* %[[__CONTEXT:.+]], %struct.anon.0** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP0:.+]] = load %struct.anon.0*, %struct.anon.0** %[[__CONTEXT_ADDR]], align 8
+// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* %[[TMP0]], i32 0, i32 0
+// CHECK-NEXT: %[[TMP2:.+]] = load i32, i32* %[[TMP1]], align 4
+// CHECK-NEXT: %[[TMP3:.+]] = load i32, i32* %[[LOGICAL_ADDR]], align 4
+// CHECK-NEXT: %[[MUL:.+]] = mul i32 7, %[[TMP3]]
+// CHECK-NEXT: %[[ADD:.+]] = add i32 %[[TMP2]], %[[MUL]]
+// CHECK-NEXT: %[[TMP4:.+]] = load i32*, i32** %[[LOOPVAR_ADDR]], align 8
+// CHECK-NEXT: store i32 %[[ADD]], i32* %[[TMP4]], align 4
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+
+// CHECK: ![[META0:[0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: ![[META1:[0-9]+]] = !{i32 7, !"openmp", i32 45}
+// CHECK: ![[META2:[0-9]+]] =
diff --git a/clang/test/OpenMP/irbuilder_nested_parallel_for.c b/clang/test/OpenMP/irbuilder_nested_parallel_for.c
index 7dde6e1f89d2a..f4e24343eb52c 100644
--- a/clang/test/OpenMP/irbuilder_nested_parallel_for.c
+++ b/clang/test/OpenMP/irbuilder_nested_parallel_for.c
@@ -78,12 +78,12 @@ void parallel_for_0(void) {
// CHECK-DEBUG-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK-DEBUG-NEXT: [[B_ADDR:%.*]] = alloca double, align 8
// CHECK-DEBUG-NEXT: store float* [[R:%.*]], float** [[R_ADDR]], align 8
-// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata float** [[R_ADDR]], metadata [[META72:![0-9]+]], metadata !DIExpression()), !dbg [[DBG73:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata float** [[R_ADDR]], metadata [[META73:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74:![0-9]+]]
// CHECK-DEBUG-NEXT: store i32 [[A:%.*]], i32* [[A_ADDR]], align 4
-// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META74:![0-9]+]], metadata !DIExpression()), !dbg [[DBG75:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META75:![0-9]+]], metadata !DIExpression()), !dbg [[DBG76:![0-9]+]]
// CHECK-DEBUG-NEXT: store double [[B:%.*]], double* [[B_ADDR]], align 8
-// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata double* [[B_ADDR]], metadata [[META76:![0-9]+]], metadata !DIExpression()), !dbg [[DBG77:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]]), !dbg [[DBG78:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata double* [[B_ADDR]], metadata [[META77:![0-9]+]], metadata !DIExpression()), !dbg [[DBG78:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB6:[0-9]+]]), !dbg [[DBG79:![0-9]+]]
// CHECK-DEBUG-NEXT: br label [[OMP_PARALLEL:%.*]]
// CHECK-DEBUG: omp_parallel:
// CHECK-DEBUG-NEXT: [[GEP_STRUCTARG:%.*]] = getelementptr { { i32*, double*, float** }*, i32*, double*, float** }, { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG17]], i32 0, i32 0
@@ -94,12 +94,12 @@ void parallel_for_0(void) {
// CHECK-DEBUG-NEXT: store double* [[B_ADDR]], double** [[GEP_B_ADDR19]], align 8
// CHECK-DEBUG-NEXT: [[GEP_R_ADDR20:%.*]] = getelementptr { { i32*, double*, float** }*, i32*, double*, float** }, { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG17]], i32 0, i32 3
// CHECK-DEBUG-NEXT: store float** [[R_ADDR]], float*** [[GEP_R_ADDR20]], align 8
-// CHECK-DEBUG-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB6]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { { i32*, double*, float** }*, i32*, double*, float** }*)* @_Z14parallel_for_1Pfid..omp_par.4 to void (i32*, i32*, ...)*), { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG17]]), !dbg [[DBG79:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB6]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { { i32*, double*, float** }*, i32*, double*, float** }*)* @_Z14parallel_for_1Pfid..omp_par.4 to void (i32*, i32*, ...)*), { { i32*, double*, float** }*, i32*, double*, float** }* [[STRUCTARG17]]), !dbg [[DBG80:![0-9]+]]
// CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT16:%.*]]
// CHECK-DEBUG: omp.par.outlined.exit16:
// CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
// CHECK-DEBUG: omp.par.exit.split:
-// CHECK-DEBUG-NEXT: ret void, !dbg [[DBG81:![0-9]+]]
+// CHECK-DEBUG-NEXT: ret void, !dbg [[DBG82:![0-9]+]]
//
void parallel_for_1(float *r, int a, double b) {
#pragma omp parallel
@@ -169,7 +169,7 @@ void parallel_for_1(float *r, int a, double b) {
// CHECK-NEXT: store i32 [[TMP3]], i32* [[P_UPPERBOUND205]], align 4
// CHECK-NEXT: store i32 1, i32* [[P_STRIDE206]], align 4
// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM207:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM207]], i32 34, i32* [[P_LASTITER203]], i32* [[P_LOWERBOUND204]], i32* [[P_UPPERBOUND205]], i32* [[P_STRIDE206]], i32 1, i32 1)
+// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM207]], i32 34, i32* [[P_LASTITER203]], i32* [[P_LOWERBOUND204]], i32* [[P_UPPERBOUND205]], i32* [[P_STRIDE206]], i32 1, i32 0)
// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[P_LOWERBOUND204]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[P_UPPERBOUND205]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]]
@@ -221,12 +221,12 @@ void parallel_for_1(float *r, int a, double b) {
// CHECK-DEBUG-NEXT: [[P_UPPERBOUND205:%.*]] = alloca i32, align 4
// CHECK-DEBUG-NEXT: [[P_STRIDE206:%.*]] = alloca i32, align 4
// CHECK-DEBUG-NEXT: store float* [[R:%.*]], float** [[R_ADDR]], align 8
-// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata float** [[R_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG134:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata float** [[R_ADDR]], metadata [[META134:![0-9]+]], metadata !DIExpression()), !dbg [[DBG135:![0-9]+]]
// CHECK-DEBUG-NEXT: store i32 [[A:%.*]], i32* [[A_ADDR]], align 4
-// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META135:![0-9]+]], metadata !DIExpression()), !dbg [[DBG136:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata [[META136:![0-9]+]], metadata !DIExpression()), !dbg [[DBG137:![0-9]+]]
// CHECK-DEBUG-NEXT: store double [[B:%.*]], double* [[B_ADDR]], align 8
-// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata double* [[B_ADDR]], metadata [[META137:![0-9]+]], metadata !DIExpression()), !dbg [[DBG138:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB13:[0-9]+]]), !dbg [[DBG139:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata double* [[B_ADDR]], metadata [[META138:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB13:[0-9]+]]), !dbg [[DBG140:![0-9]+]]
// CHECK-DEBUG-NEXT: br label [[OMP_PARALLEL:%.*]]
// CHECK-DEBUG: omp_parallel:
// CHECK-DEBUG-NEXT: [[GEP_STRUCTARG214:%.*]] = getelementptr { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]], i32 0, i32 0
@@ -241,60 +241,60 @@ void parallel_for_1(float *r, int a, double b) {
// CHECK-DEBUG-NEXT: store float** [[R_ADDR]], float*** [[GEP_R_ADDR]], align 8
// CHECK-DEBUG-NEXT: [[GEP_STRUCTARG209220:%.*]] = getelementptr { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]], i32 0, i32 5
// CHECK-DEBUG-NEXT: store { i32*, double*, float** }* [[STRUCTARG209]], { i32*, double*, float** }** [[GEP_STRUCTARG209220]], align 8
-// CHECK-DEBUG-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB13]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*)* @_Z14parallel_for_2Pfid..omp_par.23 to void (i32*, i32*, ...)*), { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]]), !dbg [[DBG140:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB13]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*)* @_Z14parallel_for_2Pfid..omp_par.23 to void (i32*, i32*, ...)*), { { { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }*, { i32*, double*, float** }*, i32*, double*, float**, { i32*, double*, float** }* }* [[STRUCTARG218]]), !dbg [[DBG141:![0-9]+]]
// CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT184:%.*]]
// CHECK-DEBUG: omp.par.outlined.exit184:
// CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
// CHECK-DEBUG: omp.par.exit.split:
-// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[I185]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG147:![0-9]+]]
-// CHECK-DEBUG-NEXT: store i32 0, i32* [[I185]], align 4, !dbg [[DBG147]]
-// CHECK-DEBUG-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANON_17]], %struct.anon.17* [[AGG_CAPTURED186]], i32 0, i32 0, !dbg [[DBG148:![0-9]+]]
-// CHECK-DEBUG-NEXT: store i32* [[I185]], i32** [[TMP0]], align 8, !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_18]], %struct.anon.18* [[AGG_CAPTURED187]], i32 0, i32 0, !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load i32, i32* [[I185]], align 4, !dbg [[DBG149:![0-9]+]]
-// CHECK-DEBUG-NEXT: store i32 [[TMP2]], i32* [[TMP1]], align 4, !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: call void @__captured_stmt.19(i32* [[DOTCOUNT_ADDR188]], %struct.anon.17* [[AGG_CAPTURED186]]), !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: [[DOTCOUNT189:%.*]] = load i32, i32* [[DOTCOUNT_ADDR188]], align 4, !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER190:%.*]], !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[I185]], metadata [[META145:![0-9]+]], metadata !DIExpression()), !dbg [[DBG148:![0-9]+]]
+// CHECK-DEBUG-NEXT: store i32 0, i32* [[I185]], align 4, !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANON_17]], %struct.anon.17* [[AGG_CAPTURED186]], i32 0, i32 0, !dbg [[DBG149:![0-9]+]]
+// CHECK-DEBUG-NEXT: store i32* [[I185]], i32** [[TMP0]], align 8, !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_18]], %struct.anon.18* [[AGG_CAPTURED187]], i32 0, i32 0, !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load i32, i32* [[I185]], align 4, !dbg [[DBG150:![0-9]+]]
+// CHECK-DEBUG-NEXT: store i32 [[TMP2]], i32* [[TMP1]], align 4, !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: call void @__captured_stmt.19(i32* [[DOTCOUNT_ADDR188]], %struct.anon.17* [[AGG_CAPTURED186]]), !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: [[DOTCOUNT189:%.*]] = load i32, i32* [[DOTCOUNT_ADDR188]], align 4, !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER190:%.*]], !dbg [[DBG149]]
// CHECK-DEBUG: omp_loop.preheader190:
-// CHECK-DEBUG-NEXT: store i32 0, i32* [[P_LOWERBOUND204]], align 4, !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: [[TMP3:%.*]] = sub i32 [[DOTCOUNT189]], 1, !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: store i32 [[TMP3]], i32* [[P_UPPERBOUND205]], align 4, !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: store i32 1, i32* [[P_STRIDE206]], align 4, !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM207:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB42:[0-9]+]]), !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM207]], i32 34, i32* [[P_LASTITER203]], i32* [[P_LOWERBOUND204]], i32* [[P_UPPERBOUND205]], i32* [[P_STRIDE206]], i32 1, i32 1), !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load i32, i32* [[P_LOWERBOUND204]], align 4, !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: [[TMP5:%.*]] = load i32, i32* [[P_UPPERBOUND205]], align 4, !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]], !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 1, !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER191:%.*]], !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT: store i32 0, i32* [[P_LOWERBOUND204]], align 4, !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: [[TMP3:%.*]] = sub i32 [[DOTCOUNT189]], 1, !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: store i32 [[TMP3]], i32* [[P_UPPERBOUND205]], align 4, !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: store i32 1, i32* [[P_STRIDE206]], align 4, !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM207:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB42:[0-9]+]]), !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM207]], i32 34, i32* [[P_LASTITER203]], i32* [[P_LOWERBOUND204]], i32* [[P_UPPERBOUND205]], i32* [[P_STRIDE206]], i32 1, i32 0), !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load i32, i32* [[P_LOWERBOUND204]], align 4, !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: [[TMP5:%.*]] = load i32, i32* [[P_UPPERBOUND205]], align 4, !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]], !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 1, !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER191:%.*]], !dbg [[DBG149]]
// CHECK-DEBUG: omp_loop.header191:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_IV197:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER190]] ], [ [[OMP_LOOP_NEXT199:%.*]], [[OMP_LOOP_INC194:%.*]] ], !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND192:%.*]], !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_IV197:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER190]] ], [ [[OMP_LOOP_NEXT199:%.*]], [[OMP_LOOP_INC194:%.*]] ], !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND192:%.*]], !dbg [[DBG149]]
// CHECK-DEBUG: omp_loop.cond192:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP198:%.*]] = icmp ult i32 [[OMP_LOOP_IV197]], [[TMP7]], !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP198]], label [[OMP_LOOP_BODY193:%.*]], label [[OMP_LOOP_EXIT195:%.*]], !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP198:%.*]] = icmp ult i32 [[OMP_LOOP_IV197]], [[TMP7]], !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP198]], label [[OMP_LOOP_BODY193:%.*]], label [[OMP_LOOP_EXIT195:%.*]], !dbg [[DBG149]]
// CHECK-DEBUG: omp_loop.body193:
-// CHECK-DEBUG-NEXT: [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV197]], [[TMP4]], !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: call void @__captured_stmt.20(i32* [[I185]], i32 [[TMP8]], %struct.anon.18* [[AGG_CAPTURED187]]), !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG150:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[CONV200:%.*]] = sitofp i32 [[TMP9]] to double, !dbg [[DBG150]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load double, double* [[B_ADDR]], align 8, !dbg [[DBG151:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[ADD201:%.*]] = fadd double [[CONV200]], [[TMP10]], !dbg [[DBG152:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[CONV202:%.*]] = fptrunc double [[ADD201]] to float, !dbg [[DBG150]]
-// CHECK-DEBUG-NEXT: [[TMP11:%.*]] = load float*, float** [[R_ADDR]], align 8, !dbg [[DBG153:![0-9]+]]
-// CHECK-DEBUG-NEXT: store float [[CONV202]], float* [[TMP11]], align 4, !dbg [[DBG154:![0-9]+]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC194]], !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT: [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV197]], [[TMP4]], !dbg [[DBG151:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void @__captured_stmt.20(i32* [[I185]], i32 [[TMP8]], %struct.anon.18* [[AGG_CAPTURED187]]), !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg [[DBG152:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[CONV200:%.*]] = sitofp i32 [[TMP9]] to double, !dbg [[DBG152]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load double, double* [[B_ADDR]], align 8, !dbg [[DBG151]]
+// CHECK-DEBUG-NEXT: [[ADD201:%.*]] = fadd double [[CONV200]], [[TMP10]], !dbg [[DBG153:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[CONV202:%.*]] = fptrunc double [[ADD201]] to float, !dbg [[DBG152]]
+// CHECK-DEBUG-NEXT: [[TMP11:%.*]] = load float*, float** [[R_ADDR]], align 8, !dbg [[DBG154:![0-9]+]]
+// CHECK-DEBUG-NEXT: store float [[CONV202]], float* [[TMP11]], align 4, !dbg [[DBG155:![0-9]+]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC194]], !dbg [[DBG149]]
// CHECK-DEBUG: omp_loop.inc194:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT199]] = add nuw i32 [[OMP_LOOP_IV197]], 1, !dbg [[DBG148]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER191]], !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT199]] = add nuw i32 [[OMP_LOOP_IV197]], 1, !dbg [[DBG149]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER191]], !dbg [[DBG149]]
// CHECK-DEBUG: omp_loop.exit195:
-// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM207]]), !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM207]]), !dbg [[DBG149]]
// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM208:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB42]]), !dbg [[DBG151]]
// CHECK-DEBUG-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB43:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM208]]), !dbg [[DBG151]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER196:%.*]], !dbg [[DBG148]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER196:%.*]], !dbg [[DBG149]]
// CHECK-DEBUG: omp_loop.after196:
-// CHECK-DEBUG-NEXT: ret void, !dbg [[DBG155:![0-9]+]]
+// CHECK-DEBUG-NEXT: ret void, !dbg [[DBG156:![0-9]+]]
//
void parallel_for_2(float *r, int a, double b) {
#pragma omp parallel
diff --git a/clang/test/OpenMP/irbuilder_unroll_partial_factor_for.c b/clang/test/OpenMP/irbuilder_unroll_partial_factor_for.c
index 7bcfeab076c3e..457e6b4e594a0 100644
--- a/clang/test/OpenMP/irbuilder_unroll_partial_factor_for.c
+++ b/clang/test/OpenMP/irbuilder_unroll_partial_factor_for.c
@@ -51,7 +51,7 @@
// CHECK-NEXT: store i32 %[[TMP8]], i32* %[[P_UPPERBOUND]], align 4
// CHECK-NEXT: store i32 1, i32* %[[P_STRIDE]], align 4
// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
-// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* %[[P_LASTITER]], i32* %[[P_LOWERBOUND]], i32* %[[P_UPPERBOUND]], i32* %[[P_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* %[[P_LASTITER]], i32* %[[P_LOWERBOUND]], i32* %[[P_UPPERBOUND]], i32* %[[P_STRIDE]], i32 1, i32 0)
// CHECK-NEXT: %[[TMP9:.+]] = load i32, i32* %[[P_LOWERBOUND]], align 4
// CHECK-NEXT: %[[TMP10:.+]] = load i32, i32* %[[P_UPPERBOUND]], align 4
// CHECK-NEXT: %[[TMP11:.+]] = sub i32 %[[TMP10]], %[[TMP9]]
diff --git a/clang/test/OpenMP/irbuilder_unroll_partial_heuristic_constant_for.c b/clang/test/OpenMP/irbuilder_unroll_partial_heuristic_constant_for.c
index 34dd745aa855f..cc7cead59de6a 100644
--- a/clang/test/OpenMP/irbuilder_unroll_partial_heuristic_constant_for.c
+++ b/clang/test/OpenMP/irbuilder_unroll_partial_heuristic_constant_for.c
@@ -57,7 +57,7 @@ double sind(double);
// CHECK-NEXT: store i32 %[[TMP7]], i32* %[[P_UPPERBOUND]], align 4
// CHECK-NEXT: store i32 1, i32* %[[P_STRIDE]], align 4
// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
-// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* %[[P_LASTITER]], i32* %[[P_LOWERBOUND]], i32* %[[P_UPPERBOUND]], i32* %[[P_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* %[[P_LASTITER]], i32* %[[P_LOWERBOUND]], i32* %[[P_UPPERBOUND]], i32* %[[P_STRIDE]], i32 1, i32 0)
// CHECK-NEXT: %[[TMP8:.+]] = load i32, i32* %[[P_LOWERBOUND]], align 4
// CHECK-NEXT: %[[TMP9:.+]] = load i32, i32* %[[P_UPPERBOUND]], align 4
// CHECK-NEXT: %[[TMP10:.+]] = sub i32 %[[TMP9]], %[[TMP8]]
diff --git a/clang/test/OpenMP/irbuilder_unroll_partial_heuristic_for_collapse.c b/clang/test/OpenMP/irbuilder_unroll_partial_heuristic_for_collapse.c
index be974d5454bd1..11370db6b6c1d 100644
--- a/clang/test/OpenMP/irbuilder_unroll_partial_heuristic_for_collapse.c
+++ b/clang/test/OpenMP/irbuilder_unroll_partial_heuristic_for_collapse.c
@@ -9,6 +9,202 @@
double sind(double);
+// CHECK-LABEL: define {{.*}}@unroll_partial_heuristic_for(
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: %[[M_ADDR:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[A_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[B_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[C_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[D_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[E_ADDR:.+]] = alloca float*, align 8
+// CHECK-NEXT: %[[OFFSET_ADDR:.+]] = alloca float, align 4
+// CHECK-NEXT: %[[DOTOMP_IV:.+]] = alloca i64, align 8
+// CHECK-NEXT: %[[TMP:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[TMP1:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[J:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i64, align 8
+// CHECK-NEXT: %[[I:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[DOTUNROLLED_IV_J:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[DOTOMP_LB:.+]] = alloca i64, align 8
+// CHECK-NEXT: %[[DOTOMP_UB:.+]] = alloca i64, align 8
+// CHECK-NEXT: %[[DOTOMP_STRIDE:.+]] = alloca i64, align 8
+// CHECK-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[I6:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[DOTUNROLLED_IV_J7:.+]] = alloca i32, align 4
+// CHECK-NEXT: %[[DOTUNROLL_INNER_IV_J:.+]] = alloca i32, align 4
+// CHECK-NEXT: store i32 %[[M:.+]], i32* %[[M_ADDR]], align 4
+// CHECK-NEXT: store float* %[[A:.+]], float** %[[A_ADDR]], align 8
+// CHECK-NEXT: store float* %[[B:.+]], float** %[[B_ADDR]], align 8
+// CHECK-NEXT: store float* %[[C:.+]], float** %[[C_ADDR]], align 8
+// CHECK-NEXT: store float* %[[D:.+]], float** %[[D_ADDR]], align 8
+// CHECK-NEXT: store float* %[[E:.+]], float** %[[E_ADDR]], align 8
+// CHECK-NEXT: store float %[[OFFSET:.+]], float* %[[OFFSET_ADDR]], align 4
+// CHECK-NEXT: %[[TMP0:.+]] = load i32, i32* %[[M_ADDR]], align 4
+// CHECK-NEXT: store i32 %[[TMP0]], i32* %[[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: store i32 0, i32* %[[J]], align 4
+// CHECK-NEXT: %[[TMP1_1:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: %[[SUB:.+]] = sub nsw i32 %[[TMP1_1]], 0
+// CHECK-NEXT: %[[DIV:.+]] = sdiv i32 %[[SUB]], 1
+// CHECK-NEXT: %[[CONV:.+]] = sext i32 %[[DIV]] to i64
+// CHECK-NEXT: %[[MUL:.+]] = mul nsw i64 %[[CONV]], 4
+// CHECK-NEXT: %[[SUB3:.+]] = sub nsw i64 %[[MUL]], 1
+// CHECK-NEXT: store i64 %[[SUB3]], i64* %[[DOTCAPTURE_EXPR_2]], align 8
+// CHECK-NEXT: store i32 0, i32* %[[I]], align 4
+// CHECK-NEXT: store i32 0, i32* %[[DOTUNROLLED_IV_J]], align 4
+// CHECK-NEXT: %[[TMP2:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: %[[CMP:.+]] = icmp slt i32 0, %[[TMP2]]
+// CHECK-NEXT: br i1 %[[CMP]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_PRECOND_THEN]]:
+// CHECK-NEXT: store i64 0, i64* %[[DOTOMP_LB]], align 8
+// CHECK-NEXT: %[[TMP3:.+]] = load i64, i64* %[[DOTCAPTURE_EXPR_2]], align 8
+// CHECK-NEXT: store i64 %[[TMP3]], i64* %[[DOTOMP_UB]], align 8
+// CHECK-NEXT: store i64 1, i64* %[[DOTOMP_STRIDE]], align 8
+// CHECK-NEXT: store i32 0, i32* %[[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @3)
+// CHECK-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* %[[DOTOMP_IS_LAST]], i64* %[[DOTOMP_LB]], i64* %[[DOTOMP_UB]], i64* %[[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK-NEXT: %[[TMP4:.+]] = load i64, i64* %[[DOTOMP_UB]], align 8
+// CHECK-NEXT: %[[TMP5:.+]] = load i64, i64* %[[DOTCAPTURE_EXPR_2]], align 8
+// CHECK-NEXT: %[[CMP8:.+]] = icmp sgt i64 %[[TMP4]], %[[TMP5]]
+// CHECK-NEXT: br i1 %[[CMP8]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_TRUE]]:
+// CHECK-NEXT: %[[TMP6:.+]] = load i64, i64* %[[DOTCAPTURE_EXPR_2]], align 8
+// CHECK-NEXT: br label %[[COND_END:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_FALSE]]:
+// CHECK-NEXT: %[[TMP7:.+]] = load i64, i64* %[[DOTOMP_UB]], align 8
+// CHECK-NEXT: br label %[[COND_END]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[COND_END]]:
+// CHECK-NEXT: %[[COND:.+]] = phi i64 [ %[[TMP6]], %[[COND_TRUE]] ], [ %[[TMP7]], %[[COND_FALSE]] ]
+// CHECK-NEXT: store i64 %[[COND]], i64* %[[DOTOMP_UB]], align 8
+// CHECK-NEXT: %[[TMP8:.+]] = load i64, i64* %[[DOTOMP_LB]], align 8
+// CHECK-NEXT: store i64 %[[TMP8]], i64* %[[DOTOMP_IV]], align 8
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_INNER_FOR_COND]]:
+// CHECK-NEXT: %[[TMP9:.+]] = load i64, i64* %[[DOTOMP_IV]], align 8
+// CHECK-NEXT: %[[TMP10:.+]] = load i64, i64* %[[DOTOMP_UB]], align 8
+// CHECK-NEXT: %[[CMP10:.+]] = icmp sle i64 %[[TMP9]], %[[TMP10]]
+// CHECK-NEXT: br i1 %[[CMP10]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_INNER_FOR_BODY]]:
+// CHECK-NEXT: %[[TMP11:.+]] = load i64, i64* %[[DOTOMP_IV]], align 8
+// CHECK-NEXT: %[[DIV12:.+]] = sdiv i64 %[[TMP11]], 4
+// CHECK-NEXT: %[[MUL13:.+]] = mul nsw i64 %[[DIV12]], 1
+// CHECK-NEXT: %[[ADD:.+]] = add nsw i64 0, %[[MUL13]]
+// CHECK-NEXT: %[[CONV14:.+]] = trunc i64 %[[ADD]] to i32
+// CHECK-NEXT: store i32 %[[CONV14]], i32* %[[I6]], align 4
+// CHECK-NEXT: %[[TMP12:.+]] = load i64, i64* %[[DOTOMP_IV]], align 8
+// CHECK-NEXT: %[[TMP13:.+]] = load i64, i64* %[[DOTOMP_IV]], align 8
+// CHECK-NEXT: %[[DIV15:.+]] = sdiv i64 %[[TMP13]], 4
+// CHECK-NEXT: %[[MUL16:.+]] = mul nsw i64 %[[DIV15]], 4
+// CHECK-NEXT: %[[SUB17:.+]] = sub nsw i64 %[[TMP12]], %[[MUL16]]
+// CHECK-NEXT: %[[MUL18:.+]] = mul nsw i64 %[[SUB17]], 2
+// CHECK-NEXT: %[[ADD19:.+]] = add nsw i64 0, %[[MUL18]]
+// CHECK-NEXT: %[[CONV20:.+]] = trunc i64 %[[ADD19]] to i32
+// CHECK-NEXT: store i32 %[[CONV20]], i32* %[[DOTUNROLLED_IV_J7]], align 4
+// CHECK-NEXT: %[[TMP14:.+]] = load i32, i32* %[[DOTUNROLLED_IV_J7]], align 4
+// CHECK-NEXT: store i32 %[[TMP14]], i32* %[[DOTUNROLL_INNER_IV_J]], align 4
+// CHECK-NEXT: br label %[[FOR_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[FOR_COND]]:
+// CHECK-NEXT: %[[TMP15:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_J]], align 4
+// CHECK-NEXT: %[[TMP16:.+]] = load i32, i32* %[[DOTUNROLLED_IV_J7]], align 4
+// CHECK-NEXT: %[[ADD21:.+]] = add nsw i32 %[[TMP16]], 2
+// CHECK-NEXT: %[[CMP22:.+]] = icmp sle i32 %[[TMP15]], %[[ADD21]]
+// CHECK-NEXT: br i1 %[[CMP22]], label %[[LAND_RHS:.+]], label %[[LAND_END:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[LAND_RHS]]:
+// CHECK-NEXT: %[[TMP17:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_J]], align 4
+// CHECK-NEXT: %[[CMP24:.+]] = icmp sle i32 %[[TMP17]], 8
+// CHECK-NEXT: br label %[[LAND_END]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[LAND_END]]:
+// CHECK-NEXT: %[[TMP18:.+]] = phi i1 [ false, %[[FOR_COND]] ], [ %[[CMP24]], %[[LAND_RHS]] ]
+// CHECK-NEXT: br i1 %[[TMP18]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[FOR_BODY]]:
+// CHECK-NEXT: %[[TMP19:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_J]], align 4
+// CHECK-NEXT: %[[MUL26:.+]] = mul nsw i32 %[[TMP19]], 1
+// CHECK-NEXT: %[[ADD27:.+]] = add nsw i32 0, %[[MUL26]]
+// CHECK-NEXT: store i32 %[[ADD27]], i32* %[[J]], align 4
+// CHECK-NEXT: %[[TMP20:.+]] = load float*, float** %[[B_ADDR]], align 8
+// CHECK-NEXT: %[[TMP21:.+]] = load i32, i32* %[[I6]], align 4
+// CHECK-NEXT: %[[IDXPROM:.+]] = sext i32 %[[TMP21]] to i64
+// CHECK-NEXT: %[[ARRAYIDX:.+]] = getelementptr inbounds float, float* %[[TMP20]], i64 %[[IDXPROM]]
+// CHECK-NEXT: %[[TMP22:.+]] = load float, float* %[[ARRAYIDX]], align 4
+// CHECK-NEXT: %[[CONV28:.+]] = fpext float %[[TMP22]] to double
+// CHECK-NEXT: %[[CALL:.+]] = call double @sind(double noundef %[[CONV28]])
+// CHECK-NEXT: %[[TMP23:.+]] = load float*, float** %[[C_ADDR]], align 8
+// CHECK-NEXT: %[[TMP24:.+]] = load i32, i32* %[[I6]], align 4
+// CHECK-NEXT: %[[IDXPROM29:.+]] = sext i32 %[[TMP24]] to i64
+// CHECK-NEXT: %[[ARRAYIDX30:.+]] = getelementptr inbounds float, float* %[[TMP23]], i64 %[[IDXPROM29]]
+// CHECK-NEXT: %[[TMP25:.+]] = load float, float* %[[ARRAYIDX30]], align 4
+// CHECK-NEXT: %[[CONV31:.+]] = fpext float %[[TMP25]] to double
+// CHECK-NEXT: %[[MUL32:.+]] = fmul double %[[CALL]], %[[CONV31]]
+// CHECK-NEXT: %[[TMP26:.+]] = load float*, float** %[[D_ADDR]], align 8
+// CHECK-NEXT: %[[TMP27:.+]] = load i32, i32* %[[I6]], align 4
+// CHECK-NEXT: %[[IDXPROM33:.+]] = sext i32 %[[TMP27]] to i64
+// CHECK-NEXT: %[[ARRAYIDX34:.+]] = getelementptr inbounds float, float* %[[TMP26]], i64 %[[IDXPROM33]]
+// CHECK-NEXT: %[[TMP28:.+]] = load float, float* %[[ARRAYIDX34]], align 4
+// CHECK-NEXT: %[[CONV35:.+]] = fpext float %[[TMP28]] to double
+// CHECK-NEXT: %[[MUL36:.+]] = fmul double %[[MUL32]], %[[CONV35]]
+// CHECK-NEXT: %[[TMP29:.+]] = load float*, float** %[[E_ADDR]], align 8
+// CHECK-NEXT: %[[TMP30:.+]] = load i32, i32* %[[I6]], align 4
+// CHECK-NEXT: %[[IDXPROM37:.+]] = sext i32 %[[TMP30]] to i64
+// CHECK-NEXT: %[[ARRAYIDX38:.+]] = getelementptr inbounds float, float* %[[TMP29]], i64 %[[IDXPROM37]]
+// CHECK-NEXT: %[[TMP31:.+]] = load float, float* %[[ARRAYIDX38]], align 4
+// CHECK-NEXT: %[[CONV39:.+]] = fpext float %[[TMP31]] to double
+// CHECK-NEXT: %[[MUL40:.+]] = fmul double %[[MUL36]], %[[CONV39]]
+// CHECK-NEXT: %[[TMP32:.+]] = load float, float* %[[OFFSET_ADDR]], align 4
+// CHECK-NEXT: %[[CONV41:.+]] = fpext float %[[TMP32]] to double
+// CHECK-NEXT: %[[ADD42:.+]] = fadd double %[[MUL40]], %[[CONV41]]
+// CHECK-NEXT: %[[TMP33:.+]] = load float*, float** %[[A_ADDR]], align 8
+// CHECK-NEXT: %[[TMP34:.+]] = load i32, i32* %[[I6]], align 4
+// CHECK-NEXT: %[[IDXPROM43:.+]] = sext i32 %[[TMP34]] to i64
+// CHECK-NEXT: %[[ARRAYIDX44:.+]] = getelementptr inbounds float, float* %[[TMP33]], i64 %[[IDXPROM43]]
+// CHECK-NEXT: %[[TMP35:.+]] = load float, float* %[[ARRAYIDX44]], align 4
+// CHECK-NEXT: %[[CONV45:.+]] = fpext float %[[TMP35]] to double
+// CHECK-NEXT: %[[ADD46:.+]] = fadd double %[[CONV45]], %[[ADD42]]
+// CHECK-NEXT: %[[CONV47:.+]] = fptrunc double %[[ADD46]] to float
+// CHECK-NEXT: store float %[[CONV47]], float* %[[ARRAYIDX44]], align 4
+// CHECK-NEXT: br label %[[FOR_INC:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[FOR_INC]]:
+// CHECK-NEXT: %[[TMP36:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_J]], align 4
+// CHECK-NEXT: %[[INC:.+]] = add nsw i32 %[[TMP36]], 1
+// CHECK-NEXT: store i32 %[[INC]], i32* %[[DOTUNROLL_INNER_IV_J]], align 4
+// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP3:[0-9]+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[FOR_END]]:
+// CHECK-NEXT: br label %[[OMP_BODY_CONTINUE:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_BODY_CONTINUE]]:
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_INC:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_INNER_FOR_INC]]:
+// CHECK-NEXT: %[[TMP37:.+]] = load i64, i64* %[[DOTOMP_IV]], align 8
+// CHECK-NEXT: %[[ADD48:.+]] = add nsw i64 %[[TMP37]], 1
+// CHECK-NEXT: store i64 %[[ADD48]], i64* %[[DOTOMP_IV]], align 8
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_INNER_FOR_END]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_EXIT]]:
+// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM49:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5)
+// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM49]])
+// CHECK-NEXT: br label %[[OMP_PRECOND_END]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_PRECOND_END]]:
+// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM50:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @7)
+// CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* @6, i32 %[[OMP_GLOBAL_THREAD_NUM50]])
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
void unroll_partial_heuristic_for(int m, float *a, float *b, float *c, float *d, float *e, float offset) {
#pragma omp for collapse(2)
@@ -21,183 +217,11 @@ void unroll_partial_heuristic_for(int m, float *a, float *b, float *c, float *d,
}
#endif // HEADER
-
-// CHECK-LABEL: define {{[^@]+}}@unroll_partial_heuristic_for
-// CHECK-SAME: (i32 noundef [[M:%.*]], float* noundef [[A:%.*]], float* noundef [[B:%.*]], float* noundef [[C:%.*]], float* noundef [[D:%.*]], float* noundef [[E:%.*]], float noundef [[OFFSET:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[M_ADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float*, align 8
-// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float*, align 8
-// CHECK-NEXT: [[C_ADDR:%.*]] = alloca float*, align 8
-// CHECK-NEXT: [[D_ADDR:%.*]] = alloca float*, align 8
-// CHECK-NEXT: [[E_ADDR:%.*]] = alloca float*, align 8
-// CHECK-NEXT: [[OFFSET_ADDR:%.*]] = alloca float, align 4
-// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
-// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK-NEXT: [[TMP1:%.*]] = alloca i32, align 4
-// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK-NEXT: [[J:%.*]] = alloca i32, align 4
-// CHECK-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i64, align 8
-// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK-NEXT: [[DOTUNROLLED_IV_J:%.*]] = alloca i32, align 4
-// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
-// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
-// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
-// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK-NEXT: [[I6:%.*]] = alloca i32, align 4
-// CHECK-NEXT: [[DOTUNROLLED_IV_J7:%.*]] = alloca i32, align 4
-// CHECK-NEXT: [[DOTUNROLL_INNER_IV_J:%.*]] = alloca i32, align 4
-// CHECK-NEXT: store i32 [[M]], i32* [[M_ADDR]], align 4
-// CHECK-NEXT: store float* [[A]], float** [[A_ADDR]], align 8
-// CHECK-NEXT: store float* [[B]], float** [[B_ADDR]], align 8
-// CHECK-NEXT: store float* [[C]], float** [[C_ADDR]], align 8
-// CHECK-NEXT: store float* [[D]], float** [[D_ADDR]], align 8
-// CHECK-NEXT: store float* [[E]], float** [[E_ADDR]], align 8
-// CHECK-NEXT: store float [[OFFSET]], float* [[OFFSET_ADDR]], align 4
-// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[M_ADDR]], align 4
-// CHECK-NEXT: store i32 [[TMP0]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK-NEXT: store i32 0, i32* [[J]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
-// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
-// CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], 4
-// CHECK-NEXT: [[SUB3:%.*]] = sub nsw i64 [[MUL]], 1
-// CHECK-NEXT: store i64 [[SUB3]], i64* [[DOTCAPTURE_EXPR_2]], align 8
-// CHECK-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK-NEXT: store i32 0, i32* [[DOTUNROLLED_IV_J]], align 4
-// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
-// CHECK-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK: omp.precond.then:
-// CHECK-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
-// CHECK-NEXT: store i64 [[TMP3]], i64* [[DOTOMP_UB]], align 8
-// CHECK-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
-// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
-// CHECK-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1)
-// CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
-// CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i64 [[TMP4]], [[TMP5]]
-// CHECK-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK: cond.true:
-// CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_2]], align 8
-// CHECK-NEXT: br label [[COND_END:%.*]]
-// CHECK: cond.false:
-// CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8
-// CHECK-NEXT: br label [[COND_END]]
-// CHECK: cond.end:
-// CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[TMP6]], [[COND_TRUE]] ], [ [[TMP7]], [[COND_FALSE]] ]
-// CHECK-NEXT: store i64 [[COND]], i64* [[DOTOMP_UB]], align 8
-// CHECK-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
-// CHECK-NEXT: store i64 [[TMP8]], i64* [[DOTOMP_IV]], align 8
-// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK: omp.inner.for.cond:
-// CHECK-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_UB]], align 8
-// CHECK-NEXT: [[CMP10:%.*]] = icmp sle i64 [[TMP9]], [[TMP10]]
-// CHECK-NEXT: br i1 [[CMP10]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK: omp.inner.for.body:
-// CHECK-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK-NEXT: [[DIV12:%.*]] = sdiv i64 [[TMP11]], 4
-// CHECK-NEXT: [[MUL13:%.*]] = mul nsw i64 [[DIV12]], 1
-// CHECK-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL13]]
-// CHECK-NEXT: [[CONV14:%.*]] = trunc i64 [[ADD]] to i32
-// CHECK-NEXT: store i32 [[CONV14]], i32* [[I6]], align 4
-// CHECK-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK-NEXT: [[DIV15:%.*]] = sdiv i64 [[TMP13]], 4
-// CHECK-NEXT: [[MUL16:%.*]] = mul nsw i64 [[DIV15]], 4
-// CHECK-NEXT: [[SUB17:%.*]] = sub nsw i64 [[TMP12]], [[MUL16]]
-// CHECK-NEXT: [[MUL18:%.*]] = mul nsw i64 [[SUB17]], 2
-// CHECK-NEXT: [[ADD19:%.*]] = add nsw i64 0, [[MUL18]]
-// CHECK-NEXT: [[CONV20:%.*]] = trunc i64 [[ADD19]] to i32
-// CHECK-NEXT: store i32 [[CONV20]], i32* [[DOTUNROLLED_IV_J7]], align 4
-// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTUNROLLED_IV_J7]], align 4
-// CHECK-NEXT: store i32 [[TMP14]], i32* [[DOTUNROLL_INNER_IV_J]], align 4
-// CHECK-NEXT: br label [[FOR_COND:%.*]]
-// CHECK: for.cond:
-// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTUNROLL_INNER_IV_J]], align 4
-// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTUNROLLED_IV_J7]], align 4
-// CHECK-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP16]], 2
-// CHECK-NEXT: [[CMP22:%.*]] = icmp sle i32 [[TMP15]], [[ADD21]]
-// CHECK-NEXT: br i1 [[CMP22]], label [[LAND_RHS:%.*]], label [[LAND_END:%.*]]
-// CHECK: land.rhs:
-// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTUNROLL_INNER_IV_J]], align 4
-// CHECK-NEXT: [[CMP24:%.*]] = icmp sle i32 [[TMP17]], 8
-// CHECK-NEXT: br label [[LAND_END]]
-// CHECK: land.end:
-// CHECK-NEXT: [[TMP18:%.*]] = phi i1 [ false, [[FOR_COND]] ], [ [[CMP24]], [[LAND_RHS]] ]
-// CHECK-NEXT: br i1 [[TMP18]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-// CHECK: for.body:
-// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTUNROLL_INNER_IV_J]], align 4
-// CHECK-NEXT: [[MUL26:%.*]] = mul nsw i32 [[TMP19]], 1
-// CHECK-NEXT: [[ADD27:%.*]] = add nsw i32 0, [[MUL26]]
-// CHECK-NEXT: store i32 [[ADD27]], i32* [[J]], align 4
-// CHECK-NEXT: [[TMP20:%.*]] = load float*, float** [[B_ADDR]], align 8
-// CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[I6]], align 4
-// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP21]] to i64
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 [[IDXPROM]]
-// CHECK-NEXT: [[TMP22:%.*]] = load float, float* [[ARRAYIDX]], align 4
-// CHECK-NEXT: [[CONV28:%.*]] = fpext float [[TMP22]] to double
-// CHECK-NEXT: [[CALL:%.*]] = call double @sind(double noundef [[CONV28]])
-// CHECK-NEXT: [[TMP23:%.*]] = load float*, float** [[C_ADDR]], align 8
-// CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[I6]], align 4
-// CHECK-NEXT: [[IDXPROM29:%.*]] = sext i32 [[TMP24]] to i64
-// CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, float* [[TMP23]], i64 [[IDXPROM29]]
-// CHECK-NEXT: [[TMP25:%.*]] = load float, float* [[ARRAYIDX30]], align 4
-// CHECK-NEXT: [[CONV31:%.*]] = fpext float [[TMP25]] to double
-// CHECK-NEXT: [[MUL32:%.*]] = fmul double [[CALL]], [[CONV31]]
-// CHECK-NEXT: [[TMP26:%.*]] = load float*, float** [[D_ADDR]], align 8
-// CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[I6]], align 4
-// CHECK-NEXT: [[IDXPROM33:%.*]] = sext i32 [[TMP27]] to i64
-// CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds float, float* [[TMP26]], i64 [[IDXPROM33]]
-// CHECK-NEXT: [[TMP28:%.*]] = load float, float* [[ARRAYIDX34]], align 4
-// CHECK-NEXT: [[CONV35:%.*]] = fpext float [[TMP28]] to double
-// CHECK-NEXT: [[MUL36:%.*]] = fmul double [[MUL32]], [[CONV35]]
-// CHECK-NEXT: [[TMP29:%.*]] = load float*, float** [[E_ADDR]], align 8
-// CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* [[I6]], align 4
-// CHECK-NEXT: [[IDXPROM37:%.*]] = sext i32 [[TMP30]] to i64
-// CHECK-NEXT: [[ARRAYIDX38:%.*]] = getelementptr inbounds float, float* [[TMP29]], i64 [[IDXPROM37]]
-// CHECK-NEXT: [[TMP31:%.*]] = load float, float* [[ARRAYIDX38]], align 4
-// CHECK-NEXT: [[CONV39:%.*]] = fpext float [[TMP31]] to double
-// CHECK-NEXT: [[MUL40:%.*]] = fmul double [[MUL36]], [[CONV39]]
-// CHECK-NEXT: [[TMP32:%.*]] = load float, float* [[OFFSET_ADDR]], align 4
-// CHECK-NEXT: [[CONV41:%.*]] = fpext float [[TMP32]] to double
-// CHECK-NEXT: [[ADD42:%.*]] = fadd double [[MUL40]], [[CONV41]]
-// CHECK-NEXT: [[TMP33:%.*]] = load float*, float** [[A_ADDR]], align 8
-// CHECK-NEXT: [[TMP34:%.*]] = load i32, i32* [[I6]], align 4
-// CHECK-NEXT: [[IDXPROM43:%.*]] = sext i32 [[TMP34]] to i64
-// CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, float* [[TMP33]], i64 [[IDXPROM43]]
-// CHECK-NEXT: [[TMP35:%.*]] = load float, float* [[ARRAYIDX44]], align 4
-// CHECK-NEXT: [[CONV45:%.*]] = fpext float [[TMP35]] to double
-// CHECK-NEXT: [[ADD46:%.*]] = fadd double [[CONV45]], [[ADD42]]
-// CHECK-NEXT: [[CONV47:%.*]] = fptrunc double [[ADD46]] to float
-// CHECK-NEXT: store float [[CONV47]], float* [[ARRAYIDX44]], align 4
-// CHECK-NEXT: br label [[FOR_INC:%.*]]
-// CHECK: for.inc:
-// CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTUNROLL_INNER_IV_J]], align 4
-// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP36]], 1
-// CHECK-NEXT: store i32 [[INC]], i32* [[DOTUNROLL_INNER_IV_J]], align 4
-// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
-// CHECK: for.end:
-// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK: omp.body.continue:
-// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK: omp.inner.for.inc:
-// CHECK-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK-NEXT: [[ADD48:%.*]] = add nsw i64 [[TMP37]], 1
-// CHECK-NEXT: store i64 [[ADD48]], i64* [[DOTOMP_IV]], align 8
-// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK: omp.inner.for.end:
-// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK: omp.loop.exit:
-// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM49:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB5:[0-9]+]])
-// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM49]])
-// CHECK-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK: omp.precond.end:
-// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM50:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7:[0-9]+]])
-// CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB6:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM50]])
-// CHECK-NEXT: ret void
//
+
+// CHECK: ![[META0:[0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: ![[META1:[0-9]+]] = !{i32 7, !"openmp", i32 51}
+// CHECK: ![[META2:[0-9]+]] =
+// CHECK: ![[LOOP3]] = distinct !{![[LOOP3]], ![[LOOPPROP4:[0-9]+]], ![[LOOPPROP5:[0-9]+]]}
+// CHECK: ![[LOOPPROP4]] = !{!"llvm.loop.mustprogress"}
+// CHECK: ![[LOOPPROP5]] = !{!"llvm.loop.unroll.count", i32 2}
diff --git a/clang/test/OpenMP/irbuilder_unroll_partial_heuristic_runtime_for.c b/clang/test/OpenMP/irbuilder_unroll_partial_heuristic_runtime_for.c
index f17c52ee68d10..e81c6d2ec2e61 100644
--- a/clang/test/OpenMP/irbuilder_unroll_partial_heuristic_runtime_for.c
+++ b/clang/test/OpenMP/irbuilder_unroll_partial_heuristic_runtime_for.c
@@ -59,7 +59,7 @@ double sind(double);
// CHECK-NEXT: store i32 %[[TMP8]], i32* %[[P_UPPERBOUND]], align 4
// CHECK-NEXT: store i32 1, i32* %[[P_STRIDE]], align 4
// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
-// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* %[[P_LASTITER]], i32* %[[P_LOWERBOUND]], i32* %[[P_UPPERBOUND]], i32* %[[P_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* %[[P_LASTITER]], i32* %[[P_LOWERBOUND]], i32* %[[P_UPPERBOUND]], i32* %[[P_STRIDE]], i32 1, i32 0)
// CHECK-NEXT: %[[TMP9:.+]] = load i32, i32* %[[P_LOWERBOUND]], align 4
// CHECK-NEXT: %[[TMP10:.+]] = load i32, i32* %[[P_UPPERBOUND]], align 4
// CHECK-NEXT: %[[TMP11:.+]] = sub i32 %[[TMP10]], %[[TMP9]]
diff --git a/clang/test/OpenMP/irbuilder_unroll_unroll_partial_factor.c b/clang/test/OpenMP/irbuilder_unroll_unroll_partial_factor.c
index 09f64f86915bc..bfff2f7e0abe7 100644
--- a/clang/test/OpenMP/irbuilder_unroll_unroll_partial_factor.c
+++ b/clang/test/OpenMP/irbuilder_unroll_unroll_partial_factor.c
@@ -47,7 +47,7 @@
// CHECK-NEXT: store i32 %[[TMP7]], i32* %[[P_UPPERBOUND]], align 4
// CHECK-NEXT: store i32 1, i32* %[[P_STRIDE]], align 4
// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
-// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* %[[P_LASTITER]], i32* %[[P_LOWERBOUND]], i32* %[[P_UPPERBOUND]], i32* %[[P_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* %[[P_LASTITER]], i32* %[[P_LOWERBOUND]], i32* %[[P_UPPERBOUND]], i32* %[[P_STRIDE]], i32 1, i32 0)
// CHECK-NEXT: %[[TMP8:.+]] = load i32, i32* %[[P_LOWERBOUND]], align 4
// CHECK-NEXT: %[[TMP9:.+]] = load i32, i32* %[[P_UPPERBOUND]], align 4
// CHECK-NEXT: %[[TMP10:.+]] = sub i32 %[[TMP9]], %[[TMP8]]
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index d90d4e807e0df..8c85d9818cb4f 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -353,14 +353,6 @@ class OpenMPIRBuilder {
/// the current thread, updates the relevant instructions in the canonical
/// loop and calls to an OpenMP runtime finalization function after the loop.
///
- /// TODO: Workshare loops with static scheduling may contain up to two loops
- /// that fulfill the requirements of an OpenMP canonical loop. One for
- /// iterating over all iterations of a chunk and another one for iterating
- /// over all chunks that are executed on the same thread. Returning
- /// CanonicalLoopInfo objects representing them may eventually be useful for
- /// the apply clause planned in OpenMP 6.0, but currently whether these are
- /// canonical loops is irrelevant.
- ///
/// \param DL Debug location for instructions added for the
/// workshare-loop construct itself.
/// \param CLI A descriptor of the canonical loop to workshare.
@@ -368,14 +360,30 @@ class OpenMPIRBuilder {
/// preheader of the loop.
/// \param NeedsBarrier Indicates whether a barrier must be inserted after
/// the loop.
- /// \param Chunk The size of loop chunk considered as a unit when
- /// scheduling. If \p nullptr, defaults to 1.
///
/// \returns Point where to insert code after the workshare construct.
InsertPointTy applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
InsertPointTy AllocaIP,
- bool NeedsBarrier,
- Value *Chunk = nullptr);
+ bool NeedsBarrier);
+
+ /// Modifies the canonical loop a statically-scheduled workshare loop with a
+ /// user-specified chunk size.
+ ///
+ /// \param DL Debug location for instructions added for the
+ /// workshare-loop construct itself.
+ /// \param CLI A descriptor of the canonical loop to workshare.
+ /// \param AllocaIP An insertion point for Alloca instructions usable in
+ /// the preheader of the loop.
+ /// \param NeedsBarrier Indicates whether a barrier must be inserted after the
+ /// loop.
+ /// \param ChunkSize The user-specified chunk size.
+ ///
+ /// \returns Point where to insert code after the workshare construct.
+ InsertPointTy applyStaticChunkedWorkshareLoop(DebugLoc DL,
+ CanonicalLoopInfo *CLI,
+ InsertPointTy AllocaIP,
+ bool NeedsBarrier,
+ Value *ChunkSize);
/// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
///
@@ -412,6 +420,10 @@ class OpenMPIRBuilder {
/// the current thread, updates the relevant instructions in the canonical
/// loop and calls to an OpenMP runtime finalization function after the loop.
///
+ /// The concrete transformation is done by applyStaticWorkshareLoop,
+ /// applyStaticChunkedWorkshareLoop, or applyDynamicWorkshareLoop, depending
+ /// on the value of \p SchedKind and \p ChunkSize.
+ ///
/// \param DL Debug location for instructions added for the
/// workshare-loop construct itself.
/// \param CLI A descriptor of the canonical loop to workshare.
@@ -419,10 +431,15 @@ class OpenMPIRBuilder {
/// preheader of the loop.
/// \param NeedsBarrier Indicates whether a barrier must be insterted after
/// the loop.
+ /// \param SchedKind Scheduling algorithm to use.
+ /// \param ChunkSize The chunk size for the inner loop.
///
/// \returns Point where to insert code after the workshare construct.
- InsertPointTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
- InsertPointTy AllocaIP, bool NeedsBarrier);
+ InsertPointTy applyWorkshareLoop(
+ DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
+ bool NeedsBarrier,
+ llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default,
+ Value *ChunkSize = nullptr);
/// Tile a loop nest.
///
@@ -1517,6 +1534,27 @@ class CanonicalLoopInfo {
/// Re-evaluated whether this makes sense.
void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);
+ /// Sets the number of loop iterations to the given value. This value must be
+ /// valid in the condition block (i.e., defined in the preheader) and is
+ /// interpreted as an unsigned integer.
+ void setTripCount(Value *TripCount);
+
+ /// Replace all uses of the canonical induction variable in the loop body with
+ /// a new one.
+ ///
+ /// The intended use case is to update the induction variable for an updated
+ /// iteration space such that it can stay normalized in the 0...tripcount-1
+ /// range.
+ ///
+ /// The \p Updater is called with the (presumable updated) current normalized
+ /// induction variable and is expected to return the value that uses of the
+ /// pre-updated induction values should use instead, typically dependent on
+ /// the new induction variable. This is a lambda (instead of e.g. just passing
+ /// the new value) to be able to distinguish the uses of the pre-updated
+ /// induction variable and uses of the induction varible to compute the
+ /// updated induction variable value.
+ void mapIndVar(llvm::function_ref<Value *(Instruction *)> Updater);
+
public:
/// Returns whether this object currently represents the IR of a loop. If
/// returning false, it may have been consumed by a loop transformation or not
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 293f956b6bce4..cf844f3477494 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -74,6 +74,106 @@ static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
}
#endif
+/// Make \p Source branch to \p Target.
+///
+/// Handles two situations:
+/// * \p Source already has an unconditional branch.
+/// * \p Source is a degenerate block (no terminator because the BB is
+/// the current head of the IR construction).
+static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
+ if (Instruction *Term = Source->getTerminator()) {
+ auto *Br = cast<BranchInst>(Term);
+ assert(!Br->isConditional() &&
+ "BB's terminator must be an unconditional branch (or degenerate)");
+ BasicBlock *Succ = Br->getSuccessor(0);
+ Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
+ Br->setSuccessor(0, Target);
+ return;
+ }
+
+ auto *NewBr = BranchInst::Create(Target, Source);
+ NewBr->setDebugLoc(DL);
+}
+
+/// Move the instruction after an InsertPoint to the beginning of another
+/// BasicBlock.
+///
+/// The instructions after \p IP are moved to the beginning of \p New which must
+/// not have any PHINodes. If \p CreateBranch is true, a branch instruction to
+/// \p New will be added such that there is no semantic change. Otherwise, the
+/// \p IP insert block remains degenerate and it is up to the caller to insert a
+/// terminator.
+static void spliceBB(OpenMPIRBuilder::InsertPointTy IP, BasicBlock *New,
+ bool CreateBranch) {
+ assert(New->getFirstInsertionPt() == New->begin() &&
+ "Target BB must not have PHI nodes");
+
+ // Move instructions to new block.
+ BasicBlock *Old = IP.getBlock();
+ New->getInstList().splice(New->begin(), Old->getInstList(), IP.getPoint(),
+ Old->end());
+
+ if (CreateBranch)
+ BranchInst::Create(New, Old);
+}
+
+/// Splice a BasicBlock at an IRBuilder's current insertion point. Its new
+/// insert location will stick to after the instruction before the insertion
+/// point (instead of moving with the instruction the InsertPoint stores
+/// internally).
+static void spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
+ DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
+ BasicBlock *Old = Builder.GetInsertBlock();
+
+ spliceBB(Builder.saveIP(), New, CreateBranch);
+ if (CreateBranch)
+ Builder.SetInsertPoint(Old->getTerminator());
+ else
+ Builder.SetInsertPoint(Old);
+
+ // SetInsertPoint also updates the Builder's debug location, but we want to
+ // keep the one the Builder was configured to use.
+ Builder.SetCurrentDebugLocation(DebugLoc);
+}
+
+/// Split a BasicBlock at an InsertPoint, even if the block is degenerate
+/// (missing the terminator).
+///
+/// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed
+/// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch
+/// is true, a branch to the new successor will new created such that
+/// semantically there is no change; otherwise the block of the insertion point
+/// remains degenerate and it is the caller's responsibility to insert a
+/// terminator. Returns the new successor block.
+static BasicBlock *splitBB(OpenMPIRBuilder::InsertPointTy IP, bool CreateBranch,
+ llvm::Twine Name = {}) {
+ BasicBlock *Old = IP.getBlock();
+ BasicBlock *New = BasicBlock::Create(
+ Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
+ Old->getParent(), Old->getNextNode());
+ spliceBB(IP, New, CreateBranch);
+ New->replaceSuccessorsPhiUsesWith(Old, New);
+ return New;
+}
+
+/// Split a BasicBlock at \p Builder's insertion point, even if the block is
+/// degenerate (missing the terminator). Its new insert location will stick to
+/// after the instruction before the insertion point (instead of moving with the
+/// instruction the InsertPoint stores internally).
+static BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch,
+ llvm::Twine Name = {}) {
+ DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
+ BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
+ if (CreateBranch)
+ Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
+ else
+ Builder.SetInsertPoint(Builder.GetInsertBlock());
+ // SetInsertPoint also updates the Builder's debug location, but we want to
+ // keep the one the Builder was configured to use.
+ Builder.SetCurrentDebugLocation(DebugLoc);
+ return New;
+}
+
void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
LLVMContext &Ctx = Fn.getContext();
@@ -1402,10 +1502,8 @@ OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
// Split the loop at the insertion point: Branch to the preheader and move
// every following instruction to after the loop (the After BB). Also, the
// new successor is the loop's after block.
+ spliceBB(Builder, After, /*CreateBranch=*/false);
Builder.CreateBr(CL->getPreheader());
- After->getInstList().splice(After->begin(), BB->getInstList(),
- Builder.GetInsertPoint(), BB->end());
- After->replaceSuccessorsPhiUsesWith(BB, After);
}
// Emit the body content. We do it after connecting the loop to the CFG to
@@ -1506,20 +1604,10 @@ static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}
-// Sets the number of loop iterations to the given value. This value must be
-// valid in the condition block (i.e., defined in the preheader) and is
-// interpreted as an unsigned integer.
-void setCanonicalLoopTripCount(CanonicalLoopInfo *CLI, Value *TripCount) {
- Instruction *CmpI = &CLI->getCond()->front();
- assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
- CmpI->setOperand(1, TripCount);
- CLI->assertOK();
-}
-
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
InsertPointTy AllocaIP,
- bool NeedsBarrier, Value *Chunk) {
+ bool NeedsBarrier) {
assert(CLI->isValid() && "Requires a valid canonical loop");
assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
"Require dedicated allocate IP");
@@ -1559,10 +1647,6 @@ OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
Builder.CreateStore(UpperBound, PUpperBound);
Builder.CreateStore(One, PStride);
- // FIXME: schedule(static) is NOT the same as schedule(static,1)
- if (!Chunk)
- Chunk = One;
-
Value *ThreadNum = getOrCreateThreadID(SrcLoc);
Constant *SchedulingType =
@@ -1572,25 +1656,22 @@ OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
// value it produced.
Builder.CreateCall(StaticInit,
{SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
- PUpperBound, PStride, One, Chunk});
+ PUpperBound, PStride, One, Zero});
Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
- setCanonicalLoopTripCount(CLI, TripCount);
+ CLI->setTripCount(TripCount);
// Update all uses of the induction variable except the one in the condition
// block that compares it with the actual upper bound, and the increment in
// the latch block.
- // TODO: this can eventually move to CanonicalLoopInfo or to a new
- // CanonicalLoopInfoUpdater interface.
- Builder.SetInsertPoint(CLI->getBody(), CLI->getBody()->getFirstInsertionPt());
- Value *UpdatedIV = Builder.CreateAdd(IV, LowerBound);
- IV->replaceUsesWithIf(UpdatedIV, [&](Use &U) {
- auto *Instr = dyn_cast<Instruction>(U.getUser());
- return !Instr ||
- (Instr->getParent() != CLI->getCond() &&
- Instr->getParent() != CLI->getLatch() && Instr != UpdatedIV);
+
+ CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
+ Builder.SetInsertPoint(CLI->getBody(),
+ CLI->getBody()->getFirstInsertionPt());
+ Builder.SetCurrentDebugLocation(DL);
+ return Builder.CreateAdd(OldIV, LowerBound);
});
// In the "exit" block, call the "fini" function.
@@ -1610,11 +1691,184 @@ OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
return AfterIP;
}
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
+ DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
+ bool NeedsBarrier, Value *ChunkSize) {
+ assert(CLI->isValid() && "Requires a valid canonical loop");
+ assert(ChunkSize && "Chunk size is required");
+
+ LLVMContext &Ctx = CLI->getFunction()->getContext();
+ Value *IV = CLI->getIndVar();
+ Value *OrigTripCount = CLI->getTripCount();
+ Type *IVTy = IV->getType();
+ assert(IVTy->getIntegerBitWidth() <= 64 &&
+ "Max supported tripcount bitwidth is 64 bits");
+ Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
+ : Type::getInt64Ty(Ctx);
+ Type *I32Type = Type::getInt32Ty(M.getContext());
+ Constant *Zero = ConstantInt::get(InternalIVTy, 0);
+ Constant *One = ConstantInt::get(InternalIVTy, 1);
+
+ // Declare useful OpenMP runtime functions.
+ FunctionCallee StaticInit =
+ getKmpcForStaticInitForType(InternalIVTy, M, *this);
+ FunctionCallee StaticFini =
+ getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
+
+ // Allocate space for computed loop bounds as expected by the "init" function.
+ Builder.restoreIP(AllocaIP);
+ Builder.SetCurrentDebugLocation(DL);
+ Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
+ Value *PLowerBound =
+ Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
+ Value *PUpperBound =
+ Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
+ Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
+
+ // Set up the source location value for the OpenMP runtime.
+ Builder.restoreIP(CLI->getPreheaderIP());
+ Builder.SetCurrentDebugLocation(DL);
+
+ // TODO: Detect overflow in ubsan or max-out with current tripcount.
+ Value *CastedChunkSize =
+ Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
+ Value *CastedTripCount =
+ Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
+
+ Constant *SchedulingType = ConstantInt::get(
+ I32Type, static_cast<int>(OMPScheduleType::StaticChunked));
+ Builder.CreateStore(Zero, PLowerBound);
+ Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
+ Builder.CreateStore(OrigUpperBound, PUpperBound);
+ Builder.CreateStore(One, PStride);
+
+ // Call the "init" function and update the trip count of the loop with the
+ // value it produced.
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
+ Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+ Value *ThreadNum = getOrCreateThreadID(SrcLoc);
+ Builder.CreateCall(StaticInit,
+ {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
+ /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
+ /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
+ /*pstride=*/PStride, /*incr=*/One,
+ /*chunk=*/CastedChunkSize});
+
+ // Load values written by the "init" function.
+ Value *FirstChunkStart =
+ Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
+ Value *FirstChunkStop =
+ Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
+ Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
+ Value *ChunkRange =
+ Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
+ Value *NextChunkStride =
+ Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
+
+ // Create outer "dispatch" loop for enumerating the chunks.
+ BasicBlock *DispatchEnter = splitBB(Builder, true);
+ Value *DispatchCounter;
+ CanonicalLoopInfo *DispatchCLI = createCanonicalLoop(
+ {Builder.saveIP(), DL},
+ [&](InsertPointTy BodyIP, Value *Counter) { DispatchCounter = Counter; },
+ FirstChunkStart, CastedTripCount, NextChunkStride,
+ /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
+ "dispatch");
+
+ // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
+ // not have to preserve the canonical invariant.
+ BasicBlock *DispatchBody = DispatchCLI->getBody();
+ BasicBlock *DispatchLatch = DispatchCLI->getLatch();
+ BasicBlock *DispatchExit = DispatchCLI->getExit();
+ BasicBlock *DispatchAfter = DispatchCLI->getAfter();
+ DispatchCLI->invalidate();
+
+ // Rewire the original loop to become the chunk loop inside the dispatch loop.
+ redirectTo(DispatchAfter, CLI->getAfter(), DL);
+ redirectTo(CLI->getExit(), DispatchLatch, DL);
+ redirectTo(DispatchBody, DispatchEnter, DL);
+
+ // Prepare the prolog of the chunk loop.
+ Builder.restoreIP(CLI->getPreheaderIP());
+ Builder.SetCurrentDebugLocation(DL);
+
+ // Compute the number of iterations of the chunk loop.
+ Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
+ Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
+ Value *IsLastChunk =
+ Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
+ Value *CountUntilOrigTripCount =
+ Builder.CreateSub(CastedTripCount, DispatchCounter);
+ Value *ChunkTripCount = Builder.CreateSelect(
+ IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
+ Value *BackcastedChunkTC =
+ Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
+ CLI->setTripCount(BackcastedChunkTC);
+
+ // Update all uses of the induction variable except the one in the condition
+ // block that compares it with the actual upper bound, and the increment in
+ // the latch block.
+ Value *BackcastedDispatchCounter =
+ Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
+ CLI->mapIndVar([&](Instruction *) -> Value * {
+ Builder.restoreIP(CLI->getBodyIP());
+ return Builder.CreateAdd(IV, BackcastedDispatchCounter);
+ });
+
+ // In the "exit" block, call the "fini" function.
+ Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
+ Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
+
+ // Add the barrier if requested.
+ if (NeedsBarrier)
+ createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
+ /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
+
+#ifndef NDEBUG
+ // Even though we currently do not support applying additional methods to it,
+ // the chunk loop should remain a canonical loop.
+ CLI->assertOK();
+#endif
+
+ return {DispatchAfter, DispatchAfter->getFirstInsertionPt()};
+}
+
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
- InsertPointTy AllocaIP, bool NeedsBarrier) {
- // Currently only supports static schedules.
- return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
+ InsertPointTy AllocaIP, bool NeedsBarrier,
+ llvm::omp::ScheduleKind SchedKind,
+ llvm::Value *ChunkSize) {
+ switch (SchedKind) {
+ case llvm::omp::ScheduleKind::OMP_SCHEDULE_Default:
+ assert(!ChunkSize && "No chunk size with default schedule (which for clang "
+ "is static non-chunked)");
+ LLVM_FALLTHROUGH;
+ case llvm::omp::ScheduleKind::OMP_SCHEDULE_Static:
+ if (ChunkSize)
+ return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
+ ChunkSize);
+ return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
+ case llvm::omp::ScheduleKind::OMP_SCHEDULE_Auto:
+ assert(!ChunkSize && "Chunk size with auto scheduling not user-defined");
+ return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, OMPScheduleType::Auto,
+ NeedsBarrier, nullptr);
+ case llvm::omp::ScheduleKind::OMP_SCHEDULE_Dynamic:
+ return applyDynamicWorkshareLoop(DL, CLI, AllocaIP,
+ OMPScheduleType::DynamicChunked,
+ NeedsBarrier, ChunkSize);
+ case llvm::omp::ScheduleKind::OMP_SCHEDULE_Guided:
+ return applyDynamicWorkshareLoop(DL, CLI, AllocaIP,
+ OMPScheduleType::GuidedChunked,
+ NeedsBarrier, ChunkSize);
+ case llvm::omp::ScheduleKind::OMP_SCHEDULE_Runtime:
+ assert(!ChunkSize &&
+ "Chunk size with runtime scheduling implied to be one");
+ return applyDynamicWorkshareLoop(
+ DL, CLI, AllocaIP, OMPScheduleType::Runtime, NeedsBarrier, nullptr);
+ }
+
+ llvm_unreachable("Unknown/unimplemented schedule kind");
}
/// Returns an LLVM function to call for initializing loop bounds using OpenMP
@@ -1763,27 +2017,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop(
return AfterIP;
}
-/// Make \p Source branch to \p Target.
-///
-/// Handles two situations:
-/// * \p Source already has an unconditional branch.
-/// * \p Source is a degenerate block (no terminator because the BB is
-/// the current head of the IR construction).
-static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
- if (Instruction *Term = Source->getTerminator()) {
- auto *Br = cast<BranchInst>(Term);
- assert(!Br->isConditional() &&
- "BB's terminator must be an unconditional branch (or degenerate)");
- BasicBlock *Succ = Br->getSuccessor(0);
- Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
- Br->setSuccessor(0, Target);
- return;
- }
-
- auto *NewBr = BranchInst::Create(Target, Source);
- NewBr->setDebugLoc(DL);
-}
-
/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
/// after this \p OldTarget will be orphaned.
static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
@@ -3597,6 +3830,51 @@ BasicBlock *CanonicalLoopInfo::getPreheader() const {
llvm_unreachable("Missing preheader");
}
+void CanonicalLoopInfo::setTripCount(Value *TripCount) {
+ assert(isValid() && "Requires a valid canonical loop");
+
+ Instruction *CmpI = &getCond()->front();
+ assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
+ CmpI->setOperand(1, TripCount);
+
+#ifndef NDEBUG
+ assertOK();
+#endif
+}
+
+void CanonicalLoopInfo::mapIndVar(
+ llvm::function_ref<Value *(Instruction *)> Updater) {
+ assert(isValid() && "Requires a valid canonical loop");
+
+ Instruction *OldIV = getIndVar();
+
+ // Record all uses excluding those introduced by the updater. Uses by the
+ // CanonicalLoopInfo itself to keep track of the number of iterations are
+ // excluded.
+ SmallVector<Use *> ReplacableUses;
+ for (Use &U : OldIV->uses()) {
+ auto *User = dyn_cast<Instruction>(U.getUser());
+ if (!User)
+ continue;
+ if (User->getParent() == getCond())
+ continue;
+ if (User->getParent() == getLatch())
+ continue;
+ ReplacableUses.push_back(&U);
+ }
+
+ // Run the updater that may introduce new uses
+ Value *NewIV = Updater(OldIV);
+
+ // Replace the old uses with the value returned by the updater.
+ for (Use *U : ReplacableUses)
+ U->set(NewIV);
+
+#ifndef NDEBUG
+ assertOK();
+#endif
+}
+
void CanonicalLoopInfo::assertOK() const {
#ifndef NDEBUG
// No constraints if this object currently does not describe a loop.
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 48f720bd2d727..54f797283061f 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -113,6 +113,33 @@ verifyListOrder(Function *F, ArrayRef<BasicBlock *> RefOrder) {
<< " in function " << F->getName();
}
+/// Populate Calls with call instructions calling the function with the given
+/// FnID from the given function F.
+static void findCalls(Function *F, omp::RuntimeFunction FnID,
+ OpenMPIRBuilder &OMPBuilder,
+ SmallVectorImpl<CallInst *> &Calls) {
+ Function *Fn = OMPBuilder.getOrCreateRuntimeFunctionPtr(FnID);
+ for (BasicBlock &BB : *F) {
+ for (Instruction &I : BB) {
+ auto *Call = dyn_cast<CallInst>(&I);
+ if (Call && Call->getCalledFunction() == Fn)
+ Calls.push_back(Call);
+ }
+ }
+}
+
+/// Assuming \p F contains only one call to the function with the given \p FnID,
+/// return that call.
+static CallInst *findSingleCall(Function *F, omp::RuntimeFunction FnID,
+ OpenMPIRBuilder &OMPBuilder) {
+ SmallVector<CallInst *, 1> Calls;
+ findCalls(F, FnID, OMPBuilder, Calls);
+ EXPECT_EQ(1u, Calls.size());
+ if (Calls.size() != 1)
+ return nullptr;
+ return Calls.front();
+}
+
class OpenMPIRBuilderTest : public testing::Test {
protected:
void SetUp() override {
@@ -147,7 +174,8 @@ class OpenMPIRBuilderTest : public testing::Test {
/// loop counter for use with tests that need a CanonicalLoopInfo object.
CanonicalLoopInfo *buildSingleLoopFunction(DebugLoc DL,
OpenMPIRBuilder &OMPBuilder,
- Instruction **Call = nullptr,
+ int UseIVBits,
+ CallInst **Call = nullptr,
BasicBlock **BodyCode = nullptr) {
OMPBuilder.initialize();
F->setName("func");
@@ -156,6 +184,10 @@ class OpenMPIRBuilderTest : public testing::Test {
OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
Value *TripCount = F->getArg(0);
+ Type *IVType = Type::getIntNTy(Builder.getContext(), UseIVBits);
+ Value *CastedTripCount =
+ Builder.CreateZExtOrTrunc(TripCount, IVType, "tripcount");
+
auto LoopBodyGenCB = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP,
llvm::Value *LC) {
Builder.restoreIP(CodeGenIP);
@@ -168,7 +200,7 @@ class OpenMPIRBuilderTest : public testing::Test {
*Call = CallInst;
};
CanonicalLoopInfo *Loop =
- OMPBuilder.createCanonicalLoop(Loc, LoopBodyGenCB, TripCount);
+ OMPBuilder.createCanonicalLoop(Loc, LoopBodyGenCB, CastedTripCount);
// Finalize the function.
Builder.restoreIP(Loop->getAfterIP());
@@ -188,6 +220,10 @@ class OpenMPIRBuilderTestWithParams
: public OpenMPIRBuilderTest,
public ::testing::WithParamInterface<omp::OMPScheduleType> {};
+class OpenMPIRBuilderTestWithIVBits
+ : public OpenMPIRBuilderTest,
+ public ::testing::WithParamInterface<int> {};
+
// Returns the value stored in the given allocation. Returns null if the given
// value is not a result of an InstTy instruction, if no value is stored or if
// there is more than one store.
@@ -1387,10 +1423,10 @@ TEST_F(OpenMPIRBuilderTest, CollapseNestedLoops) {
TEST_F(OpenMPIRBuilderTest, TileSingleLoop) {
OpenMPIRBuilder OMPBuilder(*M);
- Instruction *Call;
+ CallInst *Call;
BasicBlock *BodyCode;
CanonicalLoopInfo *Loop =
- buildSingleLoopFunction(DL, OMPBuilder, &Call, &BodyCode);
+ buildSingleLoopFunction(DL, OMPBuilder, 32, &Call, &BodyCode);
Instruction *OrigIndVar = Loop->getIndVar();
EXPECT_EQ(Call->getOperand(1), OrigIndVar);
@@ -1730,7 +1766,7 @@ TEST_F(OpenMPIRBuilderTest, TileSingleLoopCounts) {
TEST_F(OpenMPIRBuilderTest, ApplySimd) {
OpenMPIRBuilder OMPBuilder(*M);
- CanonicalLoopInfo *CLI = buildSingleLoopFunction(DL, OMPBuilder);
+ CanonicalLoopInfo *CLI = buildSingleLoopFunction(DL, OMPBuilder, 32);
// Simd-ize the loop.
OMPBuilder.applySimd(DL, CLI);
@@ -1761,7 +1797,7 @@ TEST_F(OpenMPIRBuilderTest, ApplySimd) {
TEST_F(OpenMPIRBuilderTest, UnrollLoopFull) {
OpenMPIRBuilder OMPBuilder(*M);
- CanonicalLoopInfo *CLI = buildSingleLoopFunction(DL, OMPBuilder);
+ CanonicalLoopInfo *CLI = buildSingleLoopFunction(DL, OMPBuilder, 32);
// Unroll the loop.
OMPBuilder.unrollLoopFull(DL, CLI);
@@ -1784,7 +1820,7 @@ TEST_F(OpenMPIRBuilderTest, UnrollLoopFull) {
TEST_F(OpenMPIRBuilderTest, UnrollLoopPartial) {
OpenMPIRBuilder OMPBuilder(*M);
- CanonicalLoopInfo *CLI = buildSingleLoopFunction(DL, OMPBuilder);
+ CanonicalLoopInfo *CLI = buildSingleLoopFunction(DL, OMPBuilder, 32);
// Unroll the loop.
CanonicalLoopInfo *UnrolledLoop = nullptr;
@@ -1818,7 +1854,7 @@ TEST_F(OpenMPIRBuilderTest, UnrollLoopPartial) {
TEST_F(OpenMPIRBuilderTest, UnrollLoopHeuristic) {
OpenMPIRBuilder OMPBuilder(*M);
- CanonicalLoopInfo *CLI = buildSingleLoopFunction(DL, OMPBuilder);
+ CanonicalLoopInfo *CLI = buildSingleLoopFunction(DL, OMPBuilder, 32);
// Unroll the loop.
OMPBuilder.unrollLoopHeuristic(DL, CLI);
@@ -1935,6 +1971,82 @@ TEST_F(OpenMPIRBuilderTest, StaticWorkShareLoop) {
EXPECT_EQ(NumCallsInExitBlock, 3u);
}
+TEST_P(OpenMPIRBuilderTestWithIVBits, StaticChunkedWorkshareLoop) {
+ int IVBits = GetParam();
+
+ using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+ OpenMPIRBuilder OMPBuilder(*M);
+
+ BasicBlock *Body;
+ CallInst *Call;
+ CanonicalLoopInfo *CLI =
+ buildSingleLoopFunction(DL, OMPBuilder, IVBits, &Call, &Body);
+
+ Instruction *OrigIndVar = CLI->getIndVar();
+ EXPECT_EQ(Call->getOperand(1), OrigIndVar);
+
+ Type *LCTy = Type::getInt32Ty(Ctx);
+ Value *ChunkSize = ConstantInt::get(LCTy, 5);
+ InsertPointTy AllocaIP{&F->getEntryBlock(),
+ F->getEntryBlock().getFirstInsertionPt()};
+ OMPBuilder.applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP,
+ /*NeedsBarrier=*/true, ChunkSize);
+
+ OMPBuilder.finalize();
+ EXPECT_FALSE(verifyModule(*M, &errs()));
+
+ BasicBlock *Entry = &F->getEntryBlock();
+ BasicBlock *Preheader = Entry->getSingleSuccessor();
+
+ BasicBlock *DispatchPreheader = Preheader->getSingleSuccessor();
+ BasicBlock *DispatchHeader = DispatchPreheader->getSingleSuccessor();
+ BasicBlock *DispatchCond = DispatchHeader->getSingleSuccessor();
+ BasicBlock *DispatchBody = succ_begin(DispatchCond)[0];
+ BasicBlock *DispatchExit = succ_begin(DispatchCond)[1];
+ BasicBlock *DispatchAfter = DispatchExit->getSingleSuccessor();
+ BasicBlock *Return = DispatchAfter->getSingleSuccessor();
+
+ BasicBlock *ChunkPreheader = DispatchBody->getSingleSuccessor();
+ BasicBlock *ChunkHeader = ChunkPreheader->getSingleSuccessor();
+ BasicBlock *ChunkCond = ChunkHeader->getSingleSuccessor();
+ BasicBlock *ChunkBody = succ_begin(ChunkCond)[0];
+ BasicBlock *ChunkExit = succ_begin(ChunkCond)[1];
+ BasicBlock *ChunkInc = ChunkBody->getSingleSuccessor();
+ BasicBlock *ChunkAfter = ChunkExit->getSingleSuccessor();
+
+ BasicBlock *DispatchInc = ChunkAfter;
+
+ EXPECT_EQ(ChunkBody, Body);
+ EXPECT_EQ(ChunkInc->getSingleSuccessor(), ChunkHeader);
+ EXPECT_EQ(DispatchInc->getSingleSuccessor(), DispatchHeader);
+
+ EXPECT_TRUE(isa<ReturnInst>(Return->front()));
+
+ Value *NewIV = Call->getOperand(1);
+ EXPECT_EQ(NewIV->getType()->getScalarSizeInBits(), IVBits);
+
+ CallInst *InitCall = findSingleCall(
+ F,
+ (IVBits > 32) ? omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u
+ : omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u,
+ OMPBuilder);
+ EXPECT_EQ(InitCall->getParent(), Preheader);
+ EXPECT_EQ(cast<ConstantInt>(InitCall->getArgOperand(2))->getSExtValue(), 33);
+ EXPECT_EQ(cast<ConstantInt>(InitCall->getArgOperand(7))->getSExtValue(), 1);
+ EXPECT_EQ(cast<ConstantInt>(InitCall->getArgOperand(8))->getSExtValue(), 5);
+
+ CallInst *FiniCall = findSingleCall(
+ F, omp::RuntimeFunction::OMPRTL___kmpc_for_static_fini, OMPBuilder);
+ EXPECT_EQ(FiniCall->getParent(), DispatchExit);
+
+ CallInst *BarrierCall = findSingleCall(
+ F, omp::RuntimeFunction::OMPRTL___kmpc_barrier, OMPBuilder);
+ EXPECT_EQ(BarrierCall->getParent(), DispatchExit);
+}
+
+INSTANTIATE_TEST_SUITE_P(IVBits, OpenMPIRBuilderTestWithIVBits,
+ ::testing::Values(8, 16, 32, 64));
+
TEST_P(OpenMPIRBuilderTestWithParams, DynamicWorkShareLoop) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
@@ -3283,21 +3395,6 @@ xorAtomicReduction(OpenMPIRBuilder::InsertPointTy IP, Type *Ty, Value *LHS,
return Builder.saveIP();
}
-/// Populate Calls with call instructions calling the function with the given
-/// FnID from the given function F.
-static void findCalls(Function *F, omp::RuntimeFunction FnID,
- OpenMPIRBuilder &OMPBuilder,
- SmallVectorImpl<CallInst *> &Calls) {
- Function *Fn = OMPBuilder.getOrCreateRuntimeFunctionPtr(FnID);
- for (BasicBlock &BB : *F) {
- for (Instruction &I : BB) {
- auto *Call = dyn_cast<CallInst>(&I);
- if (Call && Call->getCalledFunction() == Fn)
- Calls.push_back(Call);
- }
- }
-}
-
TEST_F(OpenMPIRBuilderTest, CreateReductions) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 40d8582d91287..d3b43b423c952 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -780,8 +780,9 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
bool isSimd = loop.simd_modifier();
if (schedule == omp::ClauseScheduleKind::Static) {
- ompBuilder->applyStaticWorkshareLoop(ompLoc.DL, loopInfo, allocaIP,
- !loop.nowait(), chunk);
+ ompBuilder->applyWorkshareLoop(ompLoc.DL, loopInfo, allocaIP,
+ !loop.nowait(),
+ llvm::omp::OMP_SCHEDULE_Static, chunk);
} else {
llvm::omp::OMPScheduleType schedType;
switch (schedule) {
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
index 0de8c4d4e019a..2b1dd54bc7326 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -423,6 +423,53 @@ llvm.func @wsloop_inclusive_2(%arg0: !llvm.ptr<f32>) {
// -----
+llvm.func @body(i32)
+
+// CHECK-LABEL: @test_omp_wsloop_static_defchunk
+llvm.func @test_omp_wsloop_static_defchunk(%lb : i32, %ub : i32, %step : i32) -> () {
+ omp.wsloop (%iv) : i32 = (%lb) to (%ub) step (%step) schedule(static) {
+ // CHECK: call void @__kmpc_for_static_init_4u(%struct.ident_t* @{{.*}}, i32 %{{.*}}, i32 34, i32* %{{.*}}, i32* %{{.*}}, i32* %{{.*}}, i32* %{{.*}}, i32 1, i32 0)
+ // CHECK: call void @__kmpc_for_static_fini
+ llvm.call @body(%iv) : (i32) -> ()
+ omp.yield
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @body(i32)
+
+// CHECK-LABEL: @test_omp_wsloop_static_1
+llvm.func @test_omp_wsloop_static_1(%lb : i32, %ub : i32, %step : i32) -> () {
+ %static_chunk_size = llvm.mlir.constant(1 : i32) : i32
+ omp.wsloop (%iv) : i32 = (%lb) to (%ub) step (%step) schedule(static = %static_chunk_size : i32) {
+ // CHECK: call void @__kmpc_for_static_init_4u(%struct.ident_t* @{{.*}}, i32 %{{.*}}, i32 33, i32* %{{.*}}, i32* %{{.*}}, i32* %{{.*}}, i32* %{{.*}}, i32 1, i32 1)
+ // CHECK: call void @__kmpc_for_static_fini
+ llvm.call @body(%iv) : (i32) -> ()
+ omp.yield
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @body(i32)
+
+// CHECK-LABEL: @test_omp_wsloop_static_2
+llvm.func @test_omp_wsloop_static_2(%lb : i32, %ub : i32, %step : i32) -> () {
+ %static_chunk_size = llvm.mlir.constant(2 : i32) : i32
+ omp.wsloop (%iv) : i32 = (%lb) to (%ub) step (%step) schedule(static = %static_chunk_size : i32) {
+ // CHECK: call void @__kmpc_for_static_init_4u(%struct.ident_t* @{{.*}}, i32 %{{.*}}, i32 33, i32* %{{.*}}, i32* %{{.*}}, i32* %{{.*}}, i32* %{{.*}}, i32 1, i32 2)
+ // CHECK: call void @__kmpc_for_static_fini
+ llvm.call @body(%iv) : (i32) -> ()
+ omp.yield
+ }
+ llvm.return
+}
+
+// -----
+
llvm.func @body(i64)
llvm.func @test_omp_wsloop_dynamic(%lb : i64, %ub : i64, %step : i64) -> () {
More information about the Mlir-commits
mailing list