[clang] [flang] [llvm] [openmp] [Clang][OpenMP][LoopTransformations] Add support for "#pragma omp fuse" loop transformation directive and "looprange" clause (PR #139293)
Roger Ferrer Ibáñez via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 8 02:00:07 PDT 2025
https://github.com/rofirrim updated https://github.com/llvm/llvm-project/pull/139293
>From 365c3c5d18b0c466fee85075d6f6bc3c63267fef Mon Sep 17 00:00:00 2001
From: Roger Ferrer Ibanez <roger.ferrer at bsc.es>
Date: Wed, 27 Aug 2025 08:18:02 +0000
Subject: [PATCH 1/2] [Clang][OpenMP] Add an additional class to hold data that
will be shared between all loop transformations
This class is not a statement by itself and its goal is to avoid having
to replicate information between classes.
Also simplify the way we handle the "generated loops" information as we
currently only need to know if it is zero or non-zero.
---
clang/include/clang/AST/StmtOpenMP.h | 55 +++++++++++++----------
clang/lib/AST/StmtOpenMP.cpp | 21 ++++-----
clang/lib/Sema/SemaOpenMP.cpp | 12 ++---
clang/lib/Serialization/ASTReaderStmt.cpp | 2 +-
clang/lib/Serialization/ASTWriterStmt.cpp | 2 +-
5 files changed, 52 insertions(+), 40 deletions(-)
diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h
index a436676113921..d9f87f1e49b40 100644
--- a/clang/include/clang/AST/StmtOpenMP.h
+++ b/clang/include/clang/AST/StmtOpenMP.h
@@ -956,30 +956,46 @@ class OMPLoopBasedDirective : public OMPExecutableDirective {
}
};
+/// Common class of data shared between
+/// OMPCanonicalLoopNestTransformationDirective and transformations over
+/// canonical loop sequences.
+class OMPLoopTransformationDirective {
+ /// Number of (top-level) generated loops.
+ /// This value is 1 for most transformations as they only map one loop nest
+ /// into another.
+ /// Some loop transformations (like a non-partial 'unroll') may not generate
+ /// a loop nest, so this would be 0.
+ /// Some loop transformations (like 'fuse' with looprange and 'split') may
+ /// generate more than one loop nest, so the value would be >= 1.
+ unsigned NumGeneratedTopLevelLoops = 1;
+
+protected:
+ void setNumGeneratedTopLevelLoops(unsigned N) {
+ NumGeneratedTopLevelLoops = N;
+ }
+
+public:
+ unsigned getNumGeneratedTopLevelLoops() const {
+ return NumGeneratedTopLevelLoops;
+ }
+};
+
/// The base class for all transformation directives of canonical loop nests.
class OMPCanonicalLoopNestTransformationDirective
- : public OMPLoopBasedDirective {
+ : public OMPLoopBasedDirective,
+ public OMPLoopTransformationDirective {
friend class ASTStmtReader;
- /// Number of loops generated by this loop transformation.
- unsigned NumGeneratedLoops = 0;
-
protected:
explicit OMPCanonicalLoopNestTransformationDirective(
StmtClass SC, OpenMPDirectiveKind Kind, SourceLocation StartLoc,
SourceLocation EndLoc, unsigned NumAssociatedLoops)
: OMPLoopBasedDirective(SC, Kind, StartLoc, EndLoc, NumAssociatedLoops) {}
- /// Set the number of loops generated by this loop transformation.
- void setNumGeneratedLoops(unsigned Num) { NumGeneratedLoops = Num; }
-
public:
/// Return the number of associated (consumed) loops.
unsigned getNumAssociatedLoops() const { return getLoopsNumber(); }
- /// Return the number of loops generated by this loop transformation.
- unsigned getNumGeneratedLoops() const { return NumGeneratedLoops; }
-
/// Get the de-sugared statements after the loop transformation.
///
/// Might be nullptr if either the directive generates no loops and is handled
@@ -5560,9 +5576,7 @@ class OMPTileDirective final
unsigned NumLoops)
: OMPCanonicalLoopNestTransformationDirective(
OMPTileDirectiveClass, llvm::omp::OMPD_tile, StartLoc, EndLoc,
- NumLoops) {
- setNumGeneratedLoops(2 * NumLoops);
- }
+ NumLoops) {}
void setPreInits(Stmt *PreInits) {
Data->getChildren()[PreInitsOffset] = PreInits;
@@ -5638,9 +5652,7 @@ class OMPStripeDirective final
unsigned NumLoops)
: OMPCanonicalLoopNestTransformationDirective(
OMPStripeDirectiveClass, llvm::omp::OMPD_stripe, StartLoc, EndLoc,
- NumLoops) {
- setNumGeneratedLoops(2 * NumLoops);
- }
+ NumLoops) {}
void setPreInits(Stmt *PreInits) {
Data->getChildren()[PreInitsOffset] = PreInits;
@@ -5744,7 +5756,8 @@ class OMPUnrollDirective final
static OMPUnrollDirective *
Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt,
- unsigned NumGeneratedLoops, Stmt *TransformedStmt, Stmt *PreInits);
+ unsigned NumGeneratedTopLevelLoops, Stmt *TransformedStmt,
+ Stmt *PreInits);
/// Build an empty '#pragma omp unroll' AST node for deserialization.
///
@@ -5794,9 +5807,7 @@ class OMPReverseDirective final
unsigned NumLoops)
: OMPCanonicalLoopNestTransformationDirective(
OMPReverseDirectiveClass, llvm::omp::OMPD_reverse, StartLoc, EndLoc,
- NumLoops) {
- setNumGeneratedLoops(NumLoops);
- }
+ NumLoops) {}
void setPreInits(Stmt *PreInits) {
Data->getChildren()[PreInitsOffset] = PreInits;
@@ -5867,9 +5878,7 @@ class OMPInterchangeDirective final
SourceLocation EndLoc, unsigned NumLoops)
: OMPCanonicalLoopNestTransformationDirective(
OMPInterchangeDirectiveClass, llvm::omp::OMPD_interchange, StartLoc,
- EndLoc, NumLoops) {
- setNumGeneratedLoops(NumLoops);
- }
+ EndLoc, NumLoops) {}
void setPreInits(Stmt *PreInits) {
Data->getChildren()[PreInitsOffset] = PreInits;
diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp
index 36ecaf6489ef0..1f6586f95a9f8 100644
--- a/clang/lib/AST/StmtOpenMP.cpp
+++ b/clang/lib/AST/StmtOpenMP.cpp
@@ -139,13 +139,14 @@ bool OMPLoopBasedDirective::doForAllLoops(
Stmt *TransformedStmt = Dir->getTransformedStmt();
if (!TransformedStmt) {
- unsigned NumGeneratedLoops = Dir->getNumGeneratedLoops();
- if (NumGeneratedLoops == 0) {
+ unsigned NumGeneratedTopLevelLoops =
+ Dir->getNumGeneratedTopLevelLoops();
+ if (NumGeneratedTopLevelLoops == 0) {
// May happen if the loop transformation does not result in a
// generated loop (such as full unrolling).
break;
}
- if (NumGeneratedLoops > 0) {
+ if (NumGeneratedTopLevelLoops > 0) {
// The loop transformation construct has generated loops, but these
// may not have been generated yet due to being in a dependent
// context.
@@ -447,16 +448,16 @@ OMPStripeDirective *OMPStripeDirective::CreateEmpty(const ASTContext &C,
SourceLocation(), SourceLocation(), NumLoops);
}
-OMPUnrollDirective *
-OMPUnrollDirective::Create(const ASTContext &C, SourceLocation StartLoc,
- SourceLocation EndLoc, ArrayRef<OMPClause *> Clauses,
- Stmt *AssociatedStmt, unsigned NumGeneratedLoops,
- Stmt *TransformedStmt, Stmt *PreInits) {
- assert(NumGeneratedLoops <= 1 && "Unrolling generates at most one loop");
+OMPUnrollDirective *OMPUnrollDirective::Create(
+ const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+ ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt,
+ unsigned NumGeneratedTopLevelLoops, Stmt *TransformedStmt, Stmt *PreInits) {
+ assert(NumGeneratedTopLevelLoops <= 1 &&
+ "Unrolling generates at most one loop");
auto *Dir = createDirective<OMPUnrollDirective>(
C, Clauses, AssociatedStmt, TransformedStmtOffset + 1, StartLoc, EndLoc);
- Dir->setNumGeneratedLoops(NumGeneratedLoops);
+ Dir->setNumGeneratedTopLevelLoops(NumGeneratedTopLevelLoops);
Dir->setTransformedStmt(TransformedStmt);
Dir->setPreInits(PreInits);
return Dir;
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 63a56a6583efc..60f0317020c59 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -14919,12 +14919,13 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
Body, OriginalInits))
return StmtError();
- unsigned NumGeneratedLoops = PartialClause ? 1 : 0;
+ unsigned NumGeneratedTopLevelLoops = PartialClause ? 1 : 0;
// Delay unrolling to when template is completely instantiated.
if (SemaRef.CurContext->isDependentContext())
return OMPUnrollDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
- NumGeneratedLoops, nullptr, nullptr);
+ NumGeneratedTopLevelLoops, nullptr,
+ nullptr);
assert(LoopHelpers.size() == NumLoops &&
"Expecting a single-dimensional loop iteration space");
@@ -14947,9 +14948,10 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
// The generated loop may only be passed to other loop-associated directive
// when a partial clause is specified. Without the requirement it is
// sufficient to generate loop unroll metadata at code-generation.
- if (NumGeneratedLoops == 0)
+ if (NumGeneratedTopLevelLoops == 0)
return OMPUnrollDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
- NumGeneratedLoops, nullptr, nullptr);
+ NumGeneratedTopLevelLoops, nullptr,
+ nullptr);
// Otherwise, we need to provide a de-sugared/transformed AST that can be
// associated with another loop directive.
@@ -15164,7 +15166,7 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
LoopHelper.Init->getBeginLoc(), LoopHelper.Inc->getEndLoc());
return OMPUnrollDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
- NumGeneratedLoops, OuterFor,
+ NumGeneratedTopLevelLoops, OuterFor,
buildPreInits(Context, PreInits));
}
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 7ec8e450fbaca..213c2c2148f64 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2450,7 +2450,7 @@ void ASTStmtReader::VisitOMPSimdDirective(OMPSimdDirective *D) {
void ASTStmtReader::VisitOMPCanonicalLoopNestTransformationDirective(
OMPCanonicalLoopNestTransformationDirective *D) {
VisitOMPLoopBasedDirective(D);
- D->setNumGeneratedLoops(Record.readUInt32());
+ D->setNumGeneratedTopLevelLoops(Record.readUInt32());
}
void ASTStmtReader::VisitOMPTileDirective(OMPTileDirective *D) {
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index 07a5cde47a9a8..21c04ddbc2c7a 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2459,7 +2459,7 @@ void ASTStmtWriter::VisitOMPSimdDirective(OMPSimdDirective *D) {
void ASTStmtWriter::VisitOMPCanonicalLoopNestTransformationDirective(
OMPCanonicalLoopNestTransformationDirective *D) {
VisitOMPLoopBasedDirective(D);
- Record.writeUInt32(D->getNumGeneratedLoops());
+ Record.writeUInt32(D->getNumGeneratedTopLevelLoops());
}
void ASTStmtWriter::VisitOMPTileDirective(OMPTileDirective *D) {
>From 6ba627542af0d0d95074380a16885c7afe0930d7 Mon Sep 17 00:00:00 2001
From: eZWALT <waltertheshadow333 at gmail.com>
Date: Mon, 8 Sep 2025 08:55:24 +0000
Subject: [PATCH 2/2] [Clang][OpenMP] Implement #pragma omp fuse
---
clang/docs/OpenMPSupport.rst | 2 +
clang/docs/ReleaseNotes.rst | 1 +
clang/include/clang-c/Index.h | 4 +
clang/include/clang/AST/OpenMPClause.h | 74 +
clang/include/clang/AST/RecursiveASTVisitor.h | 11 +
clang/include/clang/AST/StmtOpenMP.h | 208 +-
.../clang/Basic/DiagnosticSemaKinds.td | 12 +
clang/include/clang/Basic/OpenMPKinds.h | 7 +
clang/include/clang/Basic/StmtNodes.td | 4 +
clang/include/clang/Parse/Parser.h | 3 +
clang/include/clang/Sema/SemaOpenMP.h | 91 +-
.../include/clang/Serialization/ASTBitCodes.h | 1 +
clang/lib/AST/OpenMPClause.cpp | 35 +
clang/lib/AST/StmtOpenMP.cpp | 81 +-
clang/lib/AST/StmtPrinter.cpp | 5 +
clang/lib/AST/StmtProfile.cpp | 16 +
clang/lib/Basic/OpenMPKinds.cpp | 11 +-
clang/lib/CodeGen/CGStmt.cpp | 3 +
clang/lib/CodeGen/CGStmtOpenMP.cpp | 42 +-
clang/lib/CodeGen/CodeGenFunction.h | 1 +
clang/lib/Parse/ParseOpenMP.cpp | 36 +
clang/lib/Sema/SemaExceptionSpec.cpp | 1 +
clang/lib/Sema/SemaOpenMP.cpp | 848 +++++-
clang/lib/Sema/TreeTransform.h | 44 +
clang/lib/Serialization/ASTReader.cpp | 11 +
clang/lib/Serialization/ASTReaderStmt.cpp | 20 +
clang/lib/Serialization/ASTWriter.cpp | 8 +
clang/lib/Serialization/ASTWriterStmt.cpp | 13 +
clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 1 +
clang/test/OpenMP/fuse_ast_print.cpp | 397 +++
clang/test/OpenMP/fuse_codegen.cpp | 2328 +++++++++++++++++
clang/test/OpenMP/fuse_messages.cpp | 209 ++
clang/tools/libclang/CIndex.cpp | 19 +
clang/tools/libclang/CXCursor.cpp | 3 +
flang/include/flang/Lower/OpenMP/Clauses.h | 1 +
flang/include/flang/Parser/dump-parse-tree.h | 1 +
flang/include/flang/Parser/parse-tree.h | 9 +
flang/lib/Lower/OpenMP/Clauses.cpp | 5 +
flang/lib/Parser/openmp-parsers.cpp | 5 +
flang/lib/Parser/unparse.cpp | 7 +
flang/lib/Semantics/check-omp-structure.cpp | 6 +
llvm/include/llvm/Frontend/OpenMP/ClauseT.h | 13 +-
llvm/include/llvm/Frontend/OpenMP/OMP.td | 9 +
.../runtime/test/transform/fuse/foreach.cpp | 191 ++
openmp/runtime/test/transform/fuse/intfor.c | 50 +
.../runtime/test/transform/fuse/iterfor.cpp | 194 ++
.../fuse/parallel-wsloop-collapse-foreach.cpp | 207 ++
.../fuse/parallel-wsloop-collapse-intfor.c | 45 +
48 files changed, 5239 insertions(+), 54 deletions(-)
create mode 100644 clang/test/OpenMP/fuse_ast_print.cpp
create mode 100644 clang/test/OpenMP/fuse_codegen.cpp
create mode 100644 clang/test/OpenMP/fuse_messages.cpp
create mode 100644 openmp/runtime/test/transform/fuse/foreach.cpp
create mode 100644 openmp/runtime/test/transform/fuse/intfor.c
create mode 100644 openmp/runtime/test/transform/fuse/iterfor.cpp
create mode 100644 openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp
create mode 100644 openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c
diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
index 46c365865f8dd..961f05b48ecd1 100644
--- a/clang/docs/OpenMPSupport.rst
+++ b/clang/docs/OpenMPSupport.rst
@@ -376,6 +376,8 @@ implementation.
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
| loop stripe transformation | :good:`done` | https://github.com/llvm/llvm-project/pull/119891 |
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| loop fuse transformation | :good:`done` | :none:`unclaimed` | |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
| workdistribute construct | | :none:`in progress` | @skc7, @mjklemm |
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
| task_iteration | :none:`unclaimed` | :none:`unclaimed` | |
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index d1ef91b7e7c14..09f4c94725d71 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -500,6 +500,7 @@ OpenMP Support
- Allow array length to be omitted in array section subscript expression.
- Fixed non-contiguous strided update in the ``omp target update`` directive with the ``from`` clause.
- Properly handle array section/assumed-size array privatization in C/C++.
+- Added support for 'omp fuse' directive.
Improvements
^^^^^^^^^^^^
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index be038d9165fc6..f13d9c9307b40 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -2162,6 +2162,10 @@ enum CXCursorKind {
*/
CXCursor_OMPStripeDirective = 310,
+ /** OpenMP fuse directive
+ */
+ CXCursor_OMPFuseDirective = 311,
+
/** OpenACC Compute Construct.
*/
CXCursor_OpenACCComputeConstruct = 320,
diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index 72effbc3e02fc..f1fba251ffdd3 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -1149,6 +1149,80 @@ class OMPFullClause final : public OMPNoChildClause<llvm::omp::OMPC_full> {
static OMPFullClause *CreateEmpty(const ASTContext &C);
};
+/// This class represents the 'looprange' clause in the
+/// '#pragma omp fuse' directive
+///
+/// \code {c}
+/// #pragma omp fuse looprange(1,2)
+/// {
+/// for(int i = 0; i < 64; ++i)
+/// for(int j = 0; j < 256; j+=2)
+/// for(int k = 127; k >= 0; --k)
+/// \endcode
+class OMPLoopRangeClause final : public OMPClause {
+ friend class OMPClauseReader;
+ /// Location of '('
+ SourceLocation LParenLoc;
+
+ /// Location of first and count expressions
+ SourceLocation FirstLoc, CountLoc;
+
+ /// Number of looprange arguments (always 2: first, count)
+ static constexpr unsigned NumArgs = 2;
+ Stmt *Args[NumArgs] = {nullptr, nullptr};
+
+ /// Set looprange 'first' expression
+ void setFirst(Expr *E) { Args[0] = E; }
+
+ /// Set looprange 'count' expression
+ void setCount(Expr *E) { Args[1] = E; }
+
+ /// Build an empty clause for deserialization.
+ explicit OMPLoopRangeClause()
+ : OMPClause(llvm::omp::OMPC_looprange, {}, {}) {}
+
+public:
+ /// Build a 'looprange' clause AST node.
+ static OMPLoopRangeClause *
+ Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
+ SourceLocation FirstLoc, SourceLocation CountLoc,
+ SourceLocation EndLoc, Expr *First, Expr *Count);
+
+ /// Build an empty 'looprange' clause node.
+ static OMPLoopRangeClause *CreateEmpty(const ASTContext &C);
+
+ // Location getters/setters
+ SourceLocation getLParenLoc() const { return LParenLoc; }
+ SourceLocation getFirstLoc() const { return FirstLoc; }
+ SourceLocation getCountLoc() const { return CountLoc; }
+
+ void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; }
+ void setFirstLoc(SourceLocation Loc) { FirstLoc = Loc; }
+ void setCountLoc(SourceLocation Loc) { CountLoc = Loc; }
+
+ /// Get looprange 'first' expression
+ Expr *getFirst() const { return cast_or_null<Expr>(Args[0]); }
+
+ /// Get looprange 'count' expression
+ Expr *getCount() const { return cast_or_null<Expr>(Args[1]); }
+
+ child_range children() { return child_range(Args, Args + NumArgs); }
+ const_child_range children() const {
+ return const_child_range(Args, Args + NumArgs);
+ }
+
+ child_range used_children() {
+ return child_range(child_iterator(), child_iterator());
+ }
+ const_child_range used_children() const {
+ return const_child_range(const_child_iterator(), const_child_iterator());
+ }
+
+ static bool classof(const OMPClause *T) {
+ return T->getClauseKind() == llvm::omp::OMPC_looprange;
+ }
+};
+
/// Representation of the 'partial' clause of the '#pragma omp unroll'
/// directive.
///
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index 02581c8e73299..89de8f767f579 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -3186,6 +3186,9 @@ DEF_TRAVERSE_STMT(OMPUnrollDirective,
DEF_TRAVERSE_STMT(OMPReverseDirective,
{ TRY_TO(TraverseOMPExecutableDirective(S)); })
+DEF_TRAVERSE_STMT(OMPFuseDirective,
+ { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
DEF_TRAVERSE_STMT(OMPInterchangeDirective,
{ TRY_TO(TraverseOMPExecutableDirective(S)); })
@@ -3503,6 +3506,14 @@ bool RecursiveASTVisitor<Derived>::VisitOMPFullClause(OMPFullClause *C) {
return true;
}
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPLoopRangeClause(
+ OMPLoopRangeClause *C) {
+ TRY_TO(TraverseStmt(C->getFirst()));
+ TRY_TO(TraverseStmt(C->getCount()));
+ return true;
+}
+
template <typename Derived>
bool RecursiveASTVisitor<Derived>::VisitOMPPartialClause(OMPPartialClause *C) {
TRY_TO(TraverseStmt(C->getFactor()));
diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h
index d9f87f1e49b40..f7e73c9cc8753 100644
--- a/clang/include/clang/AST/StmtOpenMP.h
+++ b/clang/include/clang/AST/StmtOpenMP.h
@@ -21,6 +21,7 @@
#include "clang/AST/StmtCXX.h"
#include "clang/Basic/OpenMPKinds.h"
#include "clang/Basic/SourceLocation.h"
+#include "llvm/Support/Casting.h"
namespace clang {
@@ -677,6 +678,10 @@ class OMPParallelDirective : public OMPExecutableDirective {
}
};
+// Forward declaration of a generic loop transformation. Used in the declaration
+// of OMPLoopBasedDirective.
+class OMPLoopTransformationDirective;
+
/// The base class for all loop-based directives, including loop transformation
/// directives.
class OMPLoopBasedDirective : public OMPExecutableDirective {
@@ -889,24 +894,23 @@ class OMPLoopBasedDirective : public OMPExecutableDirective {
/// Calls the specified callback function for all the loops in \p CurStmt,
/// from the outermost to the innermost.
- static bool doForAllLoops(
- Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops,
- llvm::function_ref<bool(unsigned, Stmt *)> Callback,
- llvm::function_ref<void(OMPCanonicalLoopNestTransformationDirective *)>
- OnTransformationCallback);
+ static bool
+ doForAllLoops(Stmt *CurStmt, bool TryImperfectlyNestedLoops,
+ unsigned NumLoops,
+ llvm::function_ref<bool(unsigned, Stmt *)> Callback,
+ llvm::function_ref<void(OMPLoopTransformationDirective *)>
+ OnTransformationCallback);
static bool
doForAllLoops(const Stmt *CurStmt, bool TryImperfectlyNestedLoops,
unsigned NumLoops,
llvm::function_ref<bool(unsigned, const Stmt *)> Callback,
- llvm::function_ref<
- void(const OMPCanonicalLoopNestTransformationDirective *)>
+ llvm::function_ref<void(const OMPLoopTransformationDirective *)>
OnTransformationCallback) {
auto &&NewCallback = [Callback](unsigned Cnt, Stmt *CurStmt) {
return Callback(Cnt, CurStmt);
};
auto &&NewTransformCb =
- [OnTransformationCallback](
- OMPCanonicalLoopNestTransformationDirective *A) {
+ [OnTransformationCallback](OMPLoopTransformationDirective *A) {
OnTransformationCallback(A);
};
return doForAllLoops(const_cast<Stmt *>(CurStmt), TryImperfectlyNestedLoops,
@@ -919,7 +923,7 @@ class OMPLoopBasedDirective : public OMPExecutableDirective {
doForAllLoops(Stmt *CurStmt, bool TryImperfectlyNestedLoops,
unsigned NumLoops,
llvm::function_ref<bool(unsigned, Stmt *)> Callback) {
- auto &&TransformCb = [](OMPCanonicalLoopNestTransformationDirective *) {};
+ auto &&TransformCb = [](OMPLoopTransformationDirective *) {};
return doForAllLoops(CurStmt, TryImperfectlyNestedLoops, NumLoops, Callback,
TransformCb);
}
@@ -957,9 +961,11 @@ class OMPLoopBasedDirective : public OMPExecutableDirective {
};
/// Common class of data shared between
-/// OMPCanonicalLoopNestTransformationDirective and transformations over
-/// canonical loop sequences.
+/// OMPCanonicalLoopNestTransformationDirective and
+/// OMPCanonicalLoopSequenceTransformationDirective
class OMPLoopTransformationDirective {
+ friend class ASTStmtReader;
+
/// Number of (top-level) generated loops.
/// This value is 1 for most transformations as they only map one loop nest
/// into another.
@@ -969,15 +975,39 @@ class OMPLoopTransformationDirective {
/// generate more than one loop nest, so the value would be >= 1.
unsigned NumGeneratedTopLevelLoops = 1;
+ /// We need this because we cannot easily make OMPLoopTransformationDirective
+ /// a proper Stmt.
+ Stmt *S = nullptr;
+
protected:
void setNumGeneratedTopLevelLoops(unsigned N) {
NumGeneratedTopLevelLoops = N;
}
+ explicit OMPLoopTransformationDirective(Stmt *S) : S(S) {}
+
public:
unsigned getNumGeneratedTopLevelLoops() const {
return NumGeneratedTopLevelLoops;
}
+
+ /// Returns the specific directive related to this loop transformation.
+ Stmt *getDirective() const { return S; }
+
+ /// Get the de-sugared statements after the loop transformation.
+ ///
+ /// Might be nullptr if either the directive generates no loops and is handled
+ /// directly in CodeGen, or resolving a template-dependence context is
+ /// required.
+ Stmt *getTransformedStmt() const;
+
+ /// Return preinits statement.
+ Stmt *getPreInits() const;
+
+ static bool classof(const Stmt *T) {
+ return isa<OMPCanonicalLoopNestTransformationDirective,
+ OMPCanonicalLoopSequenceTransformationDirective>(T);
+ }
};
/// The base class for all transformation directives of canonical loop nests.
@@ -990,7 +1020,8 @@ class OMPCanonicalLoopNestTransformationDirective
explicit OMPCanonicalLoopNestTransformationDirective(
StmtClass SC, OpenMPDirectiveKind Kind, SourceLocation StartLoc,
SourceLocation EndLoc, unsigned NumAssociatedLoops)
- : OMPLoopBasedDirective(SC, Kind, StartLoc, EndLoc, NumAssociatedLoops) {}
+ : OMPLoopBasedDirective(SC, Kind, StartLoc, EndLoc, NumAssociatedLoops),
+ OMPLoopTransformationDirective(this) {}
public:
/// Return the number of associated (consumed) loops.
@@ -5928,6 +5959,124 @@ class OMPInterchangeDirective final
}
};
+/// The base class for all transformation directives of canonical loop
+/// sequences (currently only 'fuse')
+class OMPCanonicalLoopSequenceTransformationDirective
+ : public OMPExecutableDirective,
+ public OMPLoopTransformationDirective {
+ friend class ASTStmtReader;
+
+ /// Number of loops contained in the canonical loop sequence.
+ unsigned NumLoopsInSequence = 1;
+
+protected:
+ explicit OMPCanonicalLoopSequenceTransformationDirective(
+ StmtClass SC, OpenMPDirectiveKind Kind, SourceLocation StartLoc,
+ SourceLocation EndLoc, unsigned NumLoopsInSequence)
+ : OMPExecutableDirective(SC, Kind, StartLoc, EndLoc),
+ OMPLoopTransformationDirective(this),
+ NumLoopsInSequence(NumLoopsInSequence) {}
+
+public:
+ /// Returns the number of canonical loop nests contained in this sequence.
+ unsigned getNumLoopsInSequence() const { return NumLoopsInSequence; }
+
+ /// Get the de-sugared statements after the loop transformation.
+ ///
+ /// Might be nullptr if either the directive generates no loops and is handled
+ /// directly in CodeGen, or resolving a template-dependence context is
+ /// required.
+ Stmt *getTransformedStmt() const;
+
+ /// Return preinits statement.
+ Stmt *getPreInits() const;
+
+ static bool classof(const Stmt *T) {
+ Stmt::StmtClass C = T->getStmtClass();
+ return C == OMPFuseDirectiveClass;
+ }
+};
+
+/// Represents the '#pragma omp fuse' loop transformation directive
+///
+/// \code{c}
+/// #pragma omp fuse
+/// {
+/// for(int i = 0; i < m1; ++i) {...}
+/// for(int j = 0; j < m2; ++j) {...}
+/// ...
+/// }
+/// \endcode
+class OMPFuseDirective final
+ : public OMPCanonicalLoopSequenceTransformationDirective {
+ friend class ASTStmtReader;
+ friend class OMPExecutableDirective;
+
+ // Offsets of child members.
+ enum {
+ PreInitsOffset = 0,
+ TransformedStmtOffset,
+ };
+
+ unsigned NumGeneratedLoopNests = 1;
+
+ explicit OMPFuseDirective(SourceLocation StartLoc, SourceLocation EndLoc,
+ unsigned NumLoopsInSequence)
+ : OMPCanonicalLoopSequenceTransformationDirective(
+ OMPFuseDirectiveClass, llvm::omp::OMPD_fuse, StartLoc, EndLoc,
+ NumLoopsInSequence) {}
+
+ void setPreInits(Stmt *PreInits) {
+ Data->getChildren()[PreInitsOffset] = PreInits;
+ }
+
+ void setTransformedStmt(Stmt *S) {
+ Data->getChildren()[TransformedStmtOffset] = S;
+ }
+
+public:
+ /// Create a new AST node representation for #pragma omp fuse'
+ ///
+ /// \param C Context of the AST
+ /// \param StartLoc Location of the introducer (e.g the 'omp' token)
+ /// \param EndLoc Location of the directive's end (e.g the tok::eod)
+ /// \param Clauses The directive's clauses
+ /// \param NumLoops Total number of loops in the canonical loop sequence.
+ /// \param NumGeneratedTopLevelLoops Number of top-level generated loops.
+ // Typically 1 but looprange clause can
+ // change this.
+ /// \param AssociatedStmt The outermost associated loop
+ /// \param TransformedStmt The loop nest after fusion, or nullptr in
+ /// dependent
+ /// \param PreInits Helper preinits statements for the loop nest
+ static OMPFuseDirective *
+ Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+ ArrayRef<OMPClause *> Clauses, unsigned NumLoops,
+ unsigned NumGeneratedTopLevelLoops, Stmt *AssociatedStmt,
+ Stmt *TransformedStmt, Stmt *PreInits);
+
+ /// Build an empty '#pragma omp fuse' AST node for deserialization
+ ///
+ /// \param C Context of the AST
+ /// \param NumClauses Number of clauses to allocate
+ /// \param NumLoops Number of top level loops to allocate
+ static OMPFuseDirective *CreateEmpty(const ASTContext &C, unsigned NumClauses,
+ unsigned NumLoops);
+
+ /// Gets the associated loops after the transformation. This is the de-sugared
+ /// replacement or nulltpr in dependent contexts.
+ Stmt *getTransformedStmt() const {
+ return Data->getChildren()[TransformedStmtOffset];
+ }
+
+ /// Return preinits statement.
+ Stmt *getPreInits() const { return Data->getChildren()[PreInitsOffset]; }
+
+ static bool classof(const Stmt *T) {
+ return T->getStmtClass() == OMPFuseDirectiveClass;
+ }
+};
+
/// This represents '#pragma omp scan' directive.
///
/// \code
@@ -6596,4 +6745,37 @@ class OMPAssumeDirective final : public OMPExecutableDirective {
} // end namespace clang
+namespace llvm {
+// Allow a Stmt* be casted correctly to an OMPLoopTransformationDirective*.
+// The default routines would just use a C-style cast which won't work well
+// for the multiple inheritance here. We have to use a static cast from the
+// corresponding subclass.
+template <>
+struct CastInfo<clang::OMPLoopTransformationDirective, clang::Stmt *>
+ : public NullableValueCastFailed<clang::OMPLoopTransformationDirective *>,
+ public DefaultDoCastIfPossible<
+ clang::OMPLoopTransformationDirective *, clang::Stmt *,
+ CastInfo<clang::OMPLoopTransformationDirective, clang::Stmt *>> {
+ static bool isPossible(const clang::Stmt *T) {
+ return clang::OMPLoopTransformationDirective::classof(T);
+ }
+
+ static clang::OMPLoopTransformationDirective *doCast(clang::Stmt *T) {
+ if (auto *D =
+ dyn_cast<clang::OMPCanonicalLoopNestTransformationDirective>(T))
+ return static_cast<clang::OMPLoopTransformationDirective *>(D);
+ if (auto *D =
+ dyn_cast<clang::OMPCanonicalLoopSequenceTransformationDirective>(T))
+ return static_cast<clang::OMPLoopTransformationDirective *>(D);
+ llvm_unreachable("unexpected type");
+ }
+};
+template <>
+struct CastInfo<clang::OMPLoopTransformationDirective, const clang::Stmt *>
+ : public ConstStrippingForwardingCast<
+ clang::OMPLoopTransformationDirective, const clang::Stmt *,
+ CastInfo<clang::OMPLoopTransformationDirective, clang::Stmt *>> {};
+
+} // namespace llvm
+
#endif
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 711efbe727892..1376aea03ed4e 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -11735,6 +11735,18 @@ def note_omp_implicit_dsa : Note<
"implicitly determined as %0">;
def err_omp_loop_var_dsa : Error<
"loop iteration variable in the associated loop of 'omp %1' directive may not be %0, predetermined as %2">;
+def err_omp_not_a_loop_sequence
+ : Error<"statement after '#pragma omp %0' must be a loop sequence "
+ "containing canonical loops or loop-generating constructs">;
+def err_omp_empty_loop_sequence
+ : Error<"loop sequence after '#pragma omp %0' must contain at least 1 "
+ "canonical loop or loop-generating construct">;
+def err_omp_invalid_looprange
+ : Error<"looprange clause selects loops from %1 to %2 but this exceeds the "
+ "number of loops (%3) in the loop sequence">;
+def warn_omp_redundant_fusion : Warning<"looprange clause selects a single "
+ "loop, resulting in redundant fusion">,
+ InGroup<OpenMPClauses>;
def err_omp_not_for : Error<
"%select{statement after '#pragma omp %1' must be a for loop|"
"expected %2 for loops after '#pragma omp %1'%select{|, but found only %4}3}0">;
diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h
index d3285cd9c6a14..59b9c18052636 100644
--- a/clang/include/clang/Basic/OpenMPKinds.h
+++ b/clang/include/clang/Basic/OpenMPKinds.h
@@ -372,6 +372,13 @@ bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind);
bool isOpenMPCanonicalLoopNestTransformationDirective(
OpenMPDirectiveKind DKind);
+/// Checks if the specified directive is a loop transformation directive that
+/// applies to a canonical loop sequence.
+/// \param DKind Specified directive.
+/// \return True iff the directive is a loop transformation.
+bool isOpenMPCanonicalLoopSequenceTransformationDirective(
+ OpenMPDirectiveKind DKind);
+
/// Checks if the specified directive is a loop transformation directive.
/// \param DKind Specified directive.
/// \return True iff the directive is a loop transformation.
diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td
index dd1a24405fae7..bf3686bb372d5 100644
--- a/clang/include/clang/Basic/StmtNodes.td
+++ b/clang/include/clang/Basic/StmtNodes.td
@@ -238,6 +238,10 @@ def OMPUnrollDirective : StmtNode<OMPCanonicalLoopNestTransformationDirective>;
def OMPReverseDirective : StmtNode<OMPCanonicalLoopNestTransformationDirective>;
def OMPInterchangeDirective
: StmtNode<OMPCanonicalLoopNestTransformationDirective>;
+def OMPCanonicalLoopSequenceTransformationDirective
+ : StmtNode<OMPExecutableDirective, 1>;
+def OMPFuseDirective
+ : StmtNode<OMPCanonicalLoopSequenceTransformationDirective>;
def OMPForDirective : StmtNode<OMPLoopDirective>;
def OMPForSimdDirective : StmtNode<OMPLoopDirective>;
def OMPSectionsDirective : StmtNode<OMPExecutableDirective>;
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index a9a87fb586fc2..cdfd02ab724e2 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -6759,6 +6759,9 @@ class Parser : public CodeCompletionHandler {
OpenMPClauseKind Kind,
bool ParseOnly);
+ /// Parses the 'looprange' clause of a '#pragma omp fuse' directive.
+ OMPClause *ParseOpenMPLoopRangeClause();
+
/// Parses the 'sizes' clause of a '#pragma omp tile' directive.
OMPClause *ParseOpenMPSizesClause();
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index 91c3d4bd5210e..71818ebdba0e4 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -457,6 +457,13 @@ class SemaOpenMP : public SemaBase {
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc);
+
+ /// Called on well-formed '#pragma omp fuse' after parsing of its
+ /// clauses and the associated statement.
+ StmtResult ActOnOpenMPFuseDirective(ArrayRef<OMPClause *> Clauses,
+ Stmt *AStmt, SourceLocation StartLoc,
+ SourceLocation EndLoc);
+
/// Called on well-formed '\#pragma omp for' after parsing
/// of the associated statement.
StmtResult
@@ -915,6 +922,12 @@ class SemaOpenMP : public SemaBase {
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc);
+
+ /// Called on well-form 'looprange' clause after parsing its arguments.
+ OMPClause *
+ ActOnOpenMPLoopRangeClause(Expr *First, Expr *Count, SourceLocation StartLoc,
+ SourceLocation LParenLoc, SourceLocation FirstLoc,
+ SourceLocation CountLoc, SourceLocation EndLoc);
/// Called on well-formed 'ordered' clause.
OMPClause *
ActOnOpenMPOrderedClause(SourceLocation StartLoc, SourceLocation EndLoc,
@@ -1479,7 +1492,83 @@ class SemaOpenMP : public SemaBase {
bool checkTransformableLoopNest(
OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops,
SmallVectorImpl<OMPLoopBasedDirective::HelperExprs> &LoopHelpers,
- Stmt *&Body, SmallVectorImpl<SmallVector<Stmt *, 0>> &OriginalInits);
+ Stmt *&Body, SmallVectorImpl<SmallVector<Stmt *>> &OriginalInits);
+
+ /// Categories of loops encountered during semantic OpenMP loop
+ /// analysis.
+ ///
+ /// This enumeration identifies the structural category of a loop or sequence
+ /// of loops analyzed in the context of OpenMP transformations and directives.
+ /// This categorization helps differentiate between original source loops
+ /// and the structures resulting from applying OpenMP loop transformations.
+ enum class OMPLoopCategory {
+ /// @var OMPLoopCategory::RegularLoop
+ /// Represents a standard canonical loop nest found in the
+ /// original source code or an intact loop after transformations
+ /// (i.e Post/Pre loops of a loopranged fusion).
+ RegularLoop,
+
+ /// @var OMPLoopCategory::TransformSingleLoop
+ /// Represents the resulting loop structure when an OpenMP loop
+ /// transformation, generates a single, top-level loop.
+ TransformSingleLoop,
+ };
+
+ /// Holds the result of the analysis of a (possibly canonical) loop.
+ struct LoopAnalysis {
+ /// Category of the analyzed loop.
+ OMPLoopCategory Category;
+ /// Loop analyses results.
+ OMPLoopBasedDirective::HelperExprs HelperExprs;
+ /// The for-statement of the loop.
+ Stmt *ForStmt;
+ /// Initialization statements before transformations.
+ SmallVector<Stmt *> OriginalInits;
+ /// Initialization statements required after transformation of this loop.
+ SmallVector<Stmt *> TransformsPreInits;
+
+ explicit LoopAnalysis(OMPLoopCategory Category) : Category(Category) {}
+ };
+
+ /// Holds the result of the analysis of a (possibly canonical) loop sequence.
+ struct LoopSequenceAnalysis {
+ /// Number of top level canonical loops.
+ unsigned LoopSeqSize = 0;
+ /// For each loop results of the analysis.
+ SmallVector<LoopAnalysis, 2> Loops;
+ /// Additional code required before entering the transformed loop sequence.
+ SmallVector<Stmt *> LoopSequencePreInits;
+ };
+
+ /// The main recursive process of `checkTransformableLoopSequence` that
+ /// performs grammatical parsing of a canonical loop sequence. It extracts
+ /// key information, such as the number of top-level loops, loop statements,
+ /// helper expressions, and other relevant loop-related data, all in a single
+ /// execution to avoid redundant traversals. This analysis flattens inner
+ /// Loop Sequences
+ ///
+ /// \param LoopSeqStmt The AST of the original statement.
+ /// \param SeqAnalysis [out] Result of the analysis of \p LoopSeqStmt
+ /// \param Context
+ /// \param Kind The loop transformation directive kind.
+ /// \return Whether the original statement is both syntactically and
+ /// semantically correct according to OpenMP 6.0 canonical loop
+ /// sequence definition.
+ bool analyzeLoopSequence(Stmt *LoopSeqStmt, LoopSequenceAnalysis &SeqAnalysis,
+ ASTContext &Context, OpenMPDirectiveKind Kind);
+
+ /// Validates and checks whether a loop sequence can be transformed according
+ /// to the given directive, providing necessary setup and initialization
+ /// (Driver function) before recursion using `analyzeLoopSequence`.
+ ///
+ /// \param Kind The loop transformation directive kind.
+ /// \param AStmt The AST of the original statement
+ /// \param SeqAnalysis [out] Result of the analysis of \p LoopSeqStmt
+ /// \param Context
+ /// \return Whether there was an absence of errors or not
+ bool checkTransformableLoopSequence(OpenMPDirectiveKind Kind, Stmt *AStmt,
+ LoopSequenceAnalysis &SeqAnalysis,
+ ASTContext &Context);
/// Helper to keep information about the current `omp begin/end declare
/// variant` nesting.
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index 441047d64f48c..99864c7373908 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -1951,6 +1951,7 @@ enum StmtCode {
STMT_OMP_UNROLL_DIRECTIVE,
STMT_OMP_REVERSE_DIRECTIVE,
STMT_OMP_INTERCHANGE_DIRECTIVE,
+ STMT_OMP_FUSE_DIRECTIVE,
STMT_OMP_FOR_DIRECTIVE,
STMT_OMP_FOR_SIMD_DIRECTIVE,
STMT_OMP_SECTIONS_DIRECTIVE,
diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index 0930ca27c29f8..08ddbdb3f6cb2 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -1021,6 +1021,26 @@ OMPPartialClause *OMPPartialClause::CreateEmpty(const ASTContext &C) {
return new (C) OMPPartialClause();
}
+OMPLoopRangeClause *
+OMPLoopRangeClause::Create(const ASTContext &C, SourceLocation StartLoc,
+ SourceLocation LParenLoc, SourceLocation FirstLoc,
+ SourceLocation CountLoc, SourceLocation EndLoc,
+ Expr *First, Expr *Count) {
+ OMPLoopRangeClause *Clause = CreateEmpty(C);
+ Clause->setLocStart(StartLoc);
+ Clause->setLParenLoc(LParenLoc);
+ Clause->setFirstLoc(FirstLoc);
+ Clause->setCountLoc(CountLoc);
+ Clause->setLocEnd(EndLoc);
+ Clause->setFirst(First);
+ Clause->setCount(Count);
+ return Clause;
+}
+
+OMPLoopRangeClause *OMPLoopRangeClause::CreateEmpty(const ASTContext &C) {
+ return new (C) OMPLoopRangeClause();
+}
+
OMPAllocateClause *OMPAllocateClause::Create(
const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
Expr *Allocator, Expr *Alignment, SourceLocation ColonLoc,
@@ -1890,6 +1910,21 @@ void OMPClausePrinter::VisitOMPPartialClause(OMPPartialClause *Node) {
}
}
+void OMPClausePrinter::VisitOMPLoopRangeClause(OMPLoopRangeClause *Node) {
+ OS << "looprange";
+
+ Expr *First = Node->getFirst();
+ Expr *Count = Node->getCount();
+
+ if (First && Count) {
+ OS << "(";
+ First->printPretty(OS, nullptr, Policy, 0);
+ OS << ",";
+ Count->printPretty(OS, nullptr, Policy, 0);
+ OS << ")";
+ }
+}
+
void OMPClausePrinter::VisitOMPAllocatorClause(OMPAllocatorClause *Node) {
OS << "allocator(";
Node->getAllocator()->printPretty(OS, nullptr, Policy, 0);
diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp
index 1f6586f95a9f8..5ba6a52650bd4 100644
--- a/clang/lib/AST/StmtOpenMP.cpp
+++ b/clang/lib/AST/StmtOpenMP.cpp
@@ -125,13 +125,12 @@ OMPLoopBasedDirective::tryToFindNextInnerLoop(Stmt *CurStmt,
bool OMPLoopBasedDirective::doForAllLoops(
Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops,
llvm::function_ref<bool(unsigned, Stmt *)> Callback,
- llvm::function_ref<void(OMPCanonicalLoopNestTransformationDirective *)>
+ llvm::function_ref<void(OMPLoopTransformationDirective *)>
OnTransformationCallback) {
CurStmt = CurStmt->IgnoreContainers();
for (unsigned Cnt = 0; Cnt < NumLoops; ++Cnt) {
while (true) {
- auto *Dir =
- dyn_cast<OMPCanonicalLoopNestTransformationDirective>(CurStmt);
+ auto *Dir = dyn_cast<OMPLoopTransformationDirective>(CurStmt);
if (!Dir)
break;
@@ -371,6 +370,26 @@ OMPForDirective *OMPForDirective::Create(
return Dir;
}
+Stmt *OMPLoopTransformationDirective::getTransformedStmt() const {
+ if (auto *D = dyn_cast<OMPCanonicalLoopNestTransformationDirective>(S)) {
+ return D->getTransformedStmt();
+ }
+ if (auto *D = dyn_cast<OMPCanonicalLoopSequenceTransformationDirective>(S)) {
+ return D->getTransformedStmt();
+ }
+ llvm_unreachable("unexpected object type");
+}
+
+Stmt *OMPLoopTransformationDirective::getPreInits() const {
+ if (auto *D = dyn_cast<OMPCanonicalLoopNestTransformationDirective>(S)) {
+ return D->getPreInits();
+ }
+ if (auto *D = dyn_cast<OMPCanonicalLoopSequenceTransformationDirective>(S)) {
+ return D->getPreInits();
+ }
+ llvm_unreachable("unexpected object type");
+}
+
Stmt *OMPCanonicalLoopNestTransformationDirective::getTransformedStmt() const {
switch (getStmtClass()) {
#define STMT(CLASS, PARENT)
@@ -380,7 +399,7 @@ Stmt *OMPCanonicalLoopNestTransformationDirective::getTransformedStmt() const {
return static_cast<const CLASS *>(this)->getTransformedStmt();
#include "clang/AST/StmtNodes.inc"
default:
- llvm_unreachable("Not a loop transformation");
+ llvm_unreachable("Not a loop transformation for canonical loop nests");
}
}
@@ -393,7 +412,34 @@ Stmt *OMPCanonicalLoopNestTransformationDirective::getPreInits() const {
return static_cast<const CLASS *>(this)->getPreInits();
#include "clang/AST/StmtNodes.inc"
default:
- llvm_unreachable("Not a loop transformation");
+ llvm_unreachable("Not a loop transformation for canonical loop nests");
+ }
+}
+
+Stmt *
+OMPCanonicalLoopSequenceTransformationDirective::getTransformedStmt() const {
+ switch (getStmtClass()) {
+#define STMT(CLASS, PARENT)
+#define ABSTRACT_STMT(CLASS)
+#define OMPCANONICALLOOPSEQUENCETRANSFORMATIONDIRECTIVE(CLASS, PARENT) \
+ case Stmt::CLASS##Class: \
+ return static_cast<const CLASS *>(this)->getTransformedStmt();
+#include "clang/AST/StmtNodes.inc"
+ default:
+ llvm_unreachable("Not a loop transformation for canonical loop sequences");
+ }
+}
+
+Stmt *OMPCanonicalLoopSequenceTransformationDirective::getPreInits() const {
+ switch (getStmtClass()) {
+#define STMT(CLASS, PARENT)
+#define ABSTRACT_STMT(CLASS)
+#define OMPCANONICALLOOPSEQUENCETRANSFORMATIONDIRECTIVE(CLASS, PARENT) \
+ case Stmt::CLASS##Class: \
+ return static_cast<const CLASS *>(this)->getPreInits();
+#include "clang/AST/StmtNodes.inc"
+ default:
+ llvm_unreachable("Not a loop transformation for canonical loop sequences");
}
}
@@ -510,6 +556,31 @@ OMPInterchangeDirective::CreateEmpty(const ASTContext &C, unsigned NumClauses,
SourceLocation(), SourceLocation(), NumLoops);
}
+OMPFuseDirective *
+OMPFuseDirective::Create(const ASTContext &C, SourceLocation StartLoc,
+ SourceLocation EndLoc, ArrayRef<OMPClause *> Clauses,
+ unsigned NumLoops, unsigned NumGeneratedTopLevelLoops,
+ Stmt *AssociatedStmt, Stmt *TransformedStmt,
+ Stmt *PreInits) {
+
+ OMPFuseDirective *Dir = createDirective<OMPFuseDirective>(
+ C, Clauses, AssociatedStmt, TransformedStmtOffset + 1, StartLoc, EndLoc,
+ NumLoops);
+ Dir->setTransformedStmt(TransformedStmt);
+ Dir->setPreInits(PreInits);
+ Dir->setNumGeneratedTopLevelLoops(NumGeneratedTopLevelLoops);
+ return Dir;
+}
+
+OMPFuseDirective *OMPFuseDirective::CreateEmpty(const ASTContext &C,
+ unsigned NumClauses,
+ unsigned NumLoops) {
+ OMPFuseDirective *Dir = createEmptyDirective<OMPFuseDirective>(
+ C, NumClauses, /*HasAssociatedStmt=*/true, TransformedStmtOffset + 1,
+ SourceLocation(), SourceLocation(), NumLoops);
+ return Dir;
+}
+
OMPForSimdDirective *
OMPForSimdDirective::Create(const ASTContext &C, SourceLocation StartLoc,
SourceLocation EndLoc, unsigned CollapsedNum,
diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
index 0030300521128..3bf146f3663ad 100644
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -795,6 +795,11 @@ void StmtPrinter::VisitOMPInterchangeDirective(OMPInterchangeDirective *Node) {
PrintOMPExecutableDirective(Node);
}
+void StmtPrinter::VisitOMPFuseDirective(OMPFuseDirective *Node) {
+ Indent() << "#pragma omp fuse";
+ PrintOMPExecutableDirective(Node);
+}
+
void StmtPrinter::VisitOMPForDirective(OMPForDirective *Node) {
Indent() << "#pragma omp for";
PrintOMPExecutableDirective(Node);
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 37c4d43ec0b2f..1a6607066bbfe 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -510,6 +510,13 @@ void OMPClauseProfiler::VisitOMPPartialClause(const OMPPartialClause *C) {
Profiler->VisitExpr(Factor);
}
+void OMPClauseProfiler::VisitOMPLoopRangeClause(const OMPLoopRangeClause *C) {
+ if (const Expr *First = C->getFirst())
+ Profiler->VisitExpr(First);
+ if (const Expr *Count = C->getCount())
+ Profiler->VisitExpr(Count);
+}
+
void OMPClauseProfiler::VisitOMPAllocatorClause(const OMPAllocatorClause *C) {
if (C->getAllocator())
Profiler->VisitStmt(C->getAllocator());
@@ -1025,6 +1032,15 @@ void StmtProfiler::VisitOMPInterchangeDirective(
VisitOMPCanonicalLoopNestTransformationDirective(S);
}
+void StmtProfiler::VisitOMPCanonicalLoopSequenceTransformationDirective(
+ const OMPCanonicalLoopSequenceTransformationDirective *S) {
+ VisitOMPExecutableDirective(S);
+}
+
+void StmtProfiler::VisitOMPFuseDirective(const OMPFuseDirective *S) {
+ VisitOMPCanonicalLoopSequenceTransformationDirective(S);
+}
+
void StmtProfiler::VisitOMPForDirective(const OMPForDirective *S) {
VisitOMPLoopDirective(S);
}
diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp
index 3f8f64df8702e..246a569940155 100644
--- a/clang/lib/Basic/OpenMPKinds.cpp
+++ b/clang/lib/Basic/OpenMPKinds.cpp
@@ -256,6 +256,7 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str,
case OMPC_affinity:
case OMPC_when:
case OMPC_append_args:
+ case OMPC_looprange:
break;
default:
break;
@@ -600,6 +601,7 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind,
case OMPC_affinity:
case OMPC_when:
case OMPC_append_args:
+ case OMPC_looprange:
break;
default:
break;
@@ -723,9 +725,14 @@ bool clang::isOpenMPCanonicalLoopNestTransformationDirective(
DKind == OMPD_interchange || DKind == OMPD_stripe;
}
+bool clang::isOpenMPCanonicalLoopSequenceTransformationDirective(
+ OpenMPDirectiveKind DKind) {
+ return DKind == OMPD_fuse;
+}
+
bool clang::isOpenMPLoopTransformationDirective(OpenMPDirectiveKind DKind) {
- // FIXME: There will be more cases when we implement 'fuse'.
- return isOpenMPCanonicalLoopNestTransformationDirective(DKind);
+ return isOpenMPCanonicalLoopNestTransformationDirective(DKind) ||
+ isOpenMPCanonicalLoopSequenceTransformationDirective(DKind);
}
bool clang::isOpenMPCombinedParallelADirective(OpenMPDirectiveKind DKind) {
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index aeff73d525c10..79ab728b4e15c 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -234,6 +234,9 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef<const Attr *> Attrs) {
case Stmt::OMPInterchangeDirectiveClass:
EmitOMPInterchangeDirective(cast<OMPInterchangeDirective>(*S));
break;
+ case Stmt::OMPFuseDirectiveClass:
+ EmitOMPFuseDirective(cast<OMPFuseDirective>(*S));
+ break;
case Stmt::OMPForDirectiveClass:
EmitOMPForDirective(cast<OMPForDirective>(*S));
break;
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 1360680cc6640..d9d6b6d107bec 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -201,6 +201,24 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope {
} else {
llvm_unreachable("Unknown loop-based directive kind.");
}
+ doEmitPreinits(PreInits);
+ PreCondVars.restore(CGF);
+ }
+
+ void
+ emitPreInitStmt(CodeGenFunction &CGF,
+ const OMPCanonicalLoopSequenceTransformationDirective &S) {
+ const Stmt *PreInits;
+ if (const auto *Fuse = dyn_cast<OMPFuseDirective>(&S)) {
+ PreInits = Fuse->getPreInits();
+ } else {
+ llvm_unreachable(
+ "Unknown canonical loop sequence transform directive kind.");
+ }
+ doEmitPreinits(PreInits);
+ }
+
+ void doEmitPreinits(const Stmt *PreInits) {
if (PreInits) {
// CompoundStmts and DeclStmts are used as lists of PreInit statements and
// declarations. Since declarations must be visible in the the following
@@ -222,7 +240,6 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope {
CGF.EmitStmt(S);
}
}
- PreCondVars.restore(CGF);
}
public:
@@ -230,6 +247,11 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope {
: CodeGenFunction::RunCleanupsScope(CGF) {
emitPreInitStmt(CGF, S);
}
+ OMPLoopScope(CodeGenFunction &CGF,
+ const OMPCanonicalLoopSequenceTransformationDirective &S)
+ : CodeGenFunction::RunCleanupsScope(CGF) {
+ emitPreInitStmt(CGF, S);
+ }
};
class OMPSimdLexicalScope : public CodeGenFunction::LexicalScope {
@@ -1912,6 +1934,15 @@ class OMPTransformDirectiveScopeRAII {
CGSI = new CodeGenFunction::CGCapturedStmtInfo(CR_OpenMP);
CapInfoRAII = new CodeGenFunction::CGCapturedStmtRAII(CGF, CGSI);
}
+ if (const auto *Dir =
+ dyn_cast<OMPCanonicalLoopSequenceTransformationDirective>(S)) {
+ // For simplicity we reuse the loop scope similarly to what we do with
+ // OMPCanonicalLoopNestTransformationDirective do by being a subclass
+ // of OMPLoopBasedDirective.
+ Scope = new OMPLoopScope(CGF, *Dir);
+ CGSI = new CodeGenFunction::CGCapturedStmtInfo(CR_OpenMP);
+ CapInfoRAII = new CodeGenFunction::CGCapturedStmtRAII(CGF, CGSI);
+ }
}
~OMPTransformDirectiveScopeRAII() {
if (!Scope)
@@ -1939,8 +1970,7 @@ static void emitBody(CodeGenFunction &CGF, const Stmt *S, const Stmt *NextLoop,
return;
}
if (SimplifiedS == NextLoop) {
- if (auto *Dir =
- dyn_cast<OMPCanonicalLoopNestTransformationDirective>(SimplifiedS))
+ if (auto *Dir = dyn_cast<OMPLoopTransformationDirective>(SimplifiedS))
SimplifiedS = Dir->getTransformedStmt();
if (const auto *CanonLoop = dyn_cast<OMPCanonicalLoop>(SimplifiedS))
SimplifiedS = CanonLoop->getLoopStmt();
@@ -2935,6 +2965,12 @@ void CodeGenFunction::EmitOMPInterchangeDirective(
EmitStmt(S.getTransformedStmt());
}
+void CodeGenFunction::EmitOMPFuseDirective(const OMPFuseDirective &S) {
+ // Emit the de-sugared statement
+ OMPTransformDirectiveScopeRAII FuseScope(*this, &S);
+ EmitStmt(S.getTransformedStmt());
+}
+
void CodeGenFunction::EmitOMPUnrollDirective(const OMPUnrollDirective &S) {
bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIRBuilder;
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 123cb4f51f828..39dff982f428e 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3860,6 +3860,7 @@ class CodeGenFunction : public CodeGenTypeCache {
void EmitOMPUnrollDirective(const OMPUnrollDirective &S);
void EmitOMPReverseDirective(const OMPReverseDirective &S);
void EmitOMPInterchangeDirective(const OMPInterchangeDirective &S);
+ void EmitOMPFuseDirective(const OMPFuseDirective &S);
void EmitOMPForDirective(const OMPForDirective &S);
void EmitOMPForSimdDirective(const OMPForSimdDirective &S);
void EmitOMPScopeDirective(const OMPScopeDirective &S);
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 5db2f2e2ccf86..d48e6436ba9aa 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2937,6 +2937,39 @@ OMPClause *Parser::ParseOpenMPSizesClause() {
OpenLoc, CloseLoc);
}
+OMPClause *Parser::ParseOpenMPLoopRangeClause() {
+ SourceLocation ClauseNameLoc = ConsumeToken();
+ SourceLocation FirstLoc, CountLoc;
+
+ BalancedDelimiterTracker T(*this, tok::l_paren, tok::annot_pragma_openmp_end);
+ if (T.consumeOpen()) {
+ Diag(Tok, diag::err_expected) << tok::l_paren;
+ return nullptr;
+ }
+
+ FirstLoc = Tok.getLocation();
+ ExprResult FirstVal = ParseConstantExpression();
+ if (!FirstVal.isUsable()) {
+ T.skipToEnd();
+ return nullptr;
+ }
+
+ ExpectAndConsume(tok::comma);
+
+ CountLoc = Tok.getLocation();
+ ExprResult CountVal = ParseConstantExpression();
+ if (!CountVal.isUsable()) {
+ T.skipToEnd();
+ return nullptr;
+ }
+
+ T.consumeClose();
+
+ return Actions.OpenMP().ActOnOpenMPLoopRangeClause(
+ FirstVal.get(), CountVal.get(), ClauseNameLoc, T.getOpenLocation(),
+ FirstLoc, CountLoc, T.getCloseLocation());
+}
+
OMPClause *Parser::ParseOpenMPPermutationClause() {
SourceLocation ClauseNameLoc, OpenLoc, CloseLoc;
SmallVector<Expr *> ArgExprs;
@@ -3366,6 +3399,9 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
}
Clause = ParseOpenMPClause(CKind, WrongDirective);
break;
+ case OMPC_looprange:
+ Clause = ParseOpenMPLoopRangeClause();
+ break;
default:
break;
}
diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp
index 552c92996dc2e..a0483c3027199 100644
--- a/clang/lib/Sema/SemaExceptionSpec.cpp
+++ b/clang/lib/Sema/SemaExceptionSpec.cpp
@@ -1493,6 +1493,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) {
case Stmt::OMPUnrollDirectiveClass:
case Stmt::OMPReverseDirectiveClass:
case Stmt::OMPInterchangeDirectiveClass:
+ case Stmt::OMPFuseDirectiveClass:
case Stmt::OMPSingleDirectiveClass:
case Stmt::OMPTargetDataDirectiveClass:
case Stmt::OMPTargetDirectiveClass:
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 60f0317020c59..e5b6d48ad792d 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -4406,6 +4406,7 @@ void SemaOpenMP::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind,
case OMPD_unroll:
case OMPD_reverse:
case OMPD_interchange:
+ case OMPD_fuse:
case OMPD_assume:
break;
default:
@@ -6247,6 +6248,10 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
Res = ActOnOpenMPInterchangeDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc);
break;
+ case OMPD_fuse:
+ Res =
+ ActOnOpenMPFuseDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc);
+ break;
case OMPD_for:
Res = ActOnOpenMPForDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc,
VarsWithInheritedDSA);
@@ -9324,7 +9329,9 @@ static bool checkOpenMPIterationSpace(
// sharing attributes.
VarsWithImplicitDSA.erase(LCDecl);
- assert(isOpenMPLoopDirective(DKind) && "DSA for non-loop vars");
+ assert((isOpenMPLoopDirective(DKind) ||
+ isOpenMPCanonicalLoopSequenceTransformationDirective(DKind)) &&
+ "DSA for non-loop vars");
// Check test-expr.
HasErrors |= ISC.checkAndSetCond(For ? For->getCond() : CXXFor->getCond());
@@ -9752,7 +9759,8 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
unsigned NumLoops = std::max(OrderedLoopCount, NestedLoopCount);
SmallVector<LoopIterationSpace, 4> IterSpaces(NumLoops);
if (!OMPLoopBasedDirective::doForAllLoops(
- AStmt->IgnoreContainers(!isOpenMPLoopTransformationDirective(DKind)),
+ AStmt->IgnoreContainers(
+ !isOpenMPCanonicalLoopNestTransformationDirective(DKind)),
SupportsNonPerfectlyNested, NumLoops,
[DKind, &SemaRef, &DSA, NumLoops, NestedLoopCount,
CollapseLoopCountExpr, OrderedLoopCountExpr, &VarsWithImplicitDSA,
@@ -9774,8 +9782,7 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
}
return false;
},
- [&SemaRef,
- &Captures](OMPCanonicalLoopNestTransformationDirective *Transform) {
+ [&SemaRef, &Captures](OMPLoopTransformationDirective *Transform) {
Stmt *DependentPreInits = Transform->getPreInits();
if (!DependentPreInits)
return;
@@ -9790,7 +9797,8 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
auto *D = cast<VarDecl>(C);
DeclRefExpr *Ref = buildDeclRefExpr(
SemaRef, D, D->getType().getNonReferenceType(),
- Transform->getBeginLoc());
+ cast<OMPExecutableDirective>(Transform->getDirective())
+ ->getBeginLoc());
Captures[Ref] = Ref;
}
}
@@ -14240,10 +14248,34 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeSimdDirective(
getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
}
+/// Updates OriginalInits by checking Transform against loop transformation
+/// directives and appending their pre-inits if a match is found.
+static void updatePreInits(OMPLoopTransformationDirective *Transform,
+ SmallVectorImpl<Stmt *> &PreInits) {
+ Stmt *Dir = Transform->getDirective();
+ switch (Dir->getStmtClass()) {
+#define STMT(CLASS, PARENT)
+#define ABSTRACT_STMT(CLASS)
+#define COMMON_OMP_LOOP_TRANSFORMATION(CLASS, PARENT) \
+ case Stmt::CLASS##Class: \
+ appendFlattenedStmtList(PreInits, \
+ static_cast<const CLASS *>(Dir)->getPreInits()); \
+ break;
+#define OMPCANONICALLOOPNESTTRANSFORMATIONDIRECTIVE(CLASS, PARENT) \
+ COMMON_OMP_LOOP_TRANSFORMATION(CLASS, PARENT)
+#define OMPCANONICALLOOPSEQUENCETRANSFORMATIONDIRECTIVE(CLASS, PARENT) \
+ COMMON_OMP_LOOP_TRANSFORMATION(CLASS, PARENT)
+#include "clang/AST/StmtNodes.inc"
+#undef COMMON_OMP_LOOP_TRANSFORMATION
+ default:
+ llvm_unreachable("Not a loop transformation");
+ }
+}
+
bool SemaOpenMP::checkTransformableLoopNest(
OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops,
SmallVectorImpl<OMPLoopBasedDirective::HelperExprs> &LoopHelpers,
- Stmt *&Body, SmallVectorImpl<SmallVector<Stmt *, 0>> &OriginalInits) {
+ Stmt *&Body, SmallVectorImpl<SmallVector<Stmt *>> &OriginalInits) {
OriginalInits.emplace_back();
bool Result = OMPLoopBasedDirective::doForAllLoops(
AStmt->IgnoreContainers(), /*TryImperfectlyNestedLoops=*/false, NumLoops,
@@ -14269,29 +14301,278 @@ bool SemaOpenMP::checkTransformableLoopNest(
OriginalInits.emplace_back();
return false;
},
- [&OriginalInits](OMPLoopBasedDirective *Transform) {
- Stmt *DependentPreInits;
- if (auto *Dir = dyn_cast<OMPTileDirective>(Transform))
- DependentPreInits = Dir->getPreInits();
- else if (auto *Dir = dyn_cast<OMPStripeDirective>(Transform))
- DependentPreInits = Dir->getPreInits();
- else if (auto *Dir = dyn_cast<OMPUnrollDirective>(Transform))
- DependentPreInits = Dir->getPreInits();
- else if (auto *Dir = dyn_cast<OMPReverseDirective>(Transform))
- DependentPreInits = Dir->getPreInits();
- else if (auto *Dir = dyn_cast<OMPInterchangeDirective>(Transform))
- DependentPreInits = Dir->getPreInits();
- else
- llvm_unreachable("Unhandled loop transformation");
-
- appendFlattenedStmtList(OriginalInits.back(), DependentPreInits);
+ [&OriginalInits](OMPLoopTransformationDirective *Transform) {
+ updatePreInits(Transform, OriginalInits.back());
});
assert(OriginalInits.back().empty() && "No preinit after innermost loop");
OriginalInits.pop_back();
return Result;
}
-/// Add preinit statements that need to be propageted from the selected loop.
+/// Counts the total number of OpenMP canonical nested loops, including the
+/// outermost loop (the original loop). PRECONDITION of this visitor is that it
+/// must be invoked from the original loop to be analyzed. The traversal stops
+/// for Decl's and Expr's given that they may contain inner loops that must not
+/// be counted.
+///
+/// Example AST structure for the code:
+///
+/// int main() {
+/// #pragma omp fuse
+/// {
+/// for (int i = 0; i < 100; i++) { <-- Outer loop
+/// []() {
+/// for(int j = 0; j < 100; j++) {} <-- NOT A LOOP (1)
+/// };
+/// for(int j = 0; j < 5; ++j) {} <-- Inner loop
+/// }
+/// for (int r = 0; i < 100; i++) { <-- Outer loop
+/// struct LocalClass {
+/// void bar() {
+/// for(int j = 0; j < 100; j++) {} <-- NOT A LOOP (2)
+/// }
+/// };
+/// for(int k = 0; k < 10; ++k) {} <-- Inner loop
+/// {x = 5; for(k = 0; k < 10; ++k) x += k; x}; <-- NOT A LOOP (3)
+/// }
+/// }
+/// }
+/// (1) because in a different function (here: a lambda)
+/// (2) because in a different function (here: class method)
+/// (3) because considered to be intervening-code of non-perfectly nested loop
+/// Result: Loop 'i' contains 2 loops, Loop 'r' also contains 2 loops.
+class NestedLoopCounterVisitor final : public DynamicRecursiveASTVisitor {
+private:
+ unsigned NestedLoopCount = 0;
+
+public:
+ explicit NestedLoopCounterVisitor() = default;
+
+ unsigned getNestedLoopCount() const { return NestedLoopCount; }
+
+ bool VisitForStmt(ForStmt *FS) override {
+ ++NestedLoopCount;
+ return true;
+ }
+
+ bool VisitCXXForRangeStmt(CXXForRangeStmt *FRS) override {
+ ++NestedLoopCount;
+ return true;
+ }
+
+ bool TraverseStmt(Stmt *S) override {
+ if (!S)
+ return true;
+
+ // Skip traversal of all expressions, including special cases like
+ // LambdaExpr, StmtExpr, BlockExpr, and RequiresExpr. These expressions
+ // may contain inner statements (and even loops), but they are not part
+ // of the syntactic body of the surrounding loop structure.
+ // Therefore must not be counted.
+ if (isa<Expr>(S))
+ return true;
+
+ // Only recurse into CompoundStmt (block {}) and loop bodies.
+ if (isa<CompoundStmt, ForStmt, CXXForRangeStmt>(S)) {
+ return DynamicRecursiveASTVisitor::TraverseStmt(S);
+ }
+
+ // Stop traversal of the rest of statements, that break perfect
+ // loop nesting, such as control flow (IfStmt, SwitchStmt...).
+ return true;
+ }
+
+ bool TraverseDecl(Decl *D) override {
+ // Stop in the case of finding a declaration, it is not important
+ // in order to find nested loops (Possible CXXRecordDecl, RecordDecl,
+ // FunctionDecl...).
+ return true;
+ }
+};
+
+bool SemaOpenMP::analyzeLoopSequence(Stmt *LoopSeqStmt,
+ LoopSequenceAnalysis &SeqAnalysis,
+ ASTContext &Context,
+ OpenMPDirectiveKind Kind) {
+ VarsWithInheritedDSAType TmpDSA;
+ // Helper Lambda to handle storing initialization and body statements for
+ // both ForStmt and CXXForRangeStmt.
+ auto StoreLoopStatements = [](LoopAnalysis &Analysis, Stmt *LoopStmt) {
+ if (auto *For = dyn_cast<ForStmt>(LoopStmt)) {
+ Analysis.OriginalInits.push_back(For->getInit());
+ Analysis.ForStmt = For;
+ } else {
+ auto *CXXFor = cast<CXXForRangeStmt>(LoopStmt);
+ Analysis.OriginalInits.push_back(CXXFor->getBeginStmt());
+ Analysis.ForStmt = CXXFor;
+ }
+ };
+
+ // Helper lambda functions to encapsulate the processing of different
+ // derivations of the canonical loop sequence grammar
+ // Modularized code for handling loop generation and transformations.
+ auto AnalyzeLoopGeneration = [&](Stmt *Child) {
+ auto *LoopTransform = cast<OMPLoopTransformationDirective>(Child);
+ Stmt *TransformedStmt = LoopTransform->getTransformedStmt();
+ unsigned NumGeneratedTopLevelLoops =
+ LoopTransform->getNumGeneratedTopLevelLoops();
+ // Handle the case where transformed statement is not available due to
+ // dependent contexts
+ if (!TransformedStmt) {
+ if (NumGeneratedTopLevelLoops > 0) {
+ SeqAnalysis.LoopSeqSize += NumGeneratedTopLevelLoops;
+ return true;
+ }
+ // Unroll full (0 loops produced)
+ Diag(Child->getBeginLoc(), diag::err_omp_not_for)
+ << 0 << getOpenMPDirectiveName(Kind);
+ return false;
+ }
+ // Handle loop transformations with multiple loop nests
+ // Unroll full
+ if (!NumGeneratedTopLevelLoops) {
+ Diag(Child->getBeginLoc(), diag::err_omp_not_for)
+ << 0 << getOpenMPDirectiveName(Kind);
+ return false;
+ }
+ // Loop transformatons such as split or loopranged fuse
+ if (NumGeneratedTopLevelLoops > 1) {
+ // Get the preinits related to this loop sequence generating
+ // loop transformation (i.e loopranged fuse, split...)
+ // These preinits differ slightly from regular inits/pre-inits related
+ // to single loop generating loop transformations (interchange, unroll)
+ // given that they are not bounded to a particular loop nest
+ // so they need to be treated independently
+ updatePreInits(LoopTransform, SeqAnalysis.LoopSequencePreInits);
+ return analyzeLoopSequence(TransformedStmt, SeqAnalysis, Context, Kind);
+ }
+ // Vast majority: (Tile, Unroll, Stripe, Reverse, Interchange, Fuse all)
+ // Process the transformed loop statement
+ LoopAnalysis &NewTransformedSingleLoop =
+ SeqAnalysis.Loops.emplace_back(OMPLoopCategory::TransformSingleLoop);
+ unsigned IsCanonical = checkOpenMPLoop(
+ Kind, nullptr, nullptr, TransformedStmt, SemaRef, *DSAStack, TmpDSA,
+ NewTransformedSingleLoop.HelperExprs);
+
+ if (!IsCanonical)
+ return false;
+
+ StoreLoopStatements(NewTransformedSingleLoop, TransformedStmt);
+ updatePreInits(LoopTransform, NewTransformedSingleLoop.TransformsPreInits);
+
+ SeqAnalysis.LoopSeqSize++;
+ return true;
+ };
+
+ // Modularized code for handling regular canonical loops.
+ auto AnalyzeRegularLoop = [&](Stmt *Child) {
+ LoopAnalysis &NewRegularLoop =
+ SeqAnalysis.Loops.emplace_back(OMPLoopCategory::RegularLoop);
+ unsigned IsCanonical =
+ checkOpenMPLoop(Kind, nullptr, nullptr, Child, SemaRef, *DSAStack,
+ TmpDSA, NewRegularLoop.HelperExprs);
+
+ if (!IsCanonical)
+ return false;
+
+ StoreLoopStatements(NewRegularLoop, Child);
+ NestedLoopCounterVisitor NLCV;
+ NLCV.TraverseStmt(Child);
+ return true;
+ };
+
+ // Helper functions to validate loop sequence grammar derivations.
+ auto IsLoopSequenceDerivation = [](auto *Child) {
+ return isa<ForStmt, CXXForRangeStmt, OMPLoopTransformationDirective>(Child);
+ };
+ // Helper functions to validate loop generating grammar derivations.
+ auto IsLoopGeneratingStmt = [](auto *Child) {
+ return isa<OMPLoopTransformationDirective>(Child);
+ };
+
+ // High level grammar validation.
+ for (Stmt *Child : LoopSeqStmt->children()) {
+ if (!Child)
+ continue;
+ // Skip over non-loop-sequence statements.
+ if (!IsLoopSequenceDerivation(Child)) {
+ Child = Child->IgnoreContainers();
+ // Ignore empty compound statement.
+ if (!Child)
+ continue;
+ // In the case of a nested loop sequence ignoring containers would not
+ // be enough, a recurisve transversal of the loop sequence is required.
+ if (isa<CompoundStmt>(Child)) {
+ if (!analyzeLoopSequence(Child, SeqAnalysis, Context, Kind))
+ return false;
+ // Already been treated, skip this children
+ continue;
+ }
+ }
+ // Regular loop sequence handling.
+ if (IsLoopSequenceDerivation(Child)) {
+ if (IsLoopGeneratingStmt(Child)) {
+ if (!AnalyzeLoopGeneration(Child))
+ return false;
+ // AnalyzeLoopGeneration updates SeqAnalysis.LoopSeqSize accordingly.
+ } else {
+ if (!AnalyzeRegularLoop(Child))
+ return false;
+ SeqAnalysis.LoopSeqSize++;
+ }
+ } else {
+ // Report error for invalid statement inside canonical loop sequence.
+ Diag(Child->getBeginLoc(), diag::err_omp_not_for)
+ << 0 << getOpenMPDirectiveName(Kind);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool SemaOpenMP::checkTransformableLoopSequence(
+ OpenMPDirectiveKind Kind, Stmt *AStmt, LoopSequenceAnalysis &SeqAnalysis,
+ ASTContext &Context) {
+ // Following OpenMP 6.0 API Specification, a Canonical Loop Sequence follows
+ // the grammar:
+ //
+ // canonical-loop-sequence:
+ // {
+ // loop-sequence+
+ // }
+ // where loop-sequence can be any of the following:
+ // 1. canonical-loop-sequence
+ // 2. loop-nest
+ // 3. loop-sequence-generating-construct (i.e OMPLoopTransformationDirective)
+ //
+ // To recognise and traverse this structure the helper function
+ // analyzeLoopSequence serves as the recurisve entry point
+ // and tries to match the input AST to the canonical loop sequence grammar
+ // structure. This function will perform both a semantic and syntactical
+ // analysis of the given statement according to OpenMP 6.0 definition of
+ // the aforementioned canonical loop sequence.
+
+ // We expect an outer compound statement.
+ if (!isa<CompoundStmt>(AStmt)) {
+ Diag(AStmt->getBeginLoc(), diag::err_omp_not_a_loop_sequence)
+ << getOpenMPDirectiveName(Kind);
+ return false;
+ }
+
+ // Recursive entry point to process the main loop sequence
+ if (!analyzeLoopSequence(AStmt, SeqAnalysis, Context, Kind))
+ return false;
+
+ // Diagnose an empty loop sequence.
+ if (SeqAnalysis.LoopSeqSize <= 0) {
+ Diag(AStmt->getBeginLoc(), diag::err_omp_empty_loop_sequence)
+ << getOpenMPDirectiveName(Kind);
+ return false;
+ }
+ return true;
+}
+
+/// Add preinit statements that need to be propagated from the selected loop.
static void addLoopPreInits(ASTContext &Context,
OMPLoopBasedDirective::HelperExprs &LoopHelper,
Stmt *LoopStmt, ArrayRef<Stmt *> OriginalInit,
@@ -14376,7 +14657,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
// Verify and diagnose loop nest.
SmallVector<OMPLoopBasedDirective::HelperExprs, 4> LoopHelpers(NumLoops);
Stmt *Body = nullptr;
- SmallVector<SmallVector<Stmt *, 0>, 4> OriginalInits;
+ SmallVector<SmallVector<Stmt *>, 4> OriginalInits;
if (!checkTransformableLoopNest(OMPD_tile, AStmt, NumLoops, LoopHelpers, Body,
OriginalInits))
return StmtError();
@@ -14653,7 +14934,7 @@ StmtResult SemaOpenMP::ActOnOpenMPStripeDirective(ArrayRef<OMPClause *> Clauses,
// Verify and diagnose loop nest.
SmallVector<OMPLoopBasedDirective::HelperExprs, 4> LoopHelpers(NumLoops);
Stmt *Body = nullptr;
- SmallVector<SmallVector<Stmt *, 0>, 4> OriginalInits;
+ SmallVector<SmallVector<Stmt *>, 4> OriginalInits;
if (!checkTransformableLoopNest(OMPD_stripe, AStmt, NumLoops, LoopHelpers,
Body, OriginalInits))
return StmtError();
@@ -14914,7 +15195,7 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
Stmt *Body = nullptr;
SmallVector<OMPLoopBasedDirective::HelperExprs, NumLoops> LoopHelpers(
NumLoops);
- SmallVector<SmallVector<Stmt *, 0>, NumLoops + 1> OriginalInits;
+ SmallVector<SmallVector<Stmt *>, NumLoops + 1> OriginalInits;
if (!checkTransformableLoopNest(OMPD_unroll, AStmt, NumLoops, LoopHelpers,
Body, OriginalInits))
return StmtError();
@@ -15184,7 +15465,7 @@ StmtResult SemaOpenMP::ActOnOpenMPReverseDirective(Stmt *AStmt,
Stmt *Body = nullptr;
SmallVector<OMPLoopBasedDirective::HelperExprs, NumLoops> LoopHelpers(
NumLoops);
- SmallVector<SmallVector<Stmt *, 0>, NumLoops + 1> OriginalInits;
+ SmallVector<SmallVector<Stmt *>, NumLoops + 1> OriginalInits;
if (!checkTransformableLoopNest(OMPD_reverse, AStmt, NumLoops, LoopHelpers,
Body, OriginalInits))
return StmtError();
@@ -15376,7 +15657,7 @@ StmtResult SemaOpenMP::ActOnOpenMPInterchangeDirective(
// Verify and diagnose loop nest.
SmallVector<OMPLoopBasedDirective::HelperExprs, 4> LoopHelpers(NumLoops);
Stmt *Body = nullptr;
- SmallVector<SmallVector<Stmt *, 0>, 2> OriginalInits;
+ SmallVector<SmallVector<Stmt *>, 2> OriginalInits;
if (!checkTransformableLoopNest(OMPD_interchange, AStmt, NumLoops,
LoopHelpers, Body, OriginalInits))
return StmtError();
@@ -15552,6 +15833,492 @@ StmtResult SemaOpenMP::ActOnOpenMPInterchangeDirective(
buildPreInits(Context, PreInits));
}
+StmtResult SemaOpenMP::ActOnOpenMPFuseDirective(ArrayRef<OMPClause *> Clauses,
+ Stmt *AStmt,
+ SourceLocation StartLoc,
+ SourceLocation EndLoc) {
+
+ ASTContext &Context = getASTContext();
+ DeclContext *CurrContext = SemaRef.CurContext;
+ Scope *CurScope = SemaRef.getCurScope();
+ CaptureVars CopyTransformer(SemaRef);
+
+ // Ensure the structured block is not empty
+ if (!AStmt)
+ return StmtError();
+
+ // Defer transformation in dependent contexts
+ // The NumLoopNests argument is set to a placeholder 1 (even though
+ // using looprange fuse could yield up to 3 top level loop nests)
+ // because a dependent context could prevent determining its true value
+ if (CurrContext->isDependentContext()) {
+ return OMPFuseDirective::Create(Context, StartLoc, EndLoc, Clauses,
+ /* NumLoops */ 1, /* LoopSeqSize */ 1,
+ AStmt, nullptr, nullptr);
+ }
+
+ // Validate that the potential loop sequence is transformable for fusion
+ // Also collect the HelperExprs, Loop Stmts, Inits, and Number of loops
+ LoopSequenceAnalysis SeqAnalysis;
+ if (!checkTransformableLoopSequence(OMPD_fuse, AStmt, SeqAnalysis, Context))
+ return StmtError();
+
+ // SeqAnalysis.LoopSeqSize exists mostly to handle dependent contexts,
+ // otherwise it must be the same as SeqAnalysis.Loops.size().
+ assert(SeqAnalysis.LoopSeqSize == SeqAnalysis.Loops.size());
+
+ // Handle clauses, which can be any of the following: [looprange, apply]
+ const OMPLoopRangeClause *LRC =
+ OMPExecutableDirective::getSingleClause<OMPLoopRangeClause>(Clauses);
+
+ // The clause arguments are invalidated if any error arises
+ // such as non-constant or non-positive arguments
+ if (LRC && (!LRC->getFirst() || !LRC->getCount()))
+ return StmtError();
+
+ // Delayed semantic check of LoopRange constraint
+ // Evaluates the loop range arguments and returns the first and count values
+ auto EvaluateLoopRangeArguments = [&Context](Expr *First, Expr *Count,
+ uint64_t &FirstVal,
+ uint64_t &CountVal) {
+ llvm::APSInt FirstInt = First->EvaluateKnownConstInt(Context);
+ llvm::APSInt CountInt = Count->EvaluateKnownConstInt(Context);
+ FirstVal = FirstInt.getZExtValue();
+ CountVal = CountInt.getZExtValue();
+ };
+
+ // OpenMP [6.0, Restrictions]
+ // first + count - 1 must not evaluate to a value greater than the
+ // loop sequence length of the associated canonical loop sequence.
+ auto ValidLoopRange = [](uint64_t FirstVal, uint64_t CountVal,
+ unsigned NumLoops) -> bool {
+ return FirstVal + CountVal - 1 <= NumLoops;
+ };
+ uint64_t FirstVal = 1, CountVal = 0, LastVal = SeqAnalysis.LoopSeqSize;
+
+ // Validates the loop range after evaluating the semantic information
+ // and ensures that the range is valid for the given loop sequence size.
+ // Expressions are evaluated at compile time to obtain constant values.
+ if (LRC) {
+ EvaluateLoopRangeArguments(LRC->getFirst(), LRC->getCount(), FirstVal,
+ CountVal);
+ if (CountVal == 1)
+ SemaRef.Diag(LRC->getCountLoc(), diag::warn_omp_redundant_fusion)
+ << getOpenMPDirectiveName(OMPD_fuse);
+
+ if (!ValidLoopRange(FirstVal, CountVal, SeqAnalysis.LoopSeqSize)) {
+ SemaRef.Diag(LRC->getFirstLoc(), diag::err_omp_invalid_looprange)
+ << getOpenMPDirectiveName(OMPD_fuse) << FirstVal
+ << (FirstVal + CountVal - 1) << SeqAnalysis.LoopSeqSize;
+ return StmtError();
+ }
+
+ LastVal = FirstVal + CountVal - 1;
+ }
+
+ // Complete fusion generates a single canonical loop nest
+ // However looprange clause may generate several loop nests
+ unsigned NumGeneratedTopLevelLoops =
+ LRC ? SeqAnalysis.LoopSeqSize - CountVal + 1 : 1;
+
+ // Emit a warning for redundant loop fusion when the sequence contains only
+ // one loop.
+ if (SeqAnalysis.LoopSeqSize == 1)
+ SemaRef.Diag(AStmt->getBeginLoc(), diag::warn_omp_redundant_fusion)
+ << getOpenMPDirectiveName(OMPD_fuse);
+
+ // Select the type with the largest bit width among all induction variables
+ QualType IVType =
+ SeqAnalysis.Loops[FirstVal - 1].HelperExprs.IterationVarRef->getType();
+ for (unsigned I = FirstVal; I < LastVal; ++I) {
+ QualType CurrentIVType =
+ SeqAnalysis.Loops[I].HelperExprs.IterationVarRef->getType();
+ if (Context.getTypeSize(CurrentIVType) > Context.getTypeSize(IVType)) {
+ IVType = CurrentIVType;
+ }
+ }
+ uint64_t IVBitWidth = Context.getIntWidth(IVType);
+
+ // Create pre-init declarations for all loops lower bounds, upper bounds,
+ // strides and num-iterations for every top level loop in the fusion
+ SmallVector<VarDecl *, 4> LBVarDecls;
+ SmallVector<VarDecl *, 4> STVarDecls;
+ SmallVector<VarDecl *, 4> NIVarDecls;
+ SmallVector<VarDecl *, 4> UBVarDecls;
+ SmallVector<VarDecl *, 4> IVVarDecls;
+
+ // Helper lambda to create variables for bounds, strides, and other
+ // expressions. Generates both the variable declaration and the corresponding
+ // initialization statement.
+ auto CreateHelperVarAndStmt =
+ [&, &SemaRef = SemaRef](Expr *ExprToCopy, const std::string &BaseName,
+ unsigned I, bool NeedsNewVD = false) {
+ Expr *TransformedExpr =
+ AssertSuccess(CopyTransformer.TransformExpr(ExprToCopy));
+ if (!TransformedExpr)
+ return std::pair<VarDecl *, StmtResult>(nullptr, StmtError());
+
+ auto Name = (Twine(".omp.") + BaseName + std::to_string(I)).str();
+
+ VarDecl *VD;
+ if (NeedsNewVD) {
+ VD = buildVarDecl(SemaRef, SourceLocation(), IVType, Name);
+ SemaRef.AddInitializerToDecl(VD, TransformedExpr, false);
+
+ } else {
+ // Create a unique variable name
+ DeclRefExpr *DRE = cast<DeclRefExpr>(TransformedExpr);
+ VD = cast<VarDecl>(DRE->getDecl());
+ VD->setDeclName(&SemaRef.PP.getIdentifierTable().get(Name));
+ }
+ // Create the corresponding declaration statement
+ StmtResult DeclStmt = new (Context) class DeclStmt(
+ DeclGroupRef(VD), SourceLocation(), SourceLocation());
+ return std::make_pair(VD, DeclStmt);
+ };
+
+ // PreInits hold a sequence of variable declarations that must be executed
+ // before the fused loop begins. These include bounds, strides, and other
+ // helper variables required for the transformation. Other loop transforms
+ // also contain their own preinits
+ SmallVector<Stmt *> PreInits;
+
+ // Update the general preinits using the preinits generated by loop sequence
+ // generating loop transformations. These preinits differ slightly from
+ // single-loop transformation preinits, as they can be detached from a
+ // specific loop inside multiple generated loop nests. This happens
+ // because certain helper variables, like '.omp.fuse.max', are introduced to
+ // handle fused iteration spaces and may not be directly tied to a single
+ // original loop. The preinit structure must ensure that hidden variables
+ // like '.omp.fuse.max' are still properly handled.
+ // Transformations that apply this concept: Loopranged Fuse, Split
+ if (!SeqAnalysis.LoopSequencePreInits.empty()) {
+ llvm::append_range(PreInits, SeqAnalysis.LoopSequencePreInits);
+ }
+
+ // Process each single loop to generate and collect declarations
+ // and statements for all helper expressions related to
+ // particular single loop nests
+
+ // Also In the case of the fused loops, we keep track of their original
+ // inits by appending them to their preinits statement, and in the case of
+ // transformations, also append their preinits (which contain the original
+ // loop initialization statement or other statements)
+
+ // Firstly we need to set TransformIndex to match the begining of the
+ // looprange section
+ unsigned int TransformIndex = 0;
+ for (unsigned I : llvm::seq<unsigned>(FirstVal - 1)) {
+ if (SeqAnalysis.Loops[I].Category == OMPLoopCategory::TransformSingleLoop)
+ ++TransformIndex;
+ }
+
+ for (unsigned int I = FirstVal - 1, J = 0; I < LastVal; ++I, ++J) {
+ if (SeqAnalysis.Loops[I].Category == OMPLoopCategory::RegularLoop) {
+ addLoopPreInits(Context, SeqAnalysis.Loops[I].HelperExprs,
+ SeqAnalysis.Loops[I].ForStmt,
+ SeqAnalysis.Loops[I].OriginalInits, PreInits);
+ } else if (SeqAnalysis.Loops[I].Category ==
+ OMPLoopCategory::TransformSingleLoop) {
+ // For transformed loops, insert both pre-inits and original inits.
+ // Order matters: pre-inits may define variables used in the original
+ // inits such as upper bounds...
+ SmallVector<Stmt *> &TransformPreInit =
+ SeqAnalysis.Loops[TransformIndex++].TransformsPreInits;
+ if (!TransformPreInit.empty())
+ llvm::append_range(PreInits, TransformPreInit);
+
+ addLoopPreInits(Context, SeqAnalysis.Loops[I].HelperExprs,
+ SeqAnalysis.Loops[I].ForStmt,
+ SeqAnalysis.Loops[I].OriginalInits, PreInits);
+ }
+ auto [UBVD, UBDStmt] =
+ CreateHelperVarAndStmt(SeqAnalysis.Loops[I].HelperExprs.UB, "ub", J);
+ auto [LBVD, LBDStmt] =
+ CreateHelperVarAndStmt(SeqAnalysis.Loops[I].HelperExprs.LB, "lb", J);
+ auto [STVD, STDStmt] =
+ CreateHelperVarAndStmt(SeqAnalysis.Loops[I].HelperExprs.ST, "st", J);
+ auto [NIVD, NIDStmt] = CreateHelperVarAndStmt(
+ SeqAnalysis.Loops[I].HelperExprs.NumIterations, "ni", J, true);
+ auto [IVVD, IVDStmt] = CreateHelperVarAndStmt(
+ SeqAnalysis.Loops[I].HelperExprs.IterationVarRef, "iv", J);
+
+ assert(LBVD && STVD && NIVD && IVVD &&
+ "OpenMP Fuse Helper variables creation failed");
+
+ UBVarDecls.push_back(UBVD);
+ LBVarDecls.push_back(LBVD);
+ STVarDecls.push_back(STVD);
+ NIVarDecls.push_back(NIVD);
+ IVVarDecls.push_back(IVVD);
+
+ PreInits.push_back(LBDStmt.get());
+ PreInits.push_back(STDStmt.get());
+ PreInits.push_back(NIDStmt.get());
+ PreInits.push_back(IVDStmt.get());
+ }
+
+ auto MakeVarDeclRef = [&SemaRef = this->SemaRef](VarDecl *VD) {
+ return buildDeclRefExpr(SemaRef, VD, VD->getType(), VD->getLocation(),
+ false);
+ };
+
+ // Following up the creation of the final fused loop will be performed
+ // which has the following shape (considering the selected loops):
+ //
+ // for (fuse.index = 0; fuse.index < max(ni0, ni1..., nik); ++fuse.index) {
+ // if (fuse.index < ni0){
+ // iv0 = lb0 + st0 * fuse.index;
+ // original.index0 = iv0
+ // body(0);
+ // }
+ // if (fuse.index < ni1){
+ // iv1 = lb1 + st1 * fuse.index;
+ // original.index1 = iv1
+ // body(1);
+ // }
+ //
+ // ...
+ //
+ // if (fuse.index < nik){
+ // ivk = lbk + stk * fuse.index;
+ // original.indexk = ivk
+ // body(k); Expr *InitVal = IntegerLiteral::Create(Context,
+ // llvm::APInt(IVWidth, 0),
+ // }
+
+ // 1. Create the initialized fuse index
+ StringRef IndexName = ".omp.fuse.index";
+ Expr *InitVal = IntegerLiteral::Create(Context, llvm::APInt(IVBitWidth, 0),
+ IVType, SourceLocation());
+ VarDecl *IndexDecl =
+ buildVarDecl(SemaRef, {}, IVType, IndexName, nullptr, nullptr);
+ SemaRef.AddInitializerToDecl(IndexDecl, InitVal, false);
+ StmtResult InitStmt = new (Context)
+ DeclStmt(DeclGroupRef(IndexDecl), SourceLocation(), SourceLocation());
+
+ if (!InitStmt.isUsable())
+ return StmtError();
+
+ auto MakeIVRef = [&SemaRef = this->SemaRef, IndexDecl, IVType,
+ Loc = InitVal->getExprLoc()]() {
+ return buildDeclRefExpr(SemaRef, IndexDecl, IVType, Loc, false);
+ };
+
+ // 2. Iteratively compute the max number of logical iterations Max(NI_1, NI_2,
+ // ..., NI_k)
+ //
+ // This loop accumulates the maximum value across multiple expressions,
+ // ensuring each step constructs a unique AST node for correctness. By using
+ // intermediate temporary variables and conditional operators, we maintain
+ // distinct nodes and avoid duplicating subtrees, For instance, max(a,b,c):
+ // omp.temp0 = max(a, b)
+ // omp.temp1 = max(omp.temp0, c)
+ // omp.fuse.max = max(omp.temp1, omp.temp0)
+
+ ExprResult MaxExpr;
+ // I is the true
+ for (unsigned I = FirstVal - 1, J = 0; I < LastVal; ++I, ++J) {
+ DeclRefExpr *NIRef = MakeVarDeclRef(NIVarDecls[J]);
+ QualType NITy = NIRef->getType();
+
+ if (MaxExpr.isUnset()) {
+ // Initialize MaxExpr with the first NI expression
+ MaxExpr = NIRef;
+ } else {
+ // Create a new acummulator variable t_i = MaxExpr
+ std::string TempName = (Twine(".omp.temp.") + Twine(J)).str();
+ VarDecl *TempDecl =
+ buildVarDecl(SemaRef, {}, NITy, TempName, nullptr, nullptr);
+ TempDecl->setInit(MaxExpr.get());
+ DeclRefExpr *TempRef =
+ buildDeclRefExpr(SemaRef, TempDecl, NITy, SourceLocation(), false);
+ DeclRefExpr *TempRef2 =
+ buildDeclRefExpr(SemaRef, TempDecl, NITy, SourceLocation(), false);
+ // Add a DeclStmt to PreInits to ensure the variable is declared.
+ StmtResult TempStmt = new (Context)
+ DeclStmt(DeclGroupRef(TempDecl), SourceLocation(), SourceLocation());
+
+ if (!TempStmt.isUsable())
+ return StmtError();
+ PreInits.push_back(TempStmt.get());
+
+ // Build MaxExpr <-(MaxExpr > NIRef ? MaxExpr : NIRef)
+ ExprResult Comparison =
+ SemaRef.BuildBinOp(nullptr, SourceLocation(), BO_GT, TempRef, NIRef);
+ // Handle any errors in Comparison creation
+ if (!Comparison.isUsable())
+ return StmtError();
+
+ DeclRefExpr *NIRef2 = MakeVarDeclRef(NIVarDecls[J]);
+ // Update MaxExpr using a conditional expression to hold the max value
+ MaxExpr = new (Context) ConditionalOperator(
+ Comparison.get(), SourceLocation(), TempRef2, SourceLocation(),
+ NIRef2->getExprStmt(), NITy, VK_LValue, OK_Ordinary);
+
+ if (!MaxExpr.isUsable())
+ return StmtError();
+ }
+ }
+ if (!MaxExpr.isUsable())
+ return StmtError();
+
+ // 3. Declare the max variable
+ const std::string MaxName = Twine(".omp.fuse.max").str();
+ VarDecl *MaxDecl =
+ buildVarDecl(SemaRef, {}, IVType, MaxName, nullptr, nullptr);
+ MaxDecl->setInit(MaxExpr.get());
+ DeclRefExpr *MaxRef = buildDeclRefExpr(SemaRef, MaxDecl, IVType, {}, false);
+ StmtResult MaxStmt = new (Context)
+ DeclStmt(DeclGroupRef(MaxDecl), SourceLocation(), SourceLocation());
+
+ if (MaxStmt.isInvalid())
+ return StmtError();
+ PreInits.push_back(MaxStmt.get());
+
+ // 4. Create condition Expr: index < n_max
+ ExprResult CondExpr = SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_LT,
+ MakeIVRef(), MaxRef);
+ if (!CondExpr.isUsable())
+ return StmtError();
+
+ // 5. Increment Expr: ++index
+ ExprResult IncrExpr =
+ SemaRef.BuildUnaryOp(CurScope, SourceLocation(), UO_PreInc, MakeIVRef());
+ if (!IncrExpr.isUsable())
+ return StmtError();
+
+ // 6. Build the Fused Loop Body
+ // The final fused loop iterates over the maximum logical range. Inside the
+ // loop, each original loop's index is calculated dynamically, and its body
+ // is executed conditionally.
+ //
+ // Each sub-loop's body is guarded by a conditional statement to ensure
+ // it executes only within its logical iteration range:
+ //
+ // if (fuse.index < ni_k){
+ // iv_k = lb_k + st_k * fuse.index;
+ // original.index = iv_k
+ // body(k);
+ // }
+
+ CompoundStmt *FusedBody = nullptr;
+ SmallVector<Stmt *, 4> FusedBodyStmts;
+ for (unsigned I = FirstVal - 1, J = 0; I < LastVal; ++I, ++J) {
+ // Assingment of the original sub-loop index to compute the logical index
+ // IV_k = LB_k + omp.fuse.index * ST_k
+ ExprResult IdxExpr =
+ SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_Mul,
+ MakeVarDeclRef(STVarDecls[J]), MakeIVRef());
+ if (!IdxExpr.isUsable())
+ return StmtError();
+ IdxExpr = SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_Add,
+ MakeVarDeclRef(LBVarDecls[J]), IdxExpr.get());
+
+ if (!IdxExpr.isUsable())
+ return StmtError();
+ IdxExpr = SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_Assign,
+ MakeVarDeclRef(IVVarDecls[J]), IdxExpr.get());
+ if (!IdxExpr.isUsable())
+ return StmtError();
+
+ // Update the original i_k = IV_k
+ SmallVector<Stmt *, 4> BodyStmts;
+ BodyStmts.push_back(IdxExpr.get());
+ llvm::append_range(BodyStmts, SeqAnalysis.Loops[I].HelperExprs.Updates);
+
+ // If the loop is a CXXForRangeStmt then the iterator variable is needed
+ if (auto *SourceCXXFor =
+ dyn_cast<CXXForRangeStmt>(SeqAnalysis.Loops[I].ForStmt))
+ BodyStmts.push_back(SourceCXXFor->getLoopVarStmt());
+
+ Stmt *Body =
+ (isa<ForStmt>(SeqAnalysis.Loops[I].ForStmt))
+ ? cast<ForStmt>(SeqAnalysis.Loops[I].ForStmt)->getBody()
+ : cast<CXXForRangeStmt>(SeqAnalysis.Loops[I].ForStmt)->getBody();
+ BodyStmts.push_back(Body);
+
+ CompoundStmt *CombinedBody =
+ CompoundStmt::Create(Context, BodyStmts, FPOptionsOverride(),
+ SourceLocation(), SourceLocation());
+ ExprResult Condition =
+ SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_LT, MakeIVRef(),
+ MakeVarDeclRef(NIVarDecls[J]));
+
+ if (!Condition.isUsable())
+ return StmtError();
+
+ IfStmt *IfStatement = IfStmt::Create(
+ Context, SourceLocation(), IfStatementKind::Ordinary, nullptr, nullptr,
+ Condition.get(), SourceLocation(), SourceLocation(), CombinedBody,
+ SourceLocation(), nullptr);
+
+ FusedBodyStmts.push_back(IfStatement);
+ }
+ FusedBody = CompoundStmt::Create(Context, FusedBodyStmts, FPOptionsOverride(),
+ SourceLocation(), SourceLocation());
+
+ // 7. Construct the final fused loop
+ ForStmt *FusedForStmt = new (Context)
+ ForStmt(Context, InitStmt.get(), CondExpr.get(), nullptr, IncrExpr.get(),
+ FusedBody, InitStmt.get()->getBeginLoc(), SourceLocation(),
+ IncrExpr.get()->getEndLoc());
+
+ // In the case of looprange, the result of fuse won't simply
+ // be a single loop (ForStmt), but rather a loop sequence
+ // (CompoundStmt) of 3 parts: the pre-fusion loops, the fused loop
+ // and the post-fusion loops, preserving its original order.
+ //
+ // Note: If looprange clause produces a single fused loop nest then
+ // this compound statement wrapper is unnecessary (Therefore this
+ // treatment is skipped)
+
+ Stmt *FusionStmt = FusedForStmt;
+ if (LRC && CountVal != SeqAnalysis.LoopSeqSize) {
+ SmallVector<Stmt *, 4> FinalLoops;
+
+ // Reset the transform index
+ TransformIndex = 0;
+
+ // Collect all non-fused loops before and after the fused region.
+ // Pre-fusion and post-fusion loops are inserted in order exploiting their
+ // symmetry, along with their corresponding transformation pre-inits if
+ // needed. The fused loop is added between the two regions.
+ for (unsigned I = 0; I < SeqAnalysis.LoopSeqSize; ++I) {
+ if (I >= FirstVal - 1 && I < FirstVal + CountVal - 1) {
+ // Update the Transformation counter to skip already treated
+ // loop transformations
+ if (SeqAnalysis.Loops[I].Category !=
+ OMPLoopCategory::TransformSingleLoop)
+ ++TransformIndex;
+ continue;
+ }
+
+ // No need to handle:
+ // Regular loops: they are kept intact as-is.
+ // Loop-sequence-generating transformations: already handled earlier.
+ // Only TransformSingleLoop requires inserting pre-inits here
+ if (SeqAnalysis.Loops[I].Category ==
+ OMPLoopCategory::TransformSingleLoop) {
+ const auto &TransformPreInit =
+ SeqAnalysis.Loops[TransformIndex++].TransformsPreInits;
+ if (!TransformPreInit.empty())
+ llvm::append_range(PreInits, TransformPreInit);
+ }
+
+ FinalLoops.push_back(SeqAnalysis.Loops[I].ForStmt);
+ }
+
+ FinalLoops.insert(FinalLoops.begin() + (FirstVal - 1), FusedForStmt);
+ FusionStmt = CompoundStmt::Create(Context, FinalLoops, FPOptionsOverride(),
+ SourceLocation(), SourceLocation());
+ }
+ return OMPFuseDirective::Create(Context, StartLoc, EndLoc, Clauses,
+ SeqAnalysis.LoopSeqSize,
+ NumGeneratedTopLevelLoops, AStmt, FusionStmt,
+ buildPreInits(Context, PreInits));
+}
+
OMPClause *SemaOpenMP::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind,
Expr *Expr,
SourceLocation StartLoc,
@@ -16704,6 +17471,31 @@ OMPClause *SemaOpenMP::ActOnOpenMPPartialClause(Expr *FactorExpr,
FactorExpr);
}
+OMPClause *SemaOpenMP::ActOnOpenMPLoopRangeClause(
+ Expr *First, Expr *Count, SourceLocation StartLoc, SourceLocation LParenLoc,
+ SourceLocation FirstLoc, SourceLocation CountLoc, SourceLocation EndLoc) {
+
+ // OpenMP [6.0, Restrictions]
+ // First and Count must be integer expressions with positive value
+ ExprResult FirstVal =
+ VerifyPositiveIntegerConstantInClause(First, OMPC_looprange);
+ if (FirstVal.isInvalid())
+ First = nullptr;
+
+ ExprResult CountVal =
+ VerifyPositiveIntegerConstantInClause(Count, OMPC_looprange);
+ if (CountVal.isInvalid())
+ Count = nullptr;
+
+ // OpenMP [6.0, Restrictions]
+ // first + count - 1 must not evaluate to a value greater than the
+ // loop sequence length of the associated canonical loop sequence.
+ // This check must be performed afterwards due to the delayed
+ // parsing and computation of the associated loop sequence
+ return OMPLoopRangeClause::Create(getASTContext(), StartLoc, LParenLoc,
+ FirstLoc, CountLoc, EndLoc, First, Count);
+}
+
OMPClause *SemaOpenMP::ActOnOpenMPAlignClause(Expr *A, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 0587a7decbd8d..df3bb1b22c079 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -1800,6 +1800,14 @@ class TreeTransform {
LParenLoc, EndLoc);
}
+ OMPClause *
+ RebuildOMPLoopRangeClause(Expr *First, Expr *Count, SourceLocation StartLoc,
+ SourceLocation LParenLoc, SourceLocation FirstLoc,
+ SourceLocation CountLoc, SourceLocation EndLoc) {
+ return getSema().OpenMP().ActOnOpenMPLoopRangeClause(
+ First, Count, StartLoc, LParenLoc, FirstLoc, CountLoc, EndLoc);
+ }
+
/// Build a new OpenMP 'allocator' clause.
///
/// By default, performs semantic analysis to build the new OpenMP clause.
@@ -9707,6 +9715,17 @@ StmtResult TreeTransform<Derived>::TransformOMPInterchangeDirective(
return Res;
}
+template <typename Derived>
+StmtResult
+TreeTransform<Derived>::TransformOMPFuseDirective(OMPFuseDirective *D) {
+ DeclarationNameInfo DirName;
+ getDerived().getSema().OpenMP().StartOpenMPDSABlock(
+ D->getDirectiveKind(), DirName, nullptr, D->getBeginLoc());
+ StmtResult Res = getDerived().TransformOMPExecutableDirective(D);
+ getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get());
+ return Res;
+}
+
template <typename Derived>
StmtResult
TreeTransform<Derived>::TransformOMPForDirective(OMPForDirective *D) {
@@ -10600,6 +10619,31 @@ TreeTransform<Derived>::TransformOMPPartialClause(OMPPartialClause *C) {
C->getEndLoc());
}
+template <typename Derived>
+OMPClause *
+TreeTransform<Derived>::TransformOMPLoopRangeClause(OMPLoopRangeClause *C) {
+ ExprResult F = getDerived().TransformExpr(C->getFirst());
+ if (F.isInvalid())
+ return nullptr;
+
+ ExprResult Cn = getDerived().TransformExpr(C->getCount());
+ if (Cn.isInvalid())
+ return nullptr;
+
+ Expr *First = F.get();
+ Expr *Count = Cn.get();
+
+ bool Changed = (First != C->getFirst()) || (Count != C->getCount());
+
+ // If no changes and AlwaysRebuild() is false, return the original clause
+ if (!Changed && !getDerived().AlwaysRebuild())
+ return C;
+
+ return RebuildOMPLoopRangeClause(First, Count, C->getBeginLoc(),
+ C->getLParenLoc(), C->getFirstLoc(),
+ C->getCountLoc(), C->getEndLoc());
+}
+
template <typename Derived>
OMPClause *
TreeTransform<Derived>::TransformOMPCollapseClause(OMPCollapseClause *C) {
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 9b0e699b1a829..79fb885dc2a4a 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -11229,6 +11229,9 @@ OMPClause *OMPClauseReader::readClause() {
case llvm::omp::OMPC_partial:
C = OMPPartialClause::CreateEmpty(Context);
break;
+ case llvm::omp::OMPC_looprange:
+ C = OMPLoopRangeClause::CreateEmpty(Context);
+ break;
case llvm::omp::OMPC_allocator:
C = new (Context) OMPAllocatorClause();
break;
@@ -11632,6 +11635,14 @@ void OMPClauseReader::VisitOMPPartialClause(OMPPartialClause *C) {
C->setLParenLoc(Record.readSourceLocation());
}
+void OMPClauseReader::VisitOMPLoopRangeClause(OMPLoopRangeClause *C) {
+ C->setFirst(Record.readSubExpr());
+ C->setCount(Record.readSubExpr());
+ C->setLParenLoc(Record.readSourceLocation());
+ C->setFirstLoc(Record.readSourceLocation());
+ C->setCountLoc(Record.readSourceLocation());
+}
+
void OMPClauseReader::VisitOMPAllocatorClause(OMPAllocatorClause *C) {
C->setAllocator(Record.readExpr());
C->setLParenLoc(Record.readSourceLocation());
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 213c2c2148f64..23e358d68d6d4 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2469,10 +2469,23 @@ void ASTStmtReader::VisitOMPReverseDirective(OMPReverseDirective *D) {
VisitOMPCanonicalLoopNestTransformationDirective(D);
}
+void ASTStmtReader::VisitOMPCanonicalLoopSequenceTransformationDirective(
+ OMPCanonicalLoopSequenceTransformationDirective *D) {
+ VisitStmt(D);
+ // Field NumLoopsInSequence was read in ReadStmtFromStream.
+ Record.skipInts(1);
+ VisitOMPExecutableDirective(D);
+ D->setNumGeneratedTopLevelLoops(Record.readUInt32());
+}
+
void ASTStmtReader::VisitOMPInterchangeDirective(OMPInterchangeDirective *D) {
VisitOMPCanonicalLoopNestTransformationDirective(D);
}
+void ASTStmtReader::VisitOMPFuseDirective(OMPFuseDirective *D) {
+ VisitOMPCanonicalLoopSequenceTransformationDirective(D);
+}
+
void ASTStmtReader::VisitOMPForDirective(OMPForDirective *D) {
VisitOMPLoopDirective(D);
D->setHasCancel(Record.readBool());
@@ -3614,6 +3627,13 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
S = OMPReverseDirective::CreateEmpty(Context, NumLoops);
break;
}
+ case STMT_OMP_FUSE_DIRECTIVE: {
+ unsigned NumLoopsInSequence = Record[ASTStmtReader::NumStmtFields];
+ unsigned NumClauses = Record[ASTStmtReader::NumStmtFields + 1];
+ S = OMPFuseDirective::CreateEmpty(Context, NumClauses,
+ NumLoopsInSequence);
+ break;
+ }
case STMT_OMP_INTERCHANGE_DIRECTIVE: {
unsigned NumLoops = Record[ASTStmtReader::NumStmtFields];
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 8e219e54bf251..6fc92cfcb1b40 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -7865,6 +7865,14 @@ void OMPClauseWriter::VisitOMPPartialClause(OMPPartialClause *C) {
Record.AddSourceLocation(C->getLParenLoc());
}
+void OMPClauseWriter::VisitOMPLoopRangeClause(OMPLoopRangeClause *C) {
+ Record.AddStmt(C->getFirst());
+ Record.AddStmt(C->getCount());
+ Record.AddSourceLocation(C->getLParenLoc());
+ Record.AddSourceLocation(C->getFirstLoc());
+ Record.AddSourceLocation(C->getCountLoc());
+}
+
void OMPClauseWriter::VisitOMPAllocatorClause(OMPAllocatorClause *C) {
Record.AddStmt(C->getAllocator());
Record.AddSourceLocation(C->getLParenLoc());
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index 21c04ddbc2c7a..99169739cfd5a 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2487,6 +2487,19 @@ void ASTStmtWriter::VisitOMPInterchangeDirective(OMPInterchangeDirective *D) {
Code = serialization::STMT_OMP_INTERCHANGE_DIRECTIVE;
}
+void ASTStmtWriter::VisitOMPCanonicalLoopSequenceTransformationDirective(
+ OMPCanonicalLoopSequenceTransformationDirective *D) {
+ VisitStmt(D);
+ Record.writeUInt32(D->getNumLoopsInSequence());
+ VisitOMPExecutableDirective(D);
+ Record.writeUInt32(D->getNumGeneratedTopLevelLoops());
+}
+
+void ASTStmtWriter::VisitOMPFuseDirective(OMPFuseDirective *D) {
+ VisitOMPCanonicalLoopSequenceTransformationDirective(D);
+ Code = serialization::STMT_OMP_FUSE_DIRECTIVE;
+}
+
void ASTStmtWriter::VisitOMPForDirective(OMPForDirective *D) {
VisitOMPLoopDirective(D);
Record.writeBool(D->hasCancel());
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 785cdfa15bf04..4e472b7fc38b0 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -1814,6 +1814,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
case Stmt::OMPStripeDirectiveClass:
case Stmt::OMPTileDirectiveClass:
case Stmt::OMPInterchangeDirectiveClass:
+ case Stmt::OMPFuseDirectiveClass:
case Stmt::OMPInteropDirectiveClass:
case Stmt::OMPDispatchDirectiveClass:
case Stmt::OMPMaskedDirectiveClass:
diff --git a/clang/test/OpenMP/fuse_ast_print.cpp b/clang/test/OpenMP/fuse_ast_print.cpp
new file mode 100644
index 0000000000000..283f5883c907d
--- /dev/null
+++ b/clang/test/OpenMP/fuse_ast_print.cpp
@@ -0,0 +1,397 @@
+// Check no warnings/errors
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+// Check AST and unparsing
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -ast-dump %s | FileCheck %s --check-prefix=DUMP
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -ast-print %s | FileCheck %s --check-prefix=PRINT
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -include-pch %t -ast-dump-all %s | FileCheck %s --check-prefix=DUMP
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -include-pch %t -ast-print %s | FileCheck %s --check-prefix=PRINT
+
+#ifndef HEADER
+#define HEADER
+
+// placeholder for loop body code
+extern "C" void body(...);
+
+// PRINT-LABEL: void foo1(
+// DUMP-LABEL: FunctionDecl {{.*}} foo1
+void foo1() {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ // PRINT: for (int k = 0; k <= 10; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k <= 10; ++k)
+ // PRINT: body(k)
+ // DUMP: CallExpr
+ body(k);
+
+ }
+
+}
+
+// PRINT-LABEL: void foo2(
+// DUMP-LABEL: FunctionDecl {{.*}} foo2
+void foo2() {
+ // PRINT: #pragma omp unroll partial(4)
+ // DUMP: OMPUnrollDirective
+ // DUMP-NEXT: OMPPartialClause
+ // DUMP-NEXT: ConstantExpr
+ // DUMP-NEXT: value: Int 4
+ // DUMP-NEXT: IntegerLiteral {{.*}} 4
+ #pragma omp unroll partial(4)
+ // PRINT: #pragma omp fuse
+ // DUMP-NEXT: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ }
+
+}
+
+//PRINT-LABEL: void foo3(
+//DUMP-LABEL: FunctionTemplateDecl {{.*}} foo3
+template<int Factor1, int Factor2>
+void foo3() {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: #pragma omp unroll partial(Factor1)
+ // DUMP: OMPUnrollDirective
+ #pragma omp unroll partial(Factor1)
+ // PRINT: for (int i = 0; i < 12; i += 1)
+ // DUMP: ForStmt
+ for (int i = 0; i < 12; i += 1)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: #pragma omp unroll partial(Factor2)
+ // DUMP: OMPUnrollDirective
+ #pragma omp unroll partial(Factor2)
+ // PRINT: for (int k = 0; k <= 10; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k <= 10; ++k)
+ // PRINT: body(k)
+ // DUMP: CallExpr
+ body(k);
+
+ }
+}
+
+// Also test instantiating the template.
+void tfoo3() {
+ foo3<4,2>();
+}
+
+//PRINT-LABEL: void foo4(
+//DUMP-LABEL: FunctionTemplateDecl {{.*}} foo4
+template<typename T, T Step>
+void foo4(int start, int end) {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (T i = start; i < end; i += Step)
+ // DUMP: ForStmt
+ for (T i = start; i < end; i += Step)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+
+ // PRINT: for (T j = end; j > start; j -= Step)
+ // DUMP: ForStmt
+ for (T j = end; j > start; j -= Step) {
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ }
+
+ }
+}
+
+// Also test instantiating the template.
+void tfoo4() {
+ foo4<int, 4>(0, 64);
+}
+
+
+
+// PRINT-LABEL: void foo5(
+// DUMP-LABEL: FunctionDecl {{.*}} foo5
+void foo5() {
+ double arr[128], arr2[128];
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT-NEXT: for (auto &&a : arr)
+ // DUMP-NEXT: CXXForRangeStmt
+ for (auto &&a: arr)
+ // PRINT: body(a)
+ // DUMP: CallExpr
+ body(a);
+ // PRINT: for (double v = 42; auto &&b : arr)
+ // DUMP: CXXForRangeStmt
+ for (double v = 42; auto &&b: arr)
+ // PRINT: body(b, v);
+ // DUMP: CallExpr
+ body(b, v);
+ // PRINT: for (auto &&c : arr2)
+ // DUMP: CXXForRangeStmt
+ for (auto &&c: arr2)
+ // PRINT: body(c)
+ // DUMP: CallExpr
+ body(c);
+
+ }
+
+}
+
+// PRINT-LABEL: void foo6(
+// DUMP-LABEL: FunctionDecl {{.*}} foo6
+void foo6() {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i <= 10; ++i)
+ // DUMP: ForStmt
+ for (int i = 0; i <= 10; ++i)
+ body(i);
+ // PRINT: for (int j = 0; j < 100; ++j)
+ // DUMP: ForStmt
+ for(int j = 0; j < 100; ++j)
+ body(j);
+ }
+ // PRINT: #pragma omp unroll partial(4)
+ // DUMP: OMPUnrollDirective
+ #pragma omp unroll partial(4)
+ // PRINT: for (int k = 0; k < 250; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k < 250; ++k)
+ body(k);
+ }
+}
+
+// PRINT-LABEL: void foo7(
+// DUMP-LABEL: FunctionDecl {{.*}} foo7
+void foo7() {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ }
+ }
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int k = 0; k <= 10; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k <= 10; ++k)
+ // PRINT: body(k)
+ // DUMP: CallExpr
+ body(k);
+ }
+ }
+ }
+ }
+
+}
+
+// PRINT-LABEL: void foo8(
+// DUMP-LABEL: FunctionDecl {{.*}} foo8
+void foo8() {
+ // PRINT: #pragma omp fuse looprange(2,2)
+ // DUMP: OMPFuseDirective
+ // DUMP: OMPLooprangeClause
+ #pragma omp fuse looprange(2,2)
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ // PRINT: for (int k = 0; k <= 10; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k <= 10; ++k)
+ // PRINT: body(k)
+ // DUMP: CallExpr
+ body(k);
+
+ }
+
+}
+
+//PRINT-LABEL: void foo9(
+//DUMP-LABEL: FunctionTemplateDecl {{.*}} foo9
+//DUMP-LABEL: NonTypeTemplateParmDecl {{.*}} F
+//DUMP-LABEL: NonTypeTemplateParmDecl {{.*}} C
+template<int F, int C>
+void foo9() {
+ // PRINT: #pragma omp fuse looprange(F,C)
+ // DUMP: OMPFuseDirective
+ // DUMP: OMPLooprangeClause
+ #pragma omp fuse looprange(F,C)
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+
+ }
+}
+
+// Also test instantiating the template.
+void tfoo9() {
+ foo9<1, 2>();
+}
+
+// PRINT-LABEL: void foo10(
+// DUMP-LABEL: FunctionDecl {{.*}} foo10
+void foo10() {
+ // PRINT: #pragma omp fuse looprange(2,2)
+ // DUMP: OMPFuseDirective
+ // DUMP: OMPLooprangeClause
+ #pragma omp fuse looprange(2,2)
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int ii = 0; ii < 10; ii += 2)
+ // DUMP: ForStmt
+ for (int ii = 0; ii < 10; ii += 2)
+ // PRINT: body(ii)
+ // DUMP: CallExpr
+ body(ii);
+ // PRINT: #pragma omp fuse looprange(2,2)
+ // DUMP: OMPFuseDirective
+ // DUMP: OMPLooprangeClause
+ #pragma omp fuse looprange(2,2)
+ {
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ // PRINT: for (int jj = 10; jj > 0; --jj)
+ // DUMP: ForStmt
+ for (int jj = 10; jj > 0; --jj)
+ // PRINT: body(jj)
+ // DUMP: CallExpr
+ body(jj);
+ // PRINT: for (int k = 0; k <= 10; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k <= 10; ++k)
+ // PRINT: body(k)
+ // DUMP: CallExpr
+ body(k);
+ // PRINT: for (int kk = 0; kk <= 10; ++kk)
+ // DUMP: ForStmt
+ for (int kk = 0; kk <= 10; ++kk)
+ // PRINT: body(kk)
+ // DUMP: CallExpr
+ body(kk);
+ }
+ }
+
+}
+
+#endif
diff --git a/clang/test/OpenMP/fuse_codegen.cpp b/clang/test/OpenMP/fuse_codegen.cpp
new file mode 100644
index 0000000000000..742c280ed0172
--- /dev/null
+++ b/clang/test/OpenMP/fuse_codegen.cpp
@@ -0,0 +1,2328 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 5
+// expected-no-diagnostics
+
+// Check code generation
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -emit-pch -o %t %s
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK2
+
+#ifndef HEADER
+#define HEADER
+
+//placeholder for loop body code.
+extern "C" void body(...) {}
+
+extern "C" void foo1(int start1, int end1, int step1, int start2, int end2, int step2) {
+ int i,j;
+ #pragma omp fuse
+ {
+ for(i = start1; i < end1; i += step1) body(i);
+ for(j = start2; j < end2; j += step2) body(j);
+ }
+
+}
+
+template <typename T>
+void foo2(T start, T end, T step){
+ T i,j,k;
+ #pragma omp fuse
+ {
+ for(i = start; i < end; i += step) body(i);
+ for(j = end; j > start; j -= step) body(j);
+ for(k = start+step; k < end+step; k += step) body(k);
+ }
+}
+
+extern "C" void tfoo2() {
+ foo2<int>(0, 64, 4);
+}
+
+extern "C" void foo3() {
+ double arr[256];
+ #pragma omp fuse
+ {
+ #pragma omp fuse
+ {
+ for(int i = 0; i < 128; ++i) body(i);
+ for(int j = 0; j < 256; j+=2) body(j);
+ }
+ for(int c = 42; auto &&v: arr) body(c,v);
+ for(int cc = 37; auto &&vv: arr) body(cc, vv);
+ }
+}
+
+extern "C" void foo4() {
+ double arr[256];
+
+ #pragma omp fuse looprange(2,2)
+ {
+ for(int i = 0; i < 128; ++i) body(i);
+ for(int j = 0; j < 256; j+=2) body(j);
+ for(int k = 0; k < 64; ++k) body(k);
+ for(int c = 42; auto &&v: arr) body(c,v);
+ }
+}
+
+// This exemplifies the usage of loop transformations that generate
+// more than top level canonical loop nests (e.g split, loopranged fuse...)
+extern "C" void foo5() {
+ double arr[256];
+ #pragma omp fuse looprange(2,2)
+ {
+ #pragma omp fuse looprange(2,2)
+ {
+ for(int i = 0; i < 128; ++i) body(i);
+ for(int j = 0; j < 256; j+=2) body(j);
+ for(int k = 0; k < 512; ++k) body(k);
+ }
+ for(int c = 42; auto &&v: arr) body(c,v);
+ for(int cc = 37; auto &&vv: arr) body(cc, vv);
+ }
+}
+
+
+#endif
+// CHECK1-LABEL: define dso_local void @body(
+// CHECK1-SAME: ...) #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @foo1(
+// CHECK1-SAME: i32 noundef [[START1:%.*]], i32 noundef [[END1:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[START2:%.*]], i32 noundef [[END2:%.*]], i32 noundef [[STEP2:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[START1_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[END1_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[STEP1_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[START2_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[END2_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[STEP2_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_7:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP8:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 [[START1]], ptr [[START1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[END1]], ptr [[END1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[STEP1]], ptr [[STEP1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[START2]], ptr [[START2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[END2]], ptr [[END2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[STEP2]], ptr [[STEP2_ADDR]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[END1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
+// CHECK1-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP9]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[END2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[STEP2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[SUB10:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[ADD12:%.*]] = add i32 [[SUB11]], [[TMP15]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[DIV13:%.*]] = udiv i32 [[ADD12]], [[TMP16]]
+// CHECK1-NEXT: [[SUB14:%.*]] = sub i32 [[DIV13]], 1
+// CHECK1-NEXT: store i32 [[SUB14]], ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK1-NEXT: [[ADD15:%.*]] = add i32 [[TMP17]], 1
+// CHECK1-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP18]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP19]], [[TMP20]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP21]], %[[COND_TRUE]] ], [ [[TMP22]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: [[CMP16:%.*]] = icmp ult i32 [[TMP23]], [[TMP24]]
+// CHECK1-NEXT: br i1 [[CMP16]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP17:%.*]] = icmp ult i32 [[TMP25]], [[TMP26]]
+// CHECK1-NEXT: br i1 [[CMP17]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP28]], [[TMP29]]
+// CHECK1-NEXT: [[ADD18:%.*]] = add i32 [[TMP27]], [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[MUL19:%.*]] = mul i32 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: [[ADD20:%.*]] = add i32 [[TMP30]], [[MUL19]]
+// CHECK1-NEXT: store i32 [[ADD20]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP33]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP21:%.*]] = icmp ult i32 [[TMP34]], [[TMP35]]
+// CHECK1-NEXT: br i1 [[CMP21]], label %[[IF_THEN22:.*]], label %[[IF_END27:.*]]
+// CHECK1: [[IF_THEN22]]:
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL23:%.*]] = mul i32 [[TMP37]], [[TMP38]]
+// CHECK1-NEXT: [[ADD24:%.*]] = add i32 [[TMP36]], [[MUL23]]
+// CHECK1-NEXT: store i32 [[ADD24]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[MUL25:%.*]] = mul i32 [[TMP40]], [[TMP41]]
+// CHECK1-NEXT: [[ADD26:%.*]] = add i32 [[TMP39]], [[MUL25]]
+// CHECK1-NEXT: store i32 [[ADD26]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP42]])
+// CHECK1-NEXT: br label %[[IF_END27]]
+// CHECK1: [[IF_END27]]:
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add i32 [[TMP43]], 1
+// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @tfoo2(
+// CHECK1-SAME: ) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: call void @_Z4foo2IiEvT_S0_S0_(i32 noundef 0, i32 noundef 64, i32 noundef 4)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define linkonce_odr void @_Z4foo2IiEvT_S0_S0_(
+// CHECK1-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] comdat {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[STEP_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_7:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP8:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_17:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_19:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP21:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_22:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
+// CHECK1-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP9]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK1-NEXT: [[SUB10:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[ADD12:%.*]] = add i32 [[SUB11]], [[TMP15]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[DIV13:%.*]] = udiv i32 [[ADD12]], [[TMP16]]
+// CHECK1-NEXT: [[SUB14:%.*]] = sub i32 [[DIV13]], 1
+// CHECK1-NEXT: store i32 [[SUB14]], ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK1-NEXT: [[ADD15:%.*]] = add i32 [[TMP17]], 1
+// CHECK1-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK1-NEXT: store i32 [[ADD16]], ptr [[K]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: [[ADD18:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: [[ADD20:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP24]], ptr [[DOTNEW_STEP21]], align 4
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK1-NEXT: [[SUB23:%.*]] = sub i32 [[TMP25]], [[TMP26]]
+// CHECK1-NEXT: [[SUB24:%.*]] = sub i32 [[SUB23]], 1
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK1-NEXT: [[ADD25:%.*]] = add i32 [[SUB24]], [[TMP27]]
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK1-NEXT: [[DIV26:%.*]] = udiv i32 [[ADD25]], [[TMP28]]
+// CHECK1-NEXT: [[SUB27:%.*]] = sub i32 [[DIV26]], 1
+// CHECK1-NEXT: store i32 [[SUB27]], ptr [[DOTCAPTURE_EXPR_22]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB2]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST2]], align 4
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_22]], align 4
+// CHECK1-NEXT: [[ADD28:%.*]] = add i32 [[TMP29]], 1
+// CHECK1-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_NI2]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP30]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP33]], %[[COND_TRUE]] ], [ [[TMP34]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK1-NEXT: [[CMP29:%.*]] = icmp ugt i32 [[TMP35]], [[TMP36]]
+// CHECK1-NEXT: br i1 [[CMP29]], label %[[COND_TRUE30:.*]], label %[[COND_FALSE31:.*]]
+// CHECK1: [[COND_TRUE30]]:
+// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK1-NEXT: br label %[[COND_END32:.*]]
+// CHECK1: [[COND_FALSE31]]:
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK1-NEXT: br label %[[COND_END32]]
+// CHECK1: [[COND_END32]]:
+// CHECK1-NEXT: [[COND33:%.*]] = phi i32 [ [[TMP37]], %[[COND_TRUE30]] ], [ [[TMP38]], %[[COND_FALSE31]] ]
+// CHECK1-NEXT: store i32 [[COND33]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: [[CMP34:%.*]] = icmp ult i32 [[TMP39]], [[TMP40]]
+// CHECK1-NEXT: br i1 [[CMP34]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP35:%.*]] = icmp ult i32 [[TMP41]], [[TMP42]]
+// CHECK1-NEXT: br i1 [[CMP35]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP45:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP44]], [[TMP45]]
+// CHECK1-NEXT: [[ADD36:%.*]] = add i32 [[TMP43]], [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD36]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[MUL37:%.*]] = mul i32 [[TMP47]], [[TMP48]]
+// CHECK1-NEXT: [[ADD38:%.*]] = add i32 [[TMP46]], [[MUL37]]
+// CHECK1-NEXT: store i32 [[ADD38]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP49:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP49]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP39:%.*]] = icmp ult i32 [[TMP50]], [[TMP51]]
+// CHECK1-NEXT: br i1 [[CMP39]], label %[[IF_THEN40:.*]], label %[[IF_END45:.*]]
+// CHECK1: [[IF_THEN40]]:
+// CHECK1-NEXT: [[TMP52:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP53:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP54:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL41:%.*]] = mul i32 [[TMP53]], [[TMP54]]
+// CHECK1-NEXT: [[ADD42:%.*]] = add i32 [[TMP52]], [[MUL41]]
+// CHECK1-NEXT: store i32 [[ADD42]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[MUL43:%.*]] = mul i32 [[TMP56]], [[TMP57]]
+// CHECK1-NEXT: [[SUB44:%.*]] = sub i32 [[TMP55]], [[MUL43]]
+// CHECK1-NEXT: store i32 [[SUB44]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP58:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP58]])
+// CHECK1-NEXT: br label %[[IF_END45]]
+// CHECK1: [[IF_END45]]:
+// CHECK1-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK1-NEXT: [[CMP46:%.*]] = icmp ult i32 [[TMP59]], [[TMP60]]
+// CHECK1-NEXT: br i1 [[CMP46]], label %[[IF_THEN47:.*]], label %[[IF_END52:.*]]
+// CHECK1: [[IF_THEN47]]:
+// CHECK1-NEXT: [[TMP61:%.*]] = load i32, ptr [[DOTOMP_LB2]], align 4
+// CHECK1-NEXT: [[TMP62:%.*]] = load i32, ptr [[DOTOMP_ST2]], align 4
+// CHECK1-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL48:%.*]] = mul i32 [[TMP62]], [[TMP63]]
+// CHECK1-NEXT: [[ADD49:%.*]] = add i32 [[TMP61]], [[MUL48]]
+// CHECK1-NEXT: store i32 [[ADD49]], ptr [[DOTOMP_IV2]], align 4
+// CHECK1-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK1-NEXT: [[TMP65:%.*]] = load i32, ptr [[DOTOMP_IV2]], align 4
+// CHECK1-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK1-NEXT: [[MUL50:%.*]] = mul i32 [[TMP65]], [[TMP66]]
+// CHECK1-NEXT: [[ADD51:%.*]] = add i32 [[TMP64]], [[MUL50]]
+// CHECK1-NEXT: store i32 [[ADD51]], ptr [[K]], align 4
+// CHECK1-NEXT: [[TMP67:%.*]] = load i32, ptr [[K]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP67]])
+// CHECK1-NEXT: br label %[[IF_END52]]
+// CHECK1: [[IF_END52]]:
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP68:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add i32 [[TMP68]], 1
+// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @foo3(
+// CHECK1-SAME: ) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB03:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST04:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI05:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV06:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_LB116:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_ST117:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_NI118:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV120:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CC:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE221:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END222:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN225:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_27:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_29:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_30:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_LB2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_ST2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_NI2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_TEMP_140:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_TEMP_2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX46:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX52:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[VV:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: store i32 128, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB03]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST04]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK1-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64
+// CHECK1-NEXT: store i64 [[CONV]], ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP8]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP9]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY7]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY9:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP10]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY9]], ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT: store ptr [[TMP11]], ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK1-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP13]] to i64
+// CHECK1-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK1-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK1-NEXT: [[SUB12:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i64 [[SUB12]], 1
+// CHECK1-NEXT: [[DIV14:%.*]] = sdiv i64 [[ADD13]], 1
+// CHECK1-NEXT: [[SUB15:%.*]] = sub nsw i64 [[DIV14]], 1
+// CHECK1-NEXT: store i64 [[SUB15]], ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_LB116]], align 8
+// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_ST117]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK1-NEXT: [[ADD19:%.*]] = add nsw i64 [[TMP14]], 1
+// CHECK1-NEXT: store i64 [[ADD19]], ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: store i32 37, ptr [[CC]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE221]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY23:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP15]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR24:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY23]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR24]], ptr [[__END222]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY26:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP16]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY26]], ptr [[__BEGIN225]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY28:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP17]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY28]], ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__END222]], align 8
+// CHECK1-NEXT: store ptr [[TMP18]], ptr [[DOTCAPTURE_EXPR_29]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_29]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK1-NEXT: [[SUB_PTR_LHS_CAST31:%.*]] = ptrtoint ptr [[TMP19]] to i64
+// CHECK1-NEXT: [[SUB_PTR_RHS_CAST32:%.*]] = ptrtoint ptr [[TMP20]] to i64
+// CHECK1-NEXT: [[SUB_PTR_SUB33:%.*]] = sub i64 [[SUB_PTR_LHS_CAST31]], [[SUB_PTR_RHS_CAST32]]
+// CHECK1-NEXT: [[SUB_PTR_DIV34:%.*]] = sdiv exact i64 [[SUB_PTR_SUB33]], 8
+// CHECK1-NEXT: [[SUB35:%.*]] = sub nsw i64 [[SUB_PTR_DIV34]], 1
+// CHECK1-NEXT: [[ADD36:%.*]] = add nsw i64 [[SUB35]], 1
+// CHECK1-NEXT: [[DIV37:%.*]] = sdiv i64 [[ADD36]], 1
+// CHECK1-NEXT: [[SUB38:%.*]] = sub nsw i64 [[DIV37]], 1
+// CHECK1-NEXT: store i64 [[SUB38]], ptr [[DOTCAPTURE_EXPR_30]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_LB2]], align 8
+// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_ST2]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_30]], align 8
+// CHECK1-NEXT: [[ADD39:%.*]] = add nsw i64 [[TMP21]], 1
+// CHECK1-NEXT: store i64 [[ADD39]], ptr [[DOTOMP_NI2]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: store i64 [[TMP22]], ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK1-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK1-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[CMP41:%.*]] = icmp sgt i64 [[TMP23]], [[TMP24]]
+// CHECK1-NEXT: br i1 [[CMP41]], label %[[COND_TRUE42:.*]], label %[[COND_FALSE43:.*]]
+// CHECK1: [[COND_TRUE42]]:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK1-NEXT: br label %[[COND_END44:.*]]
+// CHECK1: [[COND_FALSE43]]:
+// CHECK1-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: br label %[[COND_END44]]
+// CHECK1: [[COND_END44]]:
+// CHECK1-NEXT: [[COND45:%.*]] = phi i64 [ [[TMP25]], %[[COND_TRUE42]] ], [ [[TMP26]], %[[COND_FALSE43]] ]
+// CHECK1-NEXT: store i64 [[COND45]], ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK1-NEXT: [[TMP27:%.*]] = load i64, ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK1-NEXT: [[CMP47:%.*]] = icmp sgt i64 [[TMP27]], [[TMP28]]
+// CHECK1-NEXT: br i1 [[CMP47]], label %[[COND_TRUE48:.*]], label %[[COND_FALSE49:.*]]
+// CHECK1: [[COND_TRUE48]]:
+// CHECK1-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK1-NEXT: br label %[[COND_END50:.*]]
+// CHECK1: [[COND_FALSE49]]:
+// CHECK1-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK1-NEXT: br label %[[COND_END50]]
+// CHECK1: [[COND_END50]]:
+// CHECK1-NEXT: [[COND51:%.*]] = phi i64 [ [[TMP29]], %[[COND_TRUE48]] ], [ [[TMP30]], %[[COND_FALSE49]] ]
+// CHECK1-NEXT: store i64 [[COND51]], ptr [[DOTOMP_FUSE_MAX46]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP31:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[TMP32:%.*]] = load i64, ptr [[DOTOMP_FUSE_MAX46]], align 8
+// CHECK1-NEXT: [[CMP53:%.*]] = icmp slt i64 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: br i1 [[CMP53]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP33:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[TMP34:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: [[CMP54:%.*]] = icmp slt i64 [[TMP33]], [[TMP34]]
+// CHECK1-NEXT: br i1 [[CMP54]], label %[[IF_THEN:.*]], label %[[IF_END74:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_LB03]], align 4
+// CHECK1-NEXT: [[CONV55:%.*]] = sext i32 [[TMP35]] to i64
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_ST04]], align 4
+// CHECK1-NEXT: [[CONV56:%.*]] = sext i32 [[TMP36]] to i64
+// CHECK1-NEXT: [[TMP37:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV56]], [[TMP37]]
+// CHECK1-NEXT: [[ADD57:%.*]] = add nsw i64 [[CONV55]], [[MUL]]
+// CHECK1-NEXT: [[CONV58:%.*]] = trunc i64 [[ADD57]] to i32
+// CHECK1-NEXT: store i32 [[CONV58]], ptr [[DOTOMP_IV06]], align 4
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_IV06]], align 4
+// CHECK1-NEXT: [[MUL59:%.*]] = mul nsw i32 [[TMP38]], 1
+// CHECK1-NEXT: [[ADD60:%.*]] = add nsw i32 0, [[MUL59]]
+// CHECK1-NEXT: store i32 [[ADD60]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP61:%.*]] = icmp slt i32 [[TMP39]], [[TMP40]]
+// CHECK1-NEXT: br i1 [[CMP61]], label %[[IF_THEN62:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN62]]:
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL63:%.*]] = mul nsw i32 [[TMP42]], [[TMP43]]
+// CHECK1-NEXT: [[ADD64:%.*]] = add nsw i32 [[TMP41]], [[MUL63]]
+// CHECK1-NEXT: store i32 [[ADD64]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[MUL65:%.*]] = mul nsw i32 [[TMP44]], 1
+// CHECK1-NEXT: [[ADD66:%.*]] = add nsw i32 0, [[MUL65]]
+// CHECK1-NEXT: store i32 [[ADD66]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP45:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP45]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP67:%.*]] = icmp slt i32 [[TMP46]], [[TMP47]]
+// CHECK1-NEXT: br i1 [[CMP67]], label %[[IF_THEN68:.*]], label %[[IF_END73:.*]]
+// CHECK1: [[IF_THEN68]]:
+// CHECK1-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP49:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL69:%.*]] = mul nsw i32 [[TMP49]], [[TMP50]]
+// CHECK1-NEXT: [[ADD70:%.*]] = add nsw i32 [[TMP48]], [[MUL69]]
+// CHECK1-NEXT: store i32 [[ADD70]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[MUL71:%.*]] = mul nsw i32 [[TMP51]], 2
+// CHECK1-NEXT: [[ADD72:%.*]] = add nsw i32 0, [[MUL71]]
+// CHECK1-NEXT: store i32 [[ADD72]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP52:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP52]])
+// CHECK1-NEXT: br label %[[IF_END73]]
+// CHECK1: [[IF_END73]]:
+// CHECK1-NEXT: br label %[[IF_END74]]
+// CHECK1: [[IF_END74]]:
+// CHECK1-NEXT: [[TMP53:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[TMP54:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[CMP75:%.*]] = icmp slt i64 [[TMP53]], [[TMP54]]
+// CHECK1-NEXT: br i1 [[CMP75]], label %[[IF_THEN76:.*]], label %[[IF_END81:.*]]
+// CHECK1: [[IF_THEN76]]:
+// CHECK1-NEXT: [[TMP55:%.*]] = load i64, ptr [[DOTOMP_LB116]], align 8
+// CHECK1-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTOMP_ST117]], align 8
+// CHECK1-NEXT: [[TMP57:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[MUL77:%.*]] = mul nsw i64 [[TMP56]], [[TMP57]]
+// CHECK1-NEXT: [[ADD78:%.*]] = add nsw i64 [[TMP55]], [[MUL77]]
+// CHECK1-NEXT: store i64 [[ADD78]], ptr [[DOTOMP_IV120]], align 8
+// CHECK1-NEXT: [[TMP58:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[TMP59:%.*]] = load i64, ptr [[DOTOMP_IV120]], align 8
+// CHECK1-NEXT: [[MUL79:%.*]] = mul nsw i64 [[TMP59]], 1
+// CHECK1-NEXT: [[ADD_PTR80:%.*]] = getelementptr inbounds double, ptr [[TMP58]], i64 [[MUL79]]
+// CHECK1-NEXT: store ptr [[ADD_PTR80]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: store ptr [[TMP60]], ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP61:%.*]] = load i32, ptr [[C]], align 4
+// CHECK1-NEXT: [[TMP62:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP63:%.*]] = load double, ptr [[TMP62]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP61]], double noundef [[TMP63]])
+// CHECK1-NEXT: br label %[[IF_END81]]
+// CHECK1: [[IF_END81]]:
+// CHECK1-NEXT: [[TMP64:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[TMP65:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK1-NEXT: [[CMP82:%.*]] = icmp slt i64 [[TMP64]], [[TMP65]]
+// CHECK1-NEXT: br i1 [[CMP82]], label %[[IF_THEN83:.*]], label %[[IF_END88:.*]]
+// CHECK1: [[IF_THEN83]]:
+// CHECK1-NEXT: [[TMP66:%.*]] = load i64, ptr [[DOTOMP_LB2]], align 8
+// CHECK1-NEXT: [[TMP67:%.*]] = load i64, ptr [[DOTOMP_ST2]], align 8
+// CHECK1-NEXT: [[TMP68:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[MUL84:%.*]] = mul nsw i64 [[TMP67]], [[TMP68]]
+// CHECK1-NEXT: [[ADD85:%.*]] = add nsw i64 [[TMP66]], [[MUL84]]
+// CHECK1-NEXT: store i64 [[ADD85]], ptr [[DOTOMP_IV2]], align 8
+// CHECK1-NEXT: [[TMP69:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK1-NEXT: [[TMP70:%.*]] = load i64, ptr [[DOTOMP_IV2]], align 8
+// CHECK1-NEXT: [[MUL86:%.*]] = mul nsw i64 [[TMP70]], 1
+// CHECK1-NEXT: [[ADD_PTR87:%.*]] = getelementptr inbounds double, ptr [[TMP69]], i64 [[MUL86]]
+// CHECK1-NEXT: store ptr [[ADD_PTR87]], ptr [[__BEGIN225]], align 8
+// CHECK1-NEXT: [[TMP71:%.*]] = load ptr, ptr [[__BEGIN225]], align 8
+// CHECK1-NEXT: store ptr [[TMP71]], ptr [[VV]], align 8
+// CHECK1-NEXT: [[TMP72:%.*]] = load i32, ptr [[CC]], align 4
+// CHECK1-NEXT: [[TMP73:%.*]] = load ptr, ptr [[VV]], align 8
+// CHECK1-NEXT: [[TMP74:%.*]] = load double, ptr [[TMP73]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP72]], double noundef [[TMP74]])
+// CHECK1-NEXT: br label %[[IF_END88]]
+// CHECK1: [[IF_END88]]:
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP75:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i64 [[TMP75]], 1
+// CHECK1-NEXT: store i64 [[INC]], ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @foo4(
+// CHECK1-SAME: ) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[K]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: store i32 64, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 128
+// CHECK1-NEXT: br i1 [[CMP1]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP6]])
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK1-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND2:.*]]
+// CHECK1: [[FOR_COND2]]:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: [[CMP3:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: br i1 [[CMP3]], label %[[FOR_BODY4:.*]], label %[[FOR_END17:.*]]
+// CHECK1: [[FOR_BODY4]]:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP10]], [[TMP11]]
+// CHECK1-NEXT: br i1 [[CMP5]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP15]], 2
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP16]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP8:%.*]] = icmp slt i32 [[TMP17]], [[TMP18]]
+// CHECK1-NEXT: br i1 [[CMP8]], label %[[IF_THEN9:.*]], label %[[IF_END14:.*]]
+// CHECK1: [[IF_THEN9]]:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL10:%.*]] = mul nsw i32 [[TMP20]], [[TMP21]]
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP19]], [[MUL10]]
+// CHECK1-NEXT: store i32 [[ADD11]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[MUL12:%.*]] = mul nsw i32 [[TMP22]], 1
+// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]]
+// CHECK1-NEXT: store i32 [[ADD13]], ptr [[K]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[K]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP23]])
+// CHECK1-NEXT: br label %[[IF_END14]]
+// CHECK1: [[IF_END14]]:
+// CHECK1-NEXT: br label %[[FOR_INC15:.*]]
+// CHECK1: [[FOR_INC15]]:
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[INC16:%.*]] = add nsw i32 [[TMP24]], 1
+// CHECK1-NEXT: store i32 [[INC16]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND2]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK1: [[FOR_END17]]:
+// CHECK1-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[TMP25:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP25]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY18:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP26]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY18]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND19:.*]]
+// CHECK1: [[FOR_COND19]]:
+// CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT: [[CMP20:%.*]] = icmp ne ptr [[TMP27]], [[TMP28]]
+// CHECK1-NEXT: br i1 [[CMP20]], label %[[FOR_BODY21:.*]], label %[[FOR_END23:.*]]
+// CHECK1: [[FOR_BODY21]]:
+// CHECK1-NEXT: [[TMP29:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: store ptr [[TMP29]], ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[C]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP32:%.*]] = load double, ptr [[TMP31]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP30]], double noundef [[TMP32]])
+// CHECK1-NEXT: br label %[[FOR_INC22:.*]]
+// CHECK1: [[FOR_INC22]]:
+// CHECK1-NEXT: [[TMP33:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP33]], i32 1
+// CHECK1-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND19]]
+// CHECK1: [[FOR_END23]]:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @foo5(
+// CHECK1-SAME: ) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB03:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST04:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI05:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV06:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_LB116:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_ST117:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_NI118:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV120:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_TEMP_121:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX22:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX29:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[CC:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE264:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN265:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END267:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[VV:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[K]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: store i32 512, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB03]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST04]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK1-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64
+// CHECK1-NEXT: store i64 [[CONV]], ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP8]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP9]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY7]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY9:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP10]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY9]], ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT: store ptr [[TMP11]], ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK1-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP13]] to i64
+// CHECK1-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK1-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK1-NEXT: [[SUB12:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i64 [[SUB12]], 1
+// CHECK1-NEXT: [[DIV14:%.*]] = sdiv i64 [[ADD13]], 1
+// CHECK1-NEXT: [[SUB15:%.*]] = sub nsw i64 [[DIV14]], 1
+// CHECK1-NEXT: store i64 [[SUB15]], ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_LB116]], align 8
+// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_ST117]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK1-NEXT: [[ADD19:%.*]] = add nsw i64 [[TMP14]], 1
+// CHECK1-NEXT: store i64 [[ADD19]], ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT: br i1 [[CMP23]], label %[[COND_TRUE24:.*]], label %[[COND_FALSE25:.*]]
+// CHECK1: [[COND_TRUE24]]:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK1-NEXT: br label %[[COND_END26:.*]]
+// CHECK1: [[COND_FALSE25]]:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: br label %[[COND_END26]]
+// CHECK1: [[COND_END26]]:
+// CHECK1-NEXT: [[COND27:%.*]] = phi i64 [ [[TMP18]], %[[COND_TRUE24]] ], [ [[TMP19]], %[[COND_FALSE25]] ]
+// CHECK1-NEXT: store i64 [[COND27]], ptr [[DOTOMP_FUSE_MAX22]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[CMP28:%.*]] = icmp slt i32 [[TMP20]], 128
+// CHECK1-NEXT: br i1 [[CMP28]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP21]])
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP22]], 1
+// CHECK1-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND30:.*]]
+// CHECK1: [[FOR_COND30]]:
+// CHECK1-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_FUSE_MAX22]], align 8
+// CHECK1-NEXT: [[CMP31:%.*]] = icmp slt i64 [[TMP23]], [[TMP24]]
+// CHECK1-NEXT: br i1 [[CMP31]], label %[[FOR_BODY32:.*]], label %[[FOR_END63:.*]]
+// CHECK1: [[FOR_BODY32]]:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: [[CMP33:%.*]] = icmp slt i64 [[TMP25]], [[TMP26]]
+// CHECK1-NEXT: br i1 [[CMP33]], label %[[IF_THEN:.*]], label %[[IF_END53:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB03]], align 4
+// CHECK1-NEXT: [[CONV34:%.*]] = sext i32 [[TMP27]] to i64
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_ST04]], align 4
+// CHECK1-NEXT: [[CONV35:%.*]] = sext i32 [[TMP28]] to i64
+// CHECK1-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV35]], [[TMP29]]
+// CHECK1-NEXT: [[ADD36:%.*]] = add nsw i64 [[CONV34]], [[MUL]]
+// CHECK1-NEXT: [[CONV37:%.*]] = trunc i64 [[ADD36]] to i32
+// CHECK1-NEXT: store i32 [[CONV37]], ptr [[DOTOMP_IV06]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV06]], align 4
+// CHECK1-NEXT: [[MUL38:%.*]] = mul nsw i32 [[TMP30]], 1
+// CHECK1-NEXT: [[ADD39:%.*]] = add nsw i32 0, [[MUL38]]
+// CHECK1-NEXT: store i32 [[ADD39]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP40:%.*]] = icmp slt i32 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: br i1 [[CMP40]], label %[[IF_THEN41:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN41]]:
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL42:%.*]] = mul nsw i32 [[TMP34]], [[TMP35]]
+// CHECK1-NEXT: [[ADD43:%.*]] = add nsw i32 [[TMP33]], [[MUL42]]
+// CHECK1-NEXT: store i32 [[ADD43]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[MUL44:%.*]] = mul nsw i32 [[TMP36]], 2
+// CHECK1-NEXT: [[ADD45:%.*]] = add nsw i32 0, [[MUL44]]
+// CHECK1-NEXT: store i32 [[ADD45]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP37]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP46:%.*]] = icmp slt i32 [[TMP38]], [[TMP39]]
+// CHECK1-NEXT: br i1 [[CMP46]], label %[[IF_THEN47:.*]], label %[[IF_END52:.*]]
+// CHECK1: [[IF_THEN47]]:
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL48:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]]
+// CHECK1-NEXT: [[ADD49:%.*]] = add nsw i32 [[TMP40]], [[MUL48]]
+// CHECK1-NEXT: store i32 [[ADD49]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[MUL50:%.*]] = mul nsw i32 [[TMP43]], 1
+// CHECK1-NEXT: [[ADD51:%.*]] = add nsw i32 0, [[MUL50]]
+// CHECK1-NEXT: store i32 [[ADD51]], ptr [[K]], align 4
+// CHECK1-NEXT: [[TMP44:%.*]] = load i32, ptr [[K]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP44]])
+// CHECK1-NEXT: br label %[[IF_END52]]
+// CHECK1: [[IF_END52]]:
+// CHECK1-NEXT: br label %[[IF_END53]]
+// CHECK1: [[IF_END53]]:
+// CHECK1-NEXT: [[TMP45:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[TMP46:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[CMP54:%.*]] = icmp slt i64 [[TMP45]], [[TMP46]]
+// CHECK1-NEXT: br i1 [[CMP54]], label %[[IF_THEN55:.*]], label %[[IF_END60:.*]]
+// CHECK1: [[IF_THEN55]]:
+// CHECK1-NEXT: [[TMP47:%.*]] = load i64, ptr [[DOTOMP_LB116]], align 8
+// CHECK1-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTOMP_ST117]], align 8
+// CHECK1-NEXT: [[TMP49:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[MUL56:%.*]] = mul nsw i64 [[TMP48]], [[TMP49]]
+// CHECK1-NEXT: [[ADD57:%.*]] = add nsw i64 [[TMP47]], [[MUL56]]
+// CHECK1-NEXT: store i64 [[ADD57]], ptr [[DOTOMP_IV120]], align 8
+// CHECK1-NEXT: [[TMP50:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[TMP51:%.*]] = load i64, ptr [[DOTOMP_IV120]], align 8
+// CHECK1-NEXT: [[MUL58:%.*]] = mul nsw i64 [[TMP51]], 1
+// CHECK1-NEXT: [[ADD_PTR59:%.*]] = getelementptr inbounds double, ptr [[TMP50]], i64 [[MUL58]]
+// CHECK1-NEXT: store ptr [[ADD_PTR59]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP52:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: store ptr [[TMP52]], ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP53:%.*]] = load i32, ptr [[C]], align 4
+// CHECK1-NEXT: [[TMP54:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP55:%.*]] = load double, ptr [[TMP54]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP53]], double noundef [[TMP55]])
+// CHECK1-NEXT: br label %[[IF_END60]]
+// CHECK1: [[IF_END60]]:
+// CHECK1-NEXT: br label %[[FOR_INC61:.*]]
+// CHECK1: [[FOR_INC61]]:
+// CHECK1-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[INC62:%.*]] = add nsw i64 [[TMP56]], 1
+// CHECK1-NEXT: store i64 [[INC62]], ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND30]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK1: [[FOR_END63]]:
+// CHECK1-NEXT: store i32 37, ptr [[CC]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE264]], align 8
+// CHECK1-NEXT: [[TMP57:%.*]] = load ptr, ptr [[__RANGE264]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY66:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP57]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY66]], ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: [[TMP58:%.*]] = load ptr, ptr [[__RANGE264]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY68:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP58]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR69:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY68]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR69]], ptr [[__END267]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND70:.*]]
+// CHECK1: [[FOR_COND70]]:
+// CHECK1-NEXT: [[TMP59:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__END267]], align 8
+// CHECK1-NEXT: [[CMP71:%.*]] = icmp ne ptr [[TMP59]], [[TMP60]]
+// CHECK1-NEXT: br i1 [[CMP71]], label %[[FOR_BODY72:.*]], label %[[FOR_END74:.*]]
+// CHECK1: [[FOR_BODY72]]:
+// CHECK1-NEXT: [[TMP61:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: store ptr [[TMP61]], ptr [[VV]], align 8
+// CHECK1-NEXT: [[TMP62:%.*]] = load i32, ptr [[CC]], align 4
+// CHECK1-NEXT: [[TMP63:%.*]] = load ptr, ptr [[VV]], align 8
+// CHECK1-NEXT: [[TMP64:%.*]] = load double, ptr [[TMP63]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP62]], double noundef [[TMP64]])
+// CHECK1-NEXT: br label %[[FOR_INC73:.*]]
+// CHECK1: [[FOR_INC73]]:
+// CHECK1-NEXT: [[TMP65:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP65]], i32 1
+// CHECK1-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND70]]
+// CHECK1: [[FOR_END74]]:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @body(
+// CHECK2-SAME: ...) #[[ATTR0:[0-9]+]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo1(
+// CHECK2-SAME: i32 noundef [[START1:%.*]], i32 noundef [[END1:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[START2:%.*]], i32 noundef [[END2:%.*]], i32 noundef [[STEP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[START1_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[END1_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[STEP1_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[START2_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[END2_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[STEP2_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_7:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP8:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 [[START1]], ptr [[START1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[END1]], ptr [[END1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[STEP1]], ptr [[STEP1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[START2]], ptr [[START2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[END2]], ptr [[END2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[STEP2]], ptr [[STEP2_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[END1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK2-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK2-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
+// CHECK2-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP9]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[END2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[STEP2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP12]], ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[SUB10:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+// CHECK2-NEXT: [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[ADD12:%.*]] = add i32 [[SUB11]], [[TMP15]]
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[DIV13:%.*]] = udiv i32 [[ADD12]], [[TMP16]]
+// CHECK2-NEXT: [[SUB14:%.*]] = sub i32 [[DIV13]], 1
+// CHECK2-NEXT: store i32 [[SUB14]], ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK2-NEXT: [[ADD15:%.*]] = add i32 [[TMP17]], 1
+// CHECK2-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP18]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP19]], [[TMP20]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP21]], %[[COND_TRUE]] ], [ [[TMP22]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: [[CMP16:%.*]] = icmp ult i32 [[TMP23]], [[TMP24]]
+// CHECK2-NEXT: br i1 [[CMP16]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP17:%.*]] = icmp ult i32 [[TMP25]], [[TMP26]]
+// CHECK2-NEXT: br i1 [[CMP17]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul i32 [[TMP28]], [[TMP29]]
+// CHECK2-NEXT: [[ADD18:%.*]] = add i32 [[TMP27]], [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[MUL19:%.*]] = mul i32 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: [[ADD20:%.*]] = add i32 [[TMP30]], [[MUL19]]
+// CHECK2-NEXT: store i32 [[ADD20]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP33]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP21:%.*]] = icmp ult i32 [[TMP34]], [[TMP35]]
+// CHECK2-NEXT: br i1 [[CMP21]], label %[[IF_THEN22:.*]], label %[[IF_END27:.*]]
+// CHECK2: [[IF_THEN22]]:
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL23:%.*]] = mul i32 [[TMP37]], [[TMP38]]
+// CHECK2-NEXT: [[ADD24:%.*]] = add i32 [[TMP36]], [[MUL23]]
+// CHECK2-NEXT: store i32 [[ADD24]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[MUL25:%.*]] = mul i32 [[TMP40]], [[TMP41]]
+// CHECK2-NEXT: [[ADD26:%.*]] = add i32 [[TMP39]], [[MUL25]]
+// CHECK2-NEXT: store i32 [[ADD26]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP42]])
+// CHECK2-NEXT: br label %[[IF_END27]]
+// CHECK2: [[IF_END27]]:
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add i32 [[TMP43]], 1
+// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo3(
+// CHECK2-SAME: ) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB03:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST04:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI05:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV06:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_LB116:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_ST117:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_NI118:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV120:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[CC:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE221:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END222:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN225:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_27:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_29:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_30:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_LB2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_ST2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_NI2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_TEMP_140:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_TEMP_2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX46:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX52:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[VV:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB03]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST04]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK2-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64
+// CHECK2-NEXT: store i64 [[CONV]], ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP8]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP9]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY7]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY9:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP10]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY9]], ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT: store ptr [[TMP11]], ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK2-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP13]] to i64
+// CHECK2-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK2-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK2-NEXT: [[SUB12:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i64 [[SUB12]], 1
+// CHECK2-NEXT: [[DIV14:%.*]] = sdiv i64 [[ADD13]], 1
+// CHECK2-NEXT: [[SUB15:%.*]] = sub nsw i64 [[DIV14]], 1
+// CHECK2-NEXT: store i64 [[SUB15]], ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_LB116]], align 8
+// CHECK2-NEXT: store i64 1, ptr [[DOTOMP_ST117]], align 8
+// CHECK2-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK2-NEXT: [[ADD19:%.*]] = add nsw i64 [[TMP14]], 1
+// CHECK2-NEXT: store i64 [[ADD19]], ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: store i32 37, ptr [[CC]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE221]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY23:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP15]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR24:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY23]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR24]], ptr [[__END222]], align 8
+// CHECK2-NEXT: [[TMP16:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY26:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP16]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY26]], ptr [[__BEGIN225]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY28:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP17]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY28]], ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK2-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__END222]], align 8
+// CHECK2-NEXT: store ptr [[TMP18]], ptr [[DOTCAPTURE_EXPR_29]], align 8
+// CHECK2-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_29]], align 8
+// CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK2-NEXT: [[SUB_PTR_LHS_CAST31:%.*]] = ptrtoint ptr [[TMP19]] to i64
+// CHECK2-NEXT: [[SUB_PTR_RHS_CAST32:%.*]] = ptrtoint ptr [[TMP20]] to i64
+// CHECK2-NEXT: [[SUB_PTR_SUB33:%.*]] = sub i64 [[SUB_PTR_LHS_CAST31]], [[SUB_PTR_RHS_CAST32]]
+// CHECK2-NEXT: [[SUB_PTR_DIV34:%.*]] = sdiv exact i64 [[SUB_PTR_SUB33]], 8
+// CHECK2-NEXT: [[SUB35:%.*]] = sub nsw i64 [[SUB_PTR_DIV34]], 1
+// CHECK2-NEXT: [[ADD36:%.*]] = add nsw i64 [[SUB35]], 1
+// CHECK2-NEXT: [[DIV37:%.*]] = sdiv i64 [[ADD36]], 1
+// CHECK2-NEXT: [[SUB38:%.*]] = sub nsw i64 [[DIV37]], 1
+// CHECK2-NEXT: store i64 [[SUB38]], ptr [[DOTCAPTURE_EXPR_30]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_LB2]], align 8
+// CHECK2-NEXT: store i64 1, ptr [[DOTOMP_ST2]], align 8
+// CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_30]], align 8
+// CHECK2-NEXT: [[ADD39:%.*]] = add nsw i64 [[TMP21]], 1
+// CHECK2-NEXT: store i64 [[ADD39]], ptr [[DOTOMP_NI2]], align 8
+// CHECK2-NEXT: [[TMP22:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: store i64 [[TMP22]], ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK2-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK2-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[CMP41:%.*]] = icmp sgt i64 [[TMP23]], [[TMP24]]
+// CHECK2-NEXT: br i1 [[CMP41]], label %[[COND_TRUE42:.*]], label %[[COND_FALSE43:.*]]
+// CHECK2: [[COND_TRUE42]]:
+// CHECK2-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK2-NEXT: br label %[[COND_END44:.*]]
+// CHECK2: [[COND_FALSE43]]:
+// CHECK2-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: br label %[[COND_END44]]
+// CHECK2: [[COND_END44]]:
+// CHECK2-NEXT: [[COND45:%.*]] = phi i64 [ [[TMP25]], %[[COND_TRUE42]] ], [ [[TMP26]], %[[COND_FALSE43]] ]
+// CHECK2-NEXT: store i64 [[COND45]], ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK2-NEXT: [[TMP27:%.*]] = load i64, ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK2-NEXT: [[TMP28:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK2-NEXT: [[CMP47:%.*]] = icmp sgt i64 [[TMP27]], [[TMP28]]
+// CHECK2-NEXT: br i1 [[CMP47]], label %[[COND_TRUE48:.*]], label %[[COND_FALSE49:.*]]
+// CHECK2: [[COND_TRUE48]]:
+// CHECK2-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK2-NEXT: br label %[[COND_END50:.*]]
+// CHECK2: [[COND_FALSE49]]:
+// CHECK2-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK2-NEXT: br label %[[COND_END50]]
+// CHECK2: [[COND_END50]]:
+// CHECK2-NEXT: [[COND51:%.*]] = phi i64 [ [[TMP29]], %[[COND_TRUE48]] ], [ [[TMP30]], %[[COND_FALSE49]] ]
+// CHECK2-NEXT: store i64 [[COND51]], ptr [[DOTOMP_FUSE_MAX46]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP31:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[TMP32:%.*]] = load i64, ptr [[DOTOMP_FUSE_MAX46]], align 8
+// CHECK2-NEXT: [[CMP53:%.*]] = icmp slt i64 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: br i1 [[CMP53]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP33:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[TMP34:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: [[CMP54:%.*]] = icmp slt i64 [[TMP33]], [[TMP34]]
+// CHECK2-NEXT: br i1 [[CMP54]], label %[[IF_THEN:.*]], label %[[IF_END74:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_LB03]], align 4
+// CHECK2-NEXT: [[CONV55:%.*]] = sext i32 [[TMP35]] to i64
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_ST04]], align 4
+// CHECK2-NEXT: [[CONV56:%.*]] = sext i32 [[TMP36]] to i64
+// CHECK2-NEXT: [[TMP37:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV56]], [[TMP37]]
+// CHECK2-NEXT: [[ADD57:%.*]] = add nsw i64 [[CONV55]], [[MUL]]
+// CHECK2-NEXT: [[CONV58:%.*]] = trunc i64 [[ADD57]] to i32
+// CHECK2-NEXT: store i32 [[CONV58]], ptr [[DOTOMP_IV06]], align 4
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_IV06]], align 4
+// CHECK2-NEXT: [[MUL59:%.*]] = mul nsw i32 [[TMP38]], 1
+// CHECK2-NEXT: [[ADD60:%.*]] = add nsw i32 0, [[MUL59]]
+// CHECK2-NEXT: store i32 [[ADD60]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP61:%.*]] = icmp slt i32 [[TMP39]], [[TMP40]]
+// CHECK2-NEXT: br i1 [[CMP61]], label %[[IF_THEN62:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN62]]:
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL63:%.*]] = mul nsw i32 [[TMP42]], [[TMP43]]
+// CHECK2-NEXT: [[ADD64:%.*]] = add nsw i32 [[TMP41]], [[MUL63]]
+// CHECK2-NEXT: store i32 [[ADD64]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[MUL65:%.*]] = mul nsw i32 [[TMP44]], 1
+// CHECK2-NEXT: [[ADD66:%.*]] = add nsw i32 0, [[MUL65]]
+// CHECK2-NEXT: store i32 [[ADD66]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP45:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP45]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP67:%.*]] = icmp slt i32 [[TMP46]], [[TMP47]]
+// CHECK2-NEXT: br i1 [[CMP67]], label %[[IF_THEN68:.*]], label %[[IF_END73:.*]]
+// CHECK2: [[IF_THEN68]]:
+// CHECK2-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP49:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL69:%.*]] = mul nsw i32 [[TMP49]], [[TMP50]]
+// CHECK2-NEXT: [[ADD70:%.*]] = add nsw i32 [[TMP48]], [[MUL69]]
+// CHECK2-NEXT: store i32 [[ADD70]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[MUL71:%.*]] = mul nsw i32 [[TMP51]], 2
+// CHECK2-NEXT: [[ADD72:%.*]] = add nsw i32 0, [[MUL71]]
+// CHECK2-NEXT: store i32 [[ADD72]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP52:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP52]])
+// CHECK2-NEXT: br label %[[IF_END73]]
+// CHECK2: [[IF_END73]]:
+// CHECK2-NEXT: br label %[[IF_END74]]
+// CHECK2: [[IF_END74]]:
+// CHECK2-NEXT: [[TMP53:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[TMP54:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[CMP75:%.*]] = icmp slt i64 [[TMP53]], [[TMP54]]
+// CHECK2-NEXT: br i1 [[CMP75]], label %[[IF_THEN76:.*]], label %[[IF_END81:.*]]
+// CHECK2: [[IF_THEN76]]:
+// CHECK2-NEXT: [[TMP55:%.*]] = load i64, ptr [[DOTOMP_LB116]], align 8
+// CHECK2-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTOMP_ST117]], align 8
+// CHECK2-NEXT: [[TMP57:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[MUL77:%.*]] = mul nsw i64 [[TMP56]], [[TMP57]]
+// CHECK2-NEXT: [[ADD78:%.*]] = add nsw i64 [[TMP55]], [[MUL77]]
+// CHECK2-NEXT: store i64 [[ADD78]], ptr [[DOTOMP_IV120]], align 8
+// CHECK2-NEXT: [[TMP58:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[TMP59:%.*]] = load i64, ptr [[DOTOMP_IV120]], align 8
+// CHECK2-NEXT: [[MUL79:%.*]] = mul nsw i64 [[TMP59]], 1
+// CHECK2-NEXT: [[ADD_PTR80:%.*]] = getelementptr inbounds double, ptr [[TMP58]], i64 [[MUL79]]
+// CHECK2-NEXT: store ptr [[ADD_PTR80]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: store ptr [[TMP60]], ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP61:%.*]] = load i32, ptr [[C]], align 4
+// CHECK2-NEXT: [[TMP62:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP63:%.*]] = load double, ptr [[TMP62]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP61]], double noundef [[TMP63]])
+// CHECK2-NEXT: br label %[[IF_END81]]
+// CHECK2: [[IF_END81]]:
+// CHECK2-NEXT: [[TMP64:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[TMP65:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK2-NEXT: [[CMP82:%.*]] = icmp slt i64 [[TMP64]], [[TMP65]]
+// CHECK2-NEXT: br i1 [[CMP82]], label %[[IF_THEN83:.*]], label %[[IF_END88:.*]]
+// CHECK2: [[IF_THEN83]]:
+// CHECK2-NEXT: [[TMP66:%.*]] = load i64, ptr [[DOTOMP_LB2]], align 8
+// CHECK2-NEXT: [[TMP67:%.*]] = load i64, ptr [[DOTOMP_ST2]], align 8
+// CHECK2-NEXT: [[TMP68:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[MUL84:%.*]] = mul nsw i64 [[TMP67]], [[TMP68]]
+// CHECK2-NEXT: [[ADD85:%.*]] = add nsw i64 [[TMP66]], [[MUL84]]
+// CHECK2-NEXT: store i64 [[ADD85]], ptr [[DOTOMP_IV2]], align 8
+// CHECK2-NEXT: [[TMP69:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK2-NEXT: [[TMP70:%.*]] = load i64, ptr [[DOTOMP_IV2]], align 8
+// CHECK2-NEXT: [[MUL86:%.*]] = mul nsw i64 [[TMP70]], 1
+// CHECK2-NEXT: [[ADD_PTR87:%.*]] = getelementptr inbounds double, ptr [[TMP69]], i64 [[MUL86]]
+// CHECK2-NEXT: store ptr [[ADD_PTR87]], ptr [[__BEGIN225]], align 8
+// CHECK2-NEXT: [[TMP71:%.*]] = load ptr, ptr [[__BEGIN225]], align 8
+// CHECK2-NEXT: store ptr [[TMP71]], ptr [[VV]], align 8
+// CHECK2-NEXT: [[TMP72:%.*]] = load i32, ptr [[CC]], align 4
+// CHECK2-NEXT: [[TMP73:%.*]] = load ptr, ptr [[VV]], align 8
+// CHECK2-NEXT: [[TMP74:%.*]] = load double, ptr [[TMP73]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP72]], double noundef [[TMP74]])
+// CHECK2-NEXT: br label %[[IF_END88]]
+// CHECK2: [[IF_END88]]:
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP75:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i64 [[TMP75]], 1
+// CHECK2-NEXT: store i64 [[INC]], ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo4(
+// CHECK2-SAME: ) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[K]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: store i32 64, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 128
+// CHECK2-NEXT: br i1 [[CMP1]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP6]])
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND2:.*]]
+// CHECK2: [[FOR_COND2]]:
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: [[CMP3:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]]
+// CHECK2-NEXT: br i1 [[CMP3]], label %[[FOR_BODY4:.*]], label %[[FOR_END17:.*]]
+// CHECK2: [[FOR_BODY4]]:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP10]], [[TMP11]]
+// CHECK2-NEXT: br i1 [[CMP5]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP13]], [[TMP14]]
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP15]], 2
+// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK2-NEXT: store i32 [[ADD7]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP16]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP8:%.*]] = icmp slt i32 [[TMP17]], [[TMP18]]
+// CHECK2-NEXT: br i1 [[CMP8]], label %[[IF_THEN9:.*]], label %[[IF_END14:.*]]
+// CHECK2: [[IF_THEN9]]:
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL10:%.*]] = mul nsw i32 [[TMP20]], [[TMP21]]
+// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP19]], [[MUL10]]
+// CHECK2-NEXT: store i32 [[ADD11]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[MUL12:%.*]] = mul nsw i32 [[TMP22]], 1
+// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]]
+// CHECK2-NEXT: store i32 [[ADD13]], ptr [[K]], align 4
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[K]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP23]])
+// CHECK2-NEXT: br label %[[IF_END14]]
+// CHECK2: [[IF_END14]]:
+// CHECK2-NEXT: br label %[[FOR_INC15:.*]]
+// CHECK2: [[FOR_INC15]]:
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[INC16:%.*]] = add nsw i32 [[TMP24]], 1
+// CHECK2-NEXT: store i32 [[INC16]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND2]], !llvm.loop [[LOOP7:![0-9]+]]
+// CHECK2: [[FOR_END17]]:
+// CHECK2-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[TMP25:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP25]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP26:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY18:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP26]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY18]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND19:.*]]
+// CHECK2: [[FOR_COND19]]:
+// CHECK2-NEXT: [[TMP27:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP28:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT: [[CMP20:%.*]] = icmp ne ptr [[TMP27]], [[TMP28]]
+// CHECK2-NEXT: br i1 [[CMP20]], label %[[FOR_BODY21:.*]], label %[[FOR_END23:.*]]
+// CHECK2: [[FOR_BODY21]]:
+// CHECK2-NEXT: [[TMP29:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: store ptr [[TMP29]], ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[C]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP32:%.*]] = load double, ptr [[TMP31]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP30]], double noundef [[TMP32]])
+// CHECK2-NEXT: br label %[[FOR_INC22:.*]]
+// CHECK2: [[FOR_INC22]]:
+// CHECK2-NEXT: [[TMP33:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP33]], i32 1
+// CHECK2-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND19]]
+// CHECK2: [[FOR_END23]]:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo5(
+// CHECK2-SAME: ) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB03:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST04:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI05:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV06:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_LB116:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_ST117:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_NI118:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV120:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_TEMP_121:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX22:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX29:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[CC:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE264:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN265:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END267:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[VV:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[K]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: store i32 512, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB03]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST04]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK2-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64
+// CHECK2-NEXT: store i64 [[CONV]], ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP8]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP9]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY7]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY9:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP10]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY9]], ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT: store ptr [[TMP11]], ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK2-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP13]] to i64
+// CHECK2-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK2-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK2-NEXT: [[SUB12:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i64 [[SUB12]], 1
+// CHECK2-NEXT: [[DIV14:%.*]] = sdiv i64 [[ADD13]], 1
+// CHECK2-NEXT: [[SUB15:%.*]] = sub nsw i64 [[DIV14]], 1
+// CHECK2-NEXT: store i64 [[SUB15]], ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_LB116]], align 8
+// CHECK2-NEXT: store i64 1, ptr [[DOTOMP_ST117]], align 8
+// CHECK2-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK2-NEXT: [[ADD19:%.*]] = add nsw i64 [[TMP14]], 1
+// CHECK2-NEXT: store i64 [[ADD19]], ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK2-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[TMP16]], [[TMP17]]
+// CHECK2-NEXT: br i1 [[CMP23]], label %[[COND_TRUE24:.*]], label %[[COND_FALSE25:.*]]
+// CHECK2: [[COND_TRUE24]]:
+// CHECK2-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK2-NEXT: br label %[[COND_END26:.*]]
+// CHECK2: [[COND_FALSE25]]:
+// CHECK2-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: br label %[[COND_END26]]
+// CHECK2: [[COND_END26]]:
+// CHECK2-NEXT: [[COND27:%.*]] = phi i64 [ [[TMP18]], %[[COND_TRUE24]] ], [ [[TMP19]], %[[COND_FALSE25]] ]
+// CHECK2-NEXT: store i64 [[COND27]], ptr [[DOTOMP_FUSE_MAX22]], align 8
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[CMP28:%.*]] = icmp slt i32 [[TMP20]], 128
+// CHECK2-NEXT: br i1 [[CMP28]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP21]])
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP22]], 1
+// CHECK2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND30:.*]]
+// CHECK2: [[FOR_COND30]]:
+// CHECK2-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_FUSE_MAX22]], align 8
+// CHECK2-NEXT: [[CMP31:%.*]] = icmp slt i64 [[TMP23]], [[TMP24]]
+// CHECK2-NEXT: br i1 [[CMP31]], label %[[FOR_BODY32:.*]], label %[[FOR_END63:.*]]
+// CHECK2: [[FOR_BODY32]]:
+// CHECK2-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: [[CMP33:%.*]] = icmp slt i64 [[TMP25]], [[TMP26]]
+// CHECK2-NEXT: br i1 [[CMP33]], label %[[IF_THEN:.*]], label %[[IF_END53:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB03]], align 4
+// CHECK2-NEXT: [[CONV34:%.*]] = sext i32 [[TMP27]] to i64
+// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_ST04]], align 4
+// CHECK2-NEXT: [[CONV35:%.*]] = sext i32 [[TMP28]] to i64
+// CHECK2-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV35]], [[TMP29]]
+// CHECK2-NEXT: [[ADD36:%.*]] = add nsw i64 [[CONV34]], [[MUL]]
+// CHECK2-NEXT: [[CONV37:%.*]] = trunc i64 [[ADD36]] to i32
+// CHECK2-NEXT: store i32 [[CONV37]], ptr [[DOTOMP_IV06]], align 4
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV06]], align 4
+// CHECK2-NEXT: [[MUL38:%.*]] = mul nsw i32 [[TMP30]], 1
+// CHECK2-NEXT: [[ADD39:%.*]] = add nsw i32 0, [[MUL38]]
+// CHECK2-NEXT: store i32 [[ADD39]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP40:%.*]] = icmp slt i32 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: br i1 [[CMP40]], label %[[IF_THEN41:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN41]]:
+// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL42:%.*]] = mul nsw i32 [[TMP34]], [[TMP35]]
+// CHECK2-NEXT: [[ADD43:%.*]] = add nsw i32 [[TMP33]], [[MUL42]]
+// CHECK2-NEXT: store i32 [[ADD43]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[MUL44:%.*]] = mul nsw i32 [[TMP36]], 2
+// CHECK2-NEXT: [[ADD45:%.*]] = add nsw i32 0, [[MUL44]]
+// CHECK2-NEXT: store i32 [[ADD45]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP37]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP46:%.*]] = icmp slt i32 [[TMP38]], [[TMP39]]
+// CHECK2-NEXT: br i1 [[CMP46]], label %[[IF_THEN47:.*]], label %[[IF_END52:.*]]
+// CHECK2: [[IF_THEN47]]:
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL48:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]]
+// CHECK2-NEXT: [[ADD49:%.*]] = add nsw i32 [[TMP40]], [[MUL48]]
+// CHECK2-NEXT: store i32 [[ADD49]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[MUL50:%.*]] = mul nsw i32 [[TMP43]], 1
+// CHECK2-NEXT: [[ADD51:%.*]] = add nsw i32 0, [[MUL50]]
+// CHECK2-NEXT: store i32 [[ADD51]], ptr [[K]], align 4
+// CHECK2-NEXT: [[TMP44:%.*]] = load i32, ptr [[K]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP44]])
+// CHECK2-NEXT: br label %[[IF_END52]]
+// CHECK2: [[IF_END52]]:
+// CHECK2-NEXT: br label %[[IF_END53]]
+// CHECK2: [[IF_END53]]:
+// CHECK2-NEXT: [[TMP45:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[TMP46:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[CMP54:%.*]] = icmp slt i64 [[TMP45]], [[TMP46]]
+// CHECK2-NEXT: br i1 [[CMP54]], label %[[IF_THEN55:.*]], label %[[IF_END60:.*]]
+// CHECK2: [[IF_THEN55]]:
+// CHECK2-NEXT: [[TMP47:%.*]] = load i64, ptr [[DOTOMP_LB116]], align 8
+// CHECK2-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTOMP_ST117]], align 8
+// CHECK2-NEXT: [[TMP49:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[MUL56:%.*]] = mul nsw i64 [[TMP48]], [[TMP49]]
+// CHECK2-NEXT: [[ADD57:%.*]] = add nsw i64 [[TMP47]], [[MUL56]]
+// CHECK2-NEXT: store i64 [[ADD57]], ptr [[DOTOMP_IV120]], align 8
+// CHECK2-NEXT: [[TMP50:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[TMP51:%.*]] = load i64, ptr [[DOTOMP_IV120]], align 8
+// CHECK2-NEXT: [[MUL58:%.*]] = mul nsw i64 [[TMP51]], 1
+// CHECK2-NEXT: [[ADD_PTR59:%.*]] = getelementptr inbounds double, ptr [[TMP50]], i64 [[MUL58]]
+// CHECK2-NEXT: store ptr [[ADD_PTR59]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP52:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: store ptr [[TMP52]], ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP53:%.*]] = load i32, ptr [[C]], align 4
+// CHECK2-NEXT: [[TMP54:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP55:%.*]] = load double, ptr [[TMP54]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP53]], double noundef [[TMP55]])
+// CHECK2-NEXT: br label %[[IF_END60]]
+// CHECK2: [[IF_END60]]:
+// CHECK2-NEXT: br label %[[FOR_INC61:.*]]
+// CHECK2: [[FOR_INC61]]:
+// CHECK2-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[INC62:%.*]] = add nsw i64 [[TMP56]], 1
+// CHECK2-NEXT: store i64 [[INC62]], ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND30]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK2: [[FOR_END63]]:
+// CHECK2-NEXT: store i32 37, ptr [[CC]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE264]], align 8
+// CHECK2-NEXT: [[TMP57:%.*]] = load ptr, ptr [[__RANGE264]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY66:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP57]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY66]], ptr [[__BEGIN265]], align 8
+// CHECK2-NEXT: [[TMP58:%.*]] = load ptr, ptr [[__RANGE264]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY68:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP58]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR69:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY68]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR69]], ptr [[__END267]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND70:.*]]
+// CHECK2: [[FOR_COND70]]:
+// CHECK2-NEXT: [[TMP59:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK2-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__END267]], align 8
+// CHECK2-NEXT: [[CMP71:%.*]] = icmp ne ptr [[TMP59]], [[TMP60]]
+// CHECK2-NEXT: br i1 [[CMP71]], label %[[FOR_BODY72:.*]], label %[[FOR_END74:.*]]
+// CHECK2: [[FOR_BODY72]]:
+// CHECK2-NEXT: [[TMP61:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK2-NEXT: store ptr [[TMP61]], ptr [[VV]], align 8
+// CHECK2-NEXT: [[TMP62:%.*]] = load i32, ptr [[CC]], align 4
+// CHECK2-NEXT: [[TMP63:%.*]] = load ptr, ptr [[VV]], align 8
+// CHECK2-NEXT: [[TMP64:%.*]] = load double, ptr [[TMP63]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP62]], double noundef [[TMP64]])
+// CHECK2-NEXT: br label %[[FOR_INC73:.*]]
+// CHECK2: [[FOR_INC73]]:
+// CHECK2-NEXT: [[TMP65:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP65]], i32 1
+// CHECK2-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN265]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND70]]
+// CHECK2: [[FOR_END74]]:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @tfoo2(
+// CHECK2-SAME: ) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: call void @_Z4foo2IiEvT_S0_S0_(i32 noundef 0, i32 noundef 64, i32 noundef 4)
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define linkonce_odr void @_Z4foo2IiEvT_S0_S0_(
+// CHECK2-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] comdat {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[STEP_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_7:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP8:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_17:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_19:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP21:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_22:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK2-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK2-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
+// CHECK2-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP9]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP12]], ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK2-NEXT: [[SUB10:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+// CHECK2-NEXT: [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[ADD12:%.*]] = add i32 [[SUB11]], [[TMP15]]
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[DIV13:%.*]] = udiv i32 [[ADD12]], [[TMP16]]
+// CHECK2-NEXT: [[SUB14:%.*]] = sub i32 [[DIV13]], 1
+// CHECK2-NEXT: store i32 [[SUB14]], ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK2-NEXT: [[ADD15:%.*]] = add i32 [[TMP17]], 1
+// CHECK2-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK2-NEXT: store i32 [[ADD16]], ptr [[K]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: [[ADD18:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK2-NEXT: store i32 [[ADD18]], ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: [[ADD20:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK2-NEXT: store i32 [[ADD20]], ptr [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP24]], ptr [[DOTNEW_STEP21]], align 4
+// CHECK2-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK2-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK2-NEXT: [[SUB23:%.*]] = sub i32 [[TMP25]], [[TMP26]]
+// CHECK2-NEXT: [[SUB24:%.*]] = sub i32 [[SUB23]], 1
+// CHECK2-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK2-NEXT: [[ADD25:%.*]] = add i32 [[SUB24]], [[TMP27]]
+// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK2-NEXT: [[DIV26:%.*]] = udiv i32 [[ADD25]], [[TMP28]]
+// CHECK2-NEXT: [[SUB27:%.*]] = sub i32 [[DIV26]], 1
+// CHECK2-NEXT: store i32 [[SUB27]], ptr [[DOTCAPTURE_EXPR_22]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB2]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST2]], align 4
+// CHECK2-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_22]], align 4
+// CHECK2-NEXT: [[ADD28:%.*]] = add i32 [[TMP29]], 1
+// CHECK2-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_NI2]], align 4
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP30]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP33]], %[[COND_TRUE]] ], [ [[TMP34]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK2-NEXT: [[CMP29:%.*]] = icmp ugt i32 [[TMP35]], [[TMP36]]
+// CHECK2-NEXT: br i1 [[CMP29]], label %[[COND_TRUE30:.*]], label %[[COND_FALSE31:.*]]
+// CHECK2: [[COND_TRUE30]]:
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK2-NEXT: br label %[[COND_END32:.*]]
+// CHECK2: [[COND_FALSE31]]:
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK2-NEXT: br label %[[COND_END32]]
+// CHECK2: [[COND_END32]]:
+// CHECK2-NEXT: [[COND33:%.*]] = phi i32 [ [[TMP37]], %[[COND_TRUE30]] ], [ [[TMP38]], %[[COND_FALSE31]] ]
+// CHECK2-NEXT: store i32 [[COND33]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: [[CMP34:%.*]] = icmp ult i32 [[TMP39]], [[TMP40]]
+// CHECK2-NEXT: br i1 [[CMP34]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP35:%.*]] = icmp ult i32 [[TMP41]], [[TMP42]]
+// CHECK2-NEXT: br i1 [[CMP35]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP45:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul i32 [[TMP44]], [[TMP45]]
+// CHECK2-NEXT: [[ADD36:%.*]] = add i32 [[TMP43]], [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD36]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[MUL37:%.*]] = mul i32 [[TMP47]], [[TMP48]]
+// CHECK2-NEXT: [[ADD38:%.*]] = add i32 [[TMP46]], [[MUL37]]
+// CHECK2-NEXT: store i32 [[ADD38]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP49:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP49]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP39:%.*]] = icmp ult i32 [[TMP50]], [[TMP51]]
+// CHECK2-NEXT: br i1 [[CMP39]], label %[[IF_THEN40:.*]], label %[[IF_END45:.*]]
+// CHECK2: [[IF_THEN40]]:
+// CHECK2-NEXT: [[TMP52:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP53:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP54:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL41:%.*]] = mul i32 [[TMP53]], [[TMP54]]
+// CHECK2-NEXT: [[ADD42:%.*]] = add i32 [[TMP52]], [[MUL41]]
+// CHECK2-NEXT: store i32 [[ADD42]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[MUL43:%.*]] = mul i32 [[TMP56]], [[TMP57]]
+// CHECK2-NEXT: [[SUB44:%.*]] = sub i32 [[TMP55]], [[MUL43]]
+// CHECK2-NEXT: store i32 [[SUB44]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP58:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP58]])
+// CHECK2-NEXT: br label %[[IF_END45]]
+// CHECK2: [[IF_END45]]:
+// CHECK2-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK2-NEXT: [[CMP46:%.*]] = icmp ult i32 [[TMP59]], [[TMP60]]
+// CHECK2-NEXT: br i1 [[CMP46]], label %[[IF_THEN47:.*]], label %[[IF_END52:.*]]
+// CHECK2: [[IF_THEN47]]:
+// CHECK2-NEXT: [[TMP61:%.*]] = load i32, ptr [[DOTOMP_LB2]], align 4
+// CHECK2-NEXT: [[TMP62:%.*]] = load i32, ptr [[DOTOMP_ST2]], align 4
+// CHECK2-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL48:%.*]] = mul i32 [[TMP62]], [[TMP63]]
+// CHECK2-NEXT: [[ADD49:%.*]] = add i32 [[TMP61]], [[MUL48]]
+// CHECK2-NEXT: store i32 [[ADD49]], ptr [[DOTOMP_IV2]], align 4
+// CHECK2-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK2-NEXT: [[TMP65:%.*]] = load i32, ptr [[DOTOMP_IV2]], align 4
+// CHECK2-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK2-NEXT: [[MUL50:%.*]] = mul i32 [[TMP65]], [[TMP66]]
+// CHECK2-NEXT: [[ADD51:%.*]] = add i32 [[TMP64]], [[MUL50]]
+// CHECK2-NEXT: store i32 [[ADD51]], ptr [[K]], align 4
+// CHECK2-NEXT: [[TMP67:%.*]] = load i32, ptr [[K]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP67]])
+// CHECK2-NEXT: br label %[[IF_END52]]
+// CHECK2: [[IF_END52]]:
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP68:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add i32 [[TMP68]], 1
+// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: ret void
+//
+//.
+// CHECK1: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
+// CHECK1: [[META4]] = !{!"llvm.loop.mustprogress"}
+// CHECK1: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]}
+// CHECK1: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]}
+// CHECK1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]}
+// CHECK1: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]}
+// CHECK1: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]}
+// CHECK1: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]}
+//.
+// CHECK2: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
+// CHECK2: [[META4]] = !{!"llvm.loop.mustprogress"}
+// CHECK2: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]}
+// CHECK2: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]}
+// CHECK2: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]}
+// CHECK2: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]}
+// CHECK2: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]}
+// CHECK2: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]}
+//.
diff --git a/clang/test/OpenMP/fuse_messages.cpp b/clang/test/OpenMP/fuse_messages.cpp
new file mode 100644
index 0000000000000..b86ce95cfe9bc
--- /dev/null
+++ b/clang/test/OpenMP/fuse_messages.cpp
@@ -0,0 +1,209 @@
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++20 -fopenmp -fopenmp-version=60 -fsyntax-only -Wuninitialized -verify %s
+
+void func() {
+
+ // expected-error at +2 {{statement after '#pragma omp fuse' must be a loop sequence containing canonical loops or loop-generating constructs}}
+ #pragma omp fuse
+ ;
+
+ // expected-error at +2 {{statement after '#pragma omp fuse' must be a for loop}}
+ #pragma omp fuse
+ {int bar = 0;}
+
+ // expected-error at +4 {{statement after '#pragma omp fuse' must be a for loop}}
+ #pragma omp fuse
+ {
+ for(int i = 0; i < 10; ++i);
+ int x = 2;
+ }
+
+ // expected-error at +2 {{statement after '#pragma omp fuse' must be a loop sequence containing canonical loops or loop-generating constructs}}
+ #pragma omp fuse
+ #pragma omp for
+ for (int i = 0; i < 7; ++i)
+ ;
+
+ {
+ // expected-error at +2 {{expected statement}}
+ #pragma omp fuse
+ }
+
+ // expected-warning at +1 {{extra tokens at the end of '#pragma omp fuse' are ignored}}
+ #pragma omp fuse foo
+ {
+ for (int i = 0; i < 7; ++i)
+ ;
+ for(int j = 0; j < 100; ++j);
+
+ }
+
+
+ // expected-error at +1 {{unexpected OpenMP clause 'final' in directive '#pragma omp fuse'}}
+ #pragma omp fuse final(0)
+ {
+ for (int i = 0; i < 7; ++i)
+ ;
+ for(int j = 0; j < 100; ++j);
+
+ }
+
+ //expected-error at +3 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'i'}}
+ #pragma omp fuse
+ {
+ for(int i = 0; i < 10; i*=2) {
+ ;
+ }
+ for(int j = 0; j < 100; ++j);
+ }
+
+ //expected-error at +2 {{loop sequence after '#pragma omp fuse' must contain at least 1 canonical loop or loop-generating construct}}
+ #pragma omp fuse
+ {}
+
+ //expected-error at +3 {{statement after '#pragma omp fuse' must be a for loop}}
+ #pragma omp fuse
+ {
+ #pragma omp unroll full
+ for(int i = 0; i < 10; ++i);
+
+ for(int j = 0; j < 10; ++j);
+ }
+
+ //expected-warning at +2 {{looprange clause selects a single loop, resulting in redundant fusion}}
+ #pragma omp fuse
+ {
+ for(int i = 0; i < 10; ++i);
+ }
+
+ //expected-warning at +1 {{looprange clause selects a single loop, resulting in redundant fusion}}
+ #pragma omp fuse looprange(1, 1)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ }
+
+ //expected-error at +1 {{argument to 'looprange' clause must be a strictly positive integer value}}
+ #pragma omp fuse looprange(1, -1)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ }
+
+ //expected-error at +1 {{argument to 'looprange' clause must be a strictly positive integer value}}
+ #pragma omp fuse looprange(1, 0)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ }
+
+ const int x = 1;
+ constexpr int y = 4;
+ //expected-error at +1 {{looprange clause selects loops from 1 to 4 but this exceeds the number of loops (3) in the loop sequence}}
+ #pragma omp fuse looprange(x,y)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+
+ //expected-error at +1 {{looprange clause selects loops from 1 to 420 but this exceeds the number of loops (3) in the loop sequence}}
+ #pragma omp fuse looprange(1,420)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+
+ //expected-error at +1 {{looprange clause selects loops from 1 to 6 but this exceeds the number of loops (5) in the loop sequence}}
+ #pragma omp fuse looprange(1,6)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ // This fusion results in 2 loops
+ #pragma omp fuse looprange(1,2)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+ }
+
+ //expected-error at +1 {{looprange clause selects loops from 2 to 4 but this exceeds the number of loops (3) in the loop sequence}}
+ #pragma omp fuse looprange(2,3)
+ {
+ #pragma omp unroll partial(2)
+ for(int i = 0; i < 10; ++i);
+
+ #pragma omp reverse
+ for(int j = 0; j < 10; ++j);
+
+ #pragma omp fuse
+ {
+ {
+ #pragma omp reverse
+ for(int j = 0; j < 10; ++j);
+ }
+ for(int k = 0; k < 50; ++k);
+ }
+ }
+}
+
+// In a template context, but expression itself not instantiation-dependent
+template <typename T>
+static void templated_func() {
+
+ //expected-warning at +1 {{looprange clause selects a single loop, resulting in redundant fusion}}
+ #pragma omp fuse looprange(2,1)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+
+ //expected-error at +1 {{looprange clause selects loops from 3 to 5 but this exceeds the number of loops (3) in the loop sequence}}
+ #pragma omp fuse looprange(3,3)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+
+}
+
+template <int V>
+static void templated_func_value_dependent() {
+
+ //expected-warning at +1 {{looprange clause selects a single loop, resulting in redundant fusion}}
+ #pragma omp fuse looprange(V,1)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+}
+
+template <typename T>
+static void templated_func_type_dependent() {
+ constexpr T s = 1;
+
+ //expected-error at +1 {{argument to 'looprange' clause must be a strictly positive integer value}}
+ #pragma omp fuse looprange(s,s-1)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+}
+
+
+void template_inst() {
+ // expected-note at +1 {{in instantiation of function template specialization 'templated_func<int>' requested here}}
+ templated_func<int>();
+ // expected-note at +1 {{in instantiation of function template specialization 'templated_func_value_dependent<1>' requested here}}
+ templated_func_value_dependent<1>();
+ // expected-note at +1 {{in instantiation of function template specialization 'templated_func_type_dependent<int>' requested here}}
+ templated_func_type_dependent<int>();
+}
+
+
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 0ed029c39885f..ea0f7895528cb 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2161,6 +2161,9 @@ class EnqueueVisitor : public ConstStmtVisitor<EnqueueVisitor, void>,
void VisitOMPUnrollDirective(const OMPUnrollDirective *D);
void VisitOMPReverseDirective(const OMPReverseDirective *D);
void VisitOMPInterchangeDirective(const OMPInterchangeDirective *D);
+ void VisitOMPCanonicalLoopSequenceTransformationDirective(
+ const OMPCanonicalLoopSequenceTransformationDirective *D);
+ void VisitOMPFuseDirective(const OMPFuseDirective *D);
void VisitOMPForDirective(const OMPForDirective *D);
void VisitOMPForSimdDirective(const OMPForSimdDirective *D);
void VisitOMPSectionsDirective(const OMPSectionsDirective *D);
@@ -2366,6 +2369,11 @@ void OMPClauseEnqueue::VisitOMPPartialClause(const OMPPartialClause *C) {
Visitor->AddStmt(C->getFactor());
}
+void OMPClauseEnqueue::VisitOMPLoopRangeClause(const OMPLoopRangeClause *C) {
+ Visitor->AddStmt(C->getFirst());
+ Visitor->AddStmt(C->getCount());
+}
+
void OMPClauseEnqueue::VisitOMPAllocatorClause(const OMPAllocatorClause *C) {
Visitor->AddStmt(C->getAllocator());
}
@@ -3330,6 +3338,15 @@ void EnqueueVisitor::VisitOMPInterchangeDirective(
VisitOMPCanonicalLoopNestTransformationDirective(D);
}
+void EnqueueVisitor::VisitOMPCanonicalLoopSequenceTransformationDirective(
+ const OMPCanonicalLoopSequenceTransformationDirective *D) {
+ VisitOMPExecutableDirective(D);
+}
+
+void EnqueueVisitor::VisitOMPFuseDirective(const OMPFuseDirective *D) {
+ VisitOMPCanonicalLoopSequenceTransformationDirective(D);
+}
+
void EnqueueVisitor::VisitOMPForDirective(const OMPForDirective *D) {
VisitOMPLoopDirective(D);
}
@@ -6287,6 +6304,8 @@ CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) {
return cxstring::createRef("OMPReverseDirective");
case CXCursor_OMPInterchangeDirective:
return cxstring::createRef("OMPInterchangeDirective");
+ case CXCursor_OMPFuseDirective:
+ return cxstring::createRef("OMPFuseDirective");
case CXCursor_OMPForDirective:
return cxstring::createRef("OMPForDirective");
case CXCursor_OMPForSimdDirective:
diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp
index 3c4062410eac1..56f113c1dc309 100644
--- a/clang/tools/libclang/CXCursor.cpp
+++ b/clang/tools/libclang/CXCursor.cpp
@@ -687,6 +687,9 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent,
case Stmt::OMPInterchangeDirectiveClass:
K = CXCursor_OMPInterchangeDirective;
break;
+ case Stmt::OMPFuseDirectiveClass:
+ K = CXCursor_OMPFuseDirective;
+ break;
case Stmt::OMPForDirectiveClass:
K = CXCursor_OMPForDirective;
break;
diff --git a/flang/include/flang/Lower/OpenMP/Clauses.h b/flang/include/flang/Lower/OpenMP/Clauses.h
index 1ab594ffcd209..2decc8f815cd8 100644
--- a/flang/include/flang/Lower/OpenMP/Clauses.h
+++ b/flang/include/flang/Lower/OpenMP/Clauses.h
@@ -241,6 +241,7 @@ using Initializer = tomp::clause::InitializerT<TypeTy, IdTy, ExprTy>;
using InReduction = tomp::clause::InReductionT<TypeTy, IdTy, ExprTy>;
using IsDevicePtr = tomp::clause::IsDevicePtrT<TypeTy, IdTy, ExprTy>;
using Lastprivate = tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy>;
+using LoopRange = tomp::clause::LoopRangeT<TypeTy, IdTy, ExprTy>;
using Linear = tomp::clause::LinearT<TypeTy, IdTy, ExprTy>;
using Link = tomp::clause::LinkT<TypeTy, IdTy, ExprTy>;
using Map = tomp::clause::MapT<TypeTy, IdTy, ExprTy>;
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 27be500f6b054..a2dc2620ca402 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -613,6 +613,7 @@ class ParseTreeDumper {
NODE_ENUM(OmpLinearModifier, Value)
NODE(parser, OmpLocator)
NODE(parser, OmpLocatorList)
+ NODE(parser, OmpLoopRangeClause)
NODE(parser, OmpLoopDirective)
NODE(parser, OmpMapClause)
NODE(OmpMapClause, Modifier)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 61fdcfe37172e..28f11e3b88d51 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -4510,6 +4510,15 @@ struct OmpLinearClause {
std::tuple<OmpObjectList, MODIFIERS(), /*PostModified=*/bool> t;
};
+// Ref: [6.0:207-208]
+//
+// loop-range-clause ->
+// LOOPRANGE(first, count) // since 6.0
+struct OmpLoopRangeClause {
+ TUPLE_CLASS_BOILERPLATE(OmpLoopRangeClause);
+ std::tuple<ScalarIntConstantExpr, ScalarIntConstantExpr> t;
+};
+
// Ref: [4.5:216-219], [5.0:315-324], [5.1:347-355], [5.2:150-158]
//
// map-clause ->
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index 1a16e1c87e250..1b7dc8cdde1b1 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -1031,6 +1031,11 @@ Link make(const parser::OmpClause::Link &inp,
return Link{/*List=*/makeObjects(inp.v, semaCtx)};
}
+LoopRange make(const parser::OmpClause::Looprange &inp,
+ semantics::SemanticsContext &semaCtx) {
+ llvm_unreachable("Unimplemented: looprange");
+}
+
Map make(const parser::OmpClause::Map &inp,
semantics::SemanticsContext &semaCtx) {
// inp.v -> parser::OmpMapClause
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index ce46a8605e34a..5612b61fac505 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -952,6 +952,9 @@ TYPE_PARSER(
maybe(":"_tok >> nonemptyList(Parser<OmpLinearClause::Modifier>{})),
/*PostModified=*/pure(true)))
+TYPE_PARSER(construct<OmpLoopRangeClause>(
+ scalarIntConstantExpr, "," >> scalarIntConstantExpr))
+
// OpenMPv5.2 12.5.2 detach-clause -> DETACH (event-handle)
TYPE_PARSER(construct<OmpDetachClause>(Parser<OmpObject>{}))
@@ -1128,6 +1131,8 @@ TYPE_PARSER( //
parenthesized(Parser<OmpLinearClause>{}))) ||
"LINK" >> construct<OmpClause>(construct<OmpClause::Link>(
parenthesized(Parser<OmpObjectList>{}))) ||
+ "LOOPRANGE" >> construct<OmpClause>(construct<OmpClause::Looprange>(
+ parenthesized(Parser<OmpLoopRangeClause>{}))) ||
"MAP" >> construct<OmpClause>(construct<OmpClause::Map>(
parenthesized(Parser<OmpMapClause>{}))) ||
"MATCH" >> construct<OmpClause>(construct<OmpClause::Match>(
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index dc6d33607146b..732c6bb0f3e05 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2328,6 +2328,13 @@ class UnparseVisitor {
}
}
}
+ void Unparse(const OmpLoopRangeClause &x) {
+ Word("LOOPRANGE(");
+ Walk(std::get<0>(x.t));
+ Put(", ");
+ Walk(std::get<1>(x.t));
+ Put(")");
+ }
void Unparse(const OmpReductionClause &x) {
using Modifier = OmpReductionClause::Modifier;
Walk(std::get<std::optional<std::list<Modifier>>>(x.t), ": ");
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 85d79a00d9aff..deda81354c721 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -2856,6 +2856,12 @@ CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Collapse, OMPC_collapse)
CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Safelen, OMPC_safelen)
CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Simdlen, OMPC_simdlen)
+void OmpStructureChecker::Enter(const parser::OmpClause::Looprange &x) {
+ context_.Say(GetContext().clauseSource,
+ "LOOPRANGE clause is not implemented yet"_err_en_US,
+ ContextDirectiveAsFortran());
+}
+
// Restrictions specific to each clause are implemented apart from the
// generalized restrictions.
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
index 8ea50e7e8d416..26e6b699b02c1 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
@@ -1244,6 +1244,15 @@ struct WriteT {
using EmptyTrait = std::true_type;
};
+// V6: [6.4.7] Looprange clause
+template <typename T, typename I, typename E> struct LoopRangeT {
+ using Begin = E;
+ using End = E;
+
+ using TupleTrait = std::true_type;
+ std::tuple<Begin, End> t;
+};
+
// ---
template <typename T, typename I, typename E>
@@ -1275,8 +1284,8 @@ using TupleClausesT =
DoacrossT<T, I, E>, DynGroupprivateT<T, I, E>, FromT<T, I, E>,
GrainsizeT<T, I, E>, IfT<T, I, E>, InitT<T, I, E>,
InReductionT<T, I, E>, LastprivateT<T, I, E>, LinearT<T, I, E>,
- MapT<T, I, E>, NumTasksT<T, I, E>, OrderT<T, I, E>,
- ReductionT<T, I, E>, ScheduleT<T, I, E>,
+ LoopRangeT<T, I, E>, MapT<T, I, E>, NumTasksT<T, I, E>,
+ OrderT<T, I, E>, ReductionT<T, I, E>, ScheduleT<T, I, E>,
TaskReductionT<T, I, E>, ToT<T, I, E>>;
template <typename T, typename I, typename E>
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 036b932acff9e..320ebf017d73b 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -277,6 +277,10 @@ def OMPC_Linear : Clause<[Spelling<"linear">]> {
def OMPC_Link : Clause<[Spelling<"link">]> {
let flangClass = "OmpObjectList";
}
+def OMPC_LoopRange : Clause<[Spelling<"looprange">]> {
+ let clangClass = "OMPLoopRangeClause";
+ let flangClass = "OmpLoopRangeClause";
+}
def OMPC_Map : Clause<[Spelling<"map">]> {
let clangClass = "OMPMapClause";
let flangClass = "OmpMapClause";
@@ -884,6 +888,11 @@ def OMP_Groupprivate : Directive<[Spelling<"groupprivate">]> {
let category = CA_Declarative;
let languages = [L_C, L_Fortran];
}
+def OMP_Fuse : Directive<[Spelling<"fuse">]> {
+ let allowedOnceClauses = [VersionedClause<OMPC_LoopRange, 60>];
+ let association = AS_Block;
+ let category = CA_Executable;
+}
def OMP_Interchange : Directive<[Spelling<"interchange">]> {
let allowedOnceClauses = [
VersionedClause<OMPC_Permutation>,
diff --git a/openmp/runtime/test/transform/fuse/foreach.cpp b/openmp/runtime/test/transform/fuse/foreach.cpp
new file mode 100644
index 0000000000000..176465b201faa
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/foreach.cpp
@@ -0,0 +1,191 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+
+ void print(const char *msg) const { owner->print(msg); }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+#pragma omp fuse
+ {
+ for (Reporter a{"C"}; auto &&v : Reporter("A"))
+ printf("v=%d\n", v);
+ for (Reporter aa{"D"}; auto &&vv : Reporter("B"))
+ printf("vv=%d\n", vv);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+// CHECK: [C] ctor
+// CHECK-NEXT: [A] ctor
+// CHECK-NEXT: [A] end()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] iterator distance: 3
+// CHECK-NEXT: [D] ctor
+// CHECK-NEXT: [B] ctor
+// CHECK-NEXT: [B] end()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] iterator distance: 3
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: v=0
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: vv=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: v=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: vv=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: v=2
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: vv=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] dtor
+// CHECK-NEXT: [D] dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] dtor
+// CHECK-NEXT: [C] dtor
+// CHECK-NEXT: done
+
+#endif
diff --git a/openmp/runtime/test/transform/fuse/intfor.c b/openmp/runtime/test/transform/fuse/intfor.c
new file mode 100644
index 0000000000000..b8171b4df7042
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/intfor.c
@@ -0,0 +1,50 @@
+// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+
+int main() {
+ printf("do\n");
+#pragma omp fuse
+ {
+ for (int i = 5; i <= 25; i += 5)
+ printf("i=%d\n", i);
+ for (int j = 10; j < 100; j += 10)
+ printf("j=%d\n", j);
+ for (int k = 10; k > 0; --k)
+ printf("k=%d\n", k);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=5
+// CHECK-NEXT: j=10
+// CHECK-NEXT: k=10
+// CHECK-NEXT: i=10
+// CHECK-NEXT: j=20
+// CHECK-NEXT: k=9
+// CHECK-NEXT: i=15
+// CHECK-NEXT: j=30
+// CHECK-NEXT: k=8
+// CHECK-NEXT: i=20
+// CHECK-NEXT: j=40
+// CHECK-NEXT: k=7
+// CHECK-NEXT: i=25
+// CHECK-NEXT: j=50
+// CHECK-NEXT: k=6
+// CHECK-NEXT: j=60
+// CHECK-NEXT: k=5
+// CHECK-NEXT: j=70
+// CHECK-NEXT: k=4
+// CHECK-NEXT: j=80
+// CHECK-NEXT: k=3
+// CHECK-NEXT: j=90
+// CHECK-NEXT: k=2
+// CHECK-NEXT: k=1
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/fuse/iterfor.cpp b/openmp/runtime/test/transform/fuse/iterfor.cpp
new file mode 100644
index 0000000000000..552484b2981c4
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/iterfor.cpp
@@ -0,0 +1,194 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ bool operator!=(const Iterator &that) const {
+ owner->print("iterator %d != %d", 2 - this->pos, 2 - that.pos);
+ return this->pos != that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+ Reporter C("C");
+ Reporter D("D");
+#pragma omp fuse
+ {
+ for (auto it = C.begin(); it != C.end(); ++it)
+ printf("v=%d\n", *it);
+
+ for (auto it = D.begin(); it != D.end(); ++it)
+ printf("vv=%d\n", *it);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK: [C] ctor
+// CHECK-NEXT: [D] ctor
+// CHECK-NEXT: [C] begin()
+// CHECK-NEXT: [C] begin()
+// CHECK-NEXT: [C] end()
+// CHECK-NEXT: [C] iterator distance: 3
+// CHECK-NEXT: [D] begin()
+// CHECK-NEXT: [D] begin()
+// CHECK-NEXT: [D] end()
+// CHECK-NEXT: [D] iterator distance: 3
+// CHECK-NEXT: [C] iterator advance: 0 += 0
+// CHECK-NEXT: [C] iterator move assign
+// CHECK-NEXT: [C] iterator deref: 0
+// CHECK-NEXT: v=0
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [D] iterator advance: 0 += 0
+// CHECK-NEXT: [D] iterator move assign
+// CHECK-NEXT: [D] iterator deref: 0
+// CHECK-NEXT: vv=0
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [C] iterator advance: 0 += 1
+// CHECK-NEXT: [C] iterator move assign
+// CHECK-NEXT: [C] iterator deref: 1
+// CHECK-NEXT: v=1
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [D] iterator advance: 0 += 1
+// CHECK-NEXT: [D] iterator move assign
+// CHECK-NEXT: [D] iterator deref: 1
+// CHECK-NEXT: vv=1
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [C] iterator advance: 0 += 2
+// CHECK-NEXT: [C] iterator move assign
+// CHECK-NEXT: [C] iterator deref: 2
+// CHECK-NEXT: v=2
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [D] iterator advance: 0 += 2
+// CHECK-NEXT: [D] iterator move assign
+// CHECK-NEXT: [D] iterator deref: 2
+// CHECK-NEXT: vv=2
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: done
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [D] dtor
+// CHECK-NEXT: [C] dtor
diff --git a/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp
new file mode 100644
index 0000000000000..dcbbdf1b6734e
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp
@@ -0,0 +1,207 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+
+ void print(const char *msg) const { owner->print(msg); }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+#pragma omp parallel for collapse(2) num_threads(1)
+ for (int i = 0; i < 3; ++i)
+#pragma omp fuse
+ {
+ for (Reporter c{"init-stmt"}; auto &&v : Reporter("range"))
+ printf("i=%d v=%d\n", i, v);
+ for (int vv = 0; vv < 3; ++vv)
+ printf("i=%d vv=%d\n", i, vv);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: [init-stmt] ctor
+// CHECK-NEXT: [range] ctor
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] iterator distance: 3
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=0 v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=0 vv=0
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=0 v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=0 vv=1
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=0 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=0 vv=2
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=1 v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=1 vv=0
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=1 v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=1 vv=1
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=1 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=1 vv=2
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=2 v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=2 vv=0
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=2 v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=2 vv=1
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=2 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=2 vv=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] dtor
+// CHECK-NEXT: [init-stmt] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c
new file mode 100644
index 0000000000000..9630fec50bc20
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c
@@ -0,0 +1,45 @@
+// RUN: %libomp-cxx-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdio>
+
+int main() {
+ printf("do\n");
+#pragma omp parallel for collapse(2) num_threads(1)
+ for (int i = 0; i < 3; ++i)
+#pragma omp fuse
+ {
+ for (int j = 0; j < 3; ++j)
+ printf("i=%d j=%d\n", i, j);
+ for (int k = 0; k < 3; ++k)
+ printf("i=%d k=%d\n", i, k);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=0 j=0
+// CHECK-NEXT: i=0 k=0
+// CHECK-NEXT: i=0 j=1
+// CHECK-NEXT: i=0 k=1
+// CHECK-NEXT: i=0 j=2
+// CHECK-NEXT: i=0 k=2
+// CHECK-NEXT: i=1 j=0
+// CHECK-NEXT: i=1 k=0
+// CHECK-NEXT: i=1 j=1
+// CHECK-NEXT: i=1 k=1
+// CHECK-NEXT: i=1 j=2
+// CHECK-NEXT: i=1 k=2
+// CHECK-NEXT: i=2 j=0
+// CHECK-NEXT: i=2 k=0
+// CHECK-NEXT: i=2 j=1
+// CHECK-NEXT: i=2 k=1
+// CHECK-NEXT: i=2 j=2
+// CHECK-NEXT: i=2 k=2
+// CHECK-NEXT: done
More information about the llvm-commits
mailing list