[clang] [flang] [llvm] [openmp] [Clang][OpenMP][LoopTransformations] Add support for "#pragma omp fuse" loop transformation directive and "looprange" clause (PR #139293)
Roger Ferrer Ibáñez via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 16 03:34:29 PDT 2025
https://github.com/rofirrim updated https://github.com/llvm/llvm-project/pull/139293
>From 6ae86eca1603231edf203f368aa7930a1cdc7e79 Mon Sep 17 00:00:00 2001
From: eZWALT <waltertheshadow333 at gmail.com>
Date: Mon, 8 Sep 2025 08:55:24 +0000
Subject: [PATCH] [Clang][OpenMP] Implement #pragma omp fuse
---
clang/docs/OpenMPSupport.rst | 2 +
clang/docs/ReleaseNotes.rst | 1 +
clang/include/clang-c/Index.h | 4 +
clang/include/clang/AST/OpenMPClause.h | 74 +
clang/include/clang/AST/RecursiveASTVisitor.h | 11 +
clang/include/clang/AST/StmtOpenMP.h | 208 +-
.../clang/Basic/DiagnosticSemaKinds.td | 12 +
clang/include/clang/Basic/OpenMPKinds.h | 7 +
clang/include/clang/Basic/StmtNodes.td | 4 +
clang/include/clang/Parse/Parser.h | 3 +
clang/include/clang/Sema/SemaOpenMP.h | 91 +-
.../include/clang/Serialization/ASTBitCodes.h | 1 +
clang/lib/AST/OpenMPClause.cpp | 35 +
clang/lib/AST/StmtOpenMP.cpp | 81 +-
clang/lib/AST/StmtPrinter.cpp | 5 +
clang/lib/AST/StmtProfile.cpp | 16 +
clang/lib/Basic/OpenMPKinds.cpp | 11 +-
clang/lib/CodeGen/CGStmt.cpp | 3 +
clang/lib/CodeGen/CGStmtOpenMP.cpp | 42 +-
clang/lib/CodeGen/CodeGenFunction.h | 1 +
clang/lib/Parse/ParseOpenMP.cpp | 36 +
clang/lib/Sema/SemaExceptionSpec.cpp | 1 +
clang/lib/Sema/SemaOpenMP.cpp | 848 +++++-
clang/lib/Sema/TreeTransform.h | 44 +
clang/lib/Serialization/ASTReader.cpp | 11 +
clang/lib/Serialization/ASTReaderStmt.cpp | 20 +
clang/lib/Serialization/ASTWriter.cpp | 8 +
clang/lib/Serialization/ASTWriterStmt.cpp | 13 +
clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 1 +
clang/test/OpenMP/fuse_ast_print.cpp | 397 +++
clang/test/OpenMP/fuse_codegen.cpp | 2328 +++++++++++++++++
clang/test/OpenMP/fuse_messages.cpp | 209 ++
clang/tools/libclang/CIndex.cpp | 19 +
clang/tools/libclang/CXCursor.cpp | 3 +
flang/include/flang/Lower/OpenMP/Clauses.h | 1 +
flang/include/flang/Parser/dump-parse-tree.h | 1 +
flang/include/flang/Parser/parse-tree.h | 9 +
flang/lib/Lower/OpenMP/Clauses.cpp | 5 +
flang/lib/Parser/openmp-parsers.cpp | 5 +
flang/lib/Parser/unparse.cpp | 7 +
flang/lib/Semantics/check-omp-structure.cpp | 6 +
llvm/include/llvm/Frontend/OpenMP/ClauseT.h | 13 +-
llvm/include/llvm/Frontend/OpenMP/OMP.td | 9 +
.../runtime/test/transform/fuse/foreach.cpp | 191 ++
openmp/runtime/test/transform/fuse/intfor.c | 50 +
.../runtime/test/transform/fuse/iterfor.cpp | 194 ++
.../fuse/parallel-wsloop-collapse-foreach.cpp | 207 ++
.../fuse/parallel-wsloop-collapse-intfor.c | 45 +
48 files changed, 5239 insertions(+), 54 deletions(-)
create mode 100644 clang/test/OpenMP/fuse_ast_print.cpp
create mode 100644 clang/test/OpenMP/fuse_codegen.cpp
create mode 100644 clang/test/OpenMP/fuse_messages.cpp
create mode 100644 openmp/runtime/test/transform/fuse/foreach.cpp
create mode 100644 openmp/runtime/test/transform/fuse/intfor.c
create mode 100644 openmp/runtime/test/transform/fuse/iterfor.cpp
create mode 100644 openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp
create mode 100644 openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c
diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
index f589ba97af1ff..56c04686ca181 100644
--- a/clang/docs/OpenMPSupport.rst
+++ b/clang/docs/OpenMPSupport.rst
@@ -473,6 +473,8 @@ implementation.
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
| loop transformation apply clause | :none:`unclaimed` | :none:`unclaimed` | |
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| loop fuse transformation | :good:`done` | :none:`unclaimed` | |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
| workdistribute construct | | :none:`in progress` | @skc7, @mjklemm |
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
| task_iteration | :none:`unclaimed` | :none:`unclaimed` | |
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index f184eb1068f76..cea4dbfb284b6 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -537,6 +537,7 @@ OpenMP Support
- Properly handle array section/assumed-size array privatization in C/C++.
- Added support for ``variable-category`` modifier in ``default clause``.
- Added support for ``defaultmap`` directive implicit-behavior ``storage``.
+- Added support for 'omp fuse' directive.
Improvements
^^^^^^^^^^^^
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index be038d9165fc6..f13d9c9307b40 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -2162,6 +2162,10 @@ enum CXCursorKind {
*/
CXCursor_OMPStripeDirective = 310,
+ /** OpenMP fuse directive
+ */
+ CXCursor_OMPFuseDirective = 311,
+
/** OpenACC Compute Construct.
*/
CXCursor_OpenACCComputeConstruct = 320,
diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index b2a6d4b9182b0..998bb15c4695e 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -1149,6 +1149,80 @@ class OMPFullClause final : public OMPNoChildClause<llvm::omp::OMPC_full> {
static OMPFullClause *CreateEmpty(const ASTContext &C);
};
+/// This class represents the 'looprange' clause in the
+/// '#pragma omp fuse' directive
+///
+/// \code {c}
+/// #pragma omp fuse looprange(1,2)
+/// {
+/// for(int i = 0; i < 64; ++i)
+/// for(int j = 0; j < 256; j+=2)
+/// for(int k = 127; k >= 0; --k)
+/// \endcode
+class OMPLoopRangeClause final : public OMPClause {
+ friend class OMPClauseReader;
+ /// Location of '('
+ SourceLocation LParenLoc;
+
+ /// Location of first and count expressions
+ SourceLocation FirstLoc, CountLoc;
+
+ /// Number of looprange arguments (always 2: first, count)
+ static constexpr unsigned NumArgs = 2;
+ Stmt *Args[NumArgs] = {nullptr, nullptr};
+
+ /// Set looprange 'first' expression
+ void setFirst(Expr *E) { Args[0] = E; }
+
+ /// Set looprange 'count' expression
+ void setCount(Expr *E) { Args[1] = E; }
+
+ /// Build an empty clause for deserialization.
+ explicit OMPLoopRangeClause()
+ : OMPClause(llvm::omp::OMPC_looprange, {}, {}) {}
+
+public:
+ /// Build a 'looprange' clause AST node.
+ static OMPLoopRangeClause *
+ Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
+ SourceLocation FirstLoc, SourceLocation CountLoc,
+ SourceLocation EndLoc, Expr *First, Expr *Count);
+
+ /// Build an empty 'looprange' clause node.
+ static OMPLoopRangeClause *CreateEmpty(const ASTContext &C);
+
+ // Location getters/setters
+ SourceLocation getLParenLoc() const { return LParenLoc; }
+ SourceLocation getFirstLoc() const { return FirstLoc; }
+ SourceLocation getCountLoc() const { return CountLoc; }
+
+ void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; }
+ void setFirstLoc(SourceLocation Loc) { FirstLoc = Loc; }
+ void setCountLoc(SourceLocation Loc) { CountLoc = Loc; }
+
+ /// Get looprange 'first' expression
+ Expr *getFirst() const { return cast_or_null<Expr>(Args[0]); }
+
+ /// Get looprange 'count' expression
+ Expr *getCount() const { return cast_or_null<Expr>(Args[1]); }
+
+ child_range children() { return child_range(Args, Args + NumArgs); }
+ const_child_range children() const {
+ return const_child_range(Args, Args + NumArgs);
+ }
+
+ child_range used_children() {
+ return child_range(child_iterator(), child_iterator());
+ }
+ const_child_range used_children() const {
+ return const_child_range(const_child_iterator(), const_child_iterator());
+ }
+
+ static bool classof(const OMPClause *T) {
+ return T->getClauseKind() == llvm::omp::OMPC_looprange;
+ }
+};
+
/// Representation of the 'partial' clause of the '#pragma omp unroll'
/// directive.
///
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index c1944487716de..2ec172bbc26d7 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -3170,6 +3170,9 @@ DEF_TRAVERSE_STMT(OMPUnrollDirective,
DEF_TRAVERSE_STMT(OMPReverseDirective,
{ TRY_TO(TraverseOMPExecutableDirective(S)); })
+DEF_TRAVERSE_STMT(OMPFuseDirective,
+ { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
DEF_TRAVERSE_STMT(OMPInterchangeDirective,
{ TRY_TO(TraverseOMPExecutableDirective(S)); })
@@ -3487,6 +3490,14 @@ bool RecursiveASTVisitor<Derived>::VisitOMPFullClause(OMPFullClause *C) {
return true;
}
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPLoopRangeClause(
+ OMPLoopRangeClause *C) {
+ TRY_TO(TraverseStmt(C->getFirst()));
+ TRY_TO(TraverseStmt(C->getCount()));
+ return true;
+}
+
template <typename Derived>
bool RecursiveASTVisitor<Derived>::VisitOMPPartialClause(OMPPartialClause *C) {
TRY_TO(TraverseStmt(C->getFactor()));
diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h
index d9f87f1e49b40..f7e73c9cc8753 100644
--- a/clang/include/clang/AST/StmtOpenMP.h
+++ b/clang/include/clang/AST/StmtOpenMP.h
@@ -21,6 +21,7 @@
#include "clang/AST/StmtCXX.h"
#include "clang/Basic/OpenMPKinds.h"
#include "clang/Basic/SourceLocation.h"
+#include "llvm/Support/Casting.h"
namespace clang {
@@ -677,6 +678,10 @@ class OMPParallelDirective : public OMPExecutableDirective {
}
};
+// Forward declaration of a generic loop transformation. Used in the declaration
+// of OMPLoopBasedDirective.
+class OMPLoopTransformationDirective;
+
/// The base class for all loop-based directives, including loop transformation
/// directives.
class OMPLoopBasedDirective : public OMPExecutableDirective {
@@ -889,24 +894,23 @@ class OMPLoopBasedDirective : public OMPExecutableDirective {
/// Calls the specified callback function for all the loops in \p CurStmt,
/// from the outermost to the innermost.
- static bool doForAllLoops(
- Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops,
- llvm::function_ref<bool(unsigned, Stmt *)> Callback,
- llvm::function_ref<void(OMPCanonicalLoopNestTransformationDirective *)>
- OnTransformationCallback);
+ static bool
+ doForAllLoops(Stmt *CurStmt, bool TryImperfectlyNestedLoops,
+ unsigned NumLoops,
+ llvm::function_ref<bool(unsigned, Stmt *)> Callback,
+ llvm::function_ref<void(OMPLoopTransformationDirective *)>
+ OnTransformationCallback);
static bool
doForAllLoops(const Stmt *CurStmt, bool TryImperfectlyNestedLoops,
unsigned NumLoops,
llvm::function_ref<bool(unsigned, const Stmt *)> Callback,
- llvm::function_ref<
- void(const OMPCanonicalLoopNestTransformationDirective *)>
+ llvm::function_ref<void(const OMPLoopTransformationDirective *)>
OnTransformationCallback) {
auto &&NewCallback = [Callback](unsigned Cnt, Stmt *CurStmt) {
return Callback(Cnt, CurStmt);
};
auto &&NewTransformCb =
- [OnTransformationCallback](
- OMPCanonicalLoopNestTransformationDirective *A) {
+ [OnTransformationCallback](OMPLoopTransformationDirective *A) {
OnTransformationCallback(A);
};
return doForAllLoops(const_cast<Stmt *>(CurStmt), TryImperfectlyNestedLoops,
@@ -919,7 +923,7 @@ class OMPLoopBasedDirective : public OMPExecutableDirective {
doForAllLoops(Stmt *CurStmt, bool TryImperfectlyNestedLoops,
unsigned NumLoops,
llvm::function_ref<bool(unsigned, Stmt *)> Callback) {
- auto &&TransformCb = [](OMPCanonicalLoopNestTransformationDirective *) {};
+ auto &&TransformCb = [](OMPLoopTransformationDirective *) {};
return doForAllLoops(CurStmt, TryImperfectlyNestedLoops, NumLoops, Callback,
TransformCb);
}
@@ -957,9 +961,11 @@ class OMPLoopBasedDirective : public OMPExecutableDirective {
};
/// Common class of data shared between
-/// OMPCanonicalLoopNestTransformationDirective and transformations over
-/// canonical loop sequences.
+/// OMPCanonicalLoopNestTransformationDirective and
+/// OMPCanonicalLoopSequenceTransformationDirective
class OMPLoopTransformationDirective {
+ friend class ASTStmtReader;
+
/// Number of (top-level) generated loops.
/// This value is 1 for most transformations as they only map one loop nest
/// into another.
@@ -969,15 +975,39 @@ class OMPLoopTransformationDirective {
/// generate more than one loop nest, so the value would be >= 1.
unsigned NumGeneratedTopLevelLoops = 1;
+ /// We need this because we cannot easily make OMPLoopTransformationDirective
+ /// a proper Stmt.
+ Stmt *S = nullptr;
+
protected:
void setNumGeneratedTopLevelLoops(unsigned N) {
NumGeneratedTopLevelLoops = N;
}
+ explicit OMPLoopTransformationDirective(Stmt *S) : S(S) {}
+
public:
unsigned getNumGeneratedTopLevelLoops() const {
return NumGeneratedTopLevelLoops;
}
+
+ /// Returns the specific directive related to this loop transformation.
+ Stmt *getDirective() const { return S; }
+
+ /// Get the de-sugared statements after the loop transformation.
+ ///
+ /// Might be nullptr if either the directive generates no loops and is handled
+ /// directly in CodeGen, or resolving a template-dependence context is
+ /// required.
+ Stmt *getTransformedStmt() const;
+
+ /// Return preinits statement.
+ Stmt *getPreInits() const;
+
+ static bool classof(const Stmt *T) {
+ return isa<OMPCanonicalLoopNestTransformationDirective,
+ OMPCanonicalLoopSequenceTransformationDirective>(T);
+ }
};
/// The base class for all transformation directives of canonical loop nests.
@@ -990,7 +1020,8 @@ class OMPCanonicalLoopNestTransformationDirective
explicit OMPCanonicalLoopNestTransformationDirective(
StmtClass SC, OpenMPDirectiveKind Kind, SourceLocation StartLoc,
SourceLocation EndLoc, unsigned NumAssociatedLoops)
- : OMPLoopBasedDirective(SC, Kind, StartLoc, EndLoc, NumAssociatedLoops) {}
+ : OMPLoopBasedDirective(SC, Kind, StartLoc, EndLoc, NumAssociatedLoops),
+ OMPLoopTransformationDirective(this) {}
public:
/// Return the number of associated (consumed) loops.
@@ -5928,6 +5959,124 @@ class OMPInterchangeDirective final
}
};
+/// The base class for all transformation directives of canonical loop
+/// sequences (currently only 'fuse')
+class OMPCanonicalLoopSequenceTransformationDirective
+ : public OMPExecutableDirective,
+ public OMPLoopTransformationDirective {
+ friend class ASTStmtReader;
+
+ /// Number of loops contained in the canonical loop sequence.
+ unsigned NumLoopsInSequence = 1;
+
+protected:
+ explicit OMPCanonicalLoopSequenceTransformationDirective(
+ StmtClass SC, OpenMPDirectiveKind Kind, SourceLocation StartLoc,
+ SourceLocation EndLoc, unsigned NumLoopsInSequence)
+ : OMPExecutableDirective(SC, Kind, StartLoc, EndLoc),
+ OMPLoopTransformationDirective(this),
+ NumLoopsInSequence(NumLoopsInSequence) {}
+
+public:
+ /// Returns the number of canonical loop nests contained in this sequence.
+ unsigned getNumLoopsInSequence() const { return NumLoopsInSequence; }
+
+ /// Get the de-sugared statements after the loop transformation.
+ ///
+ /// Might be nullptr if either the directive generates no loops and is handled
+ /// directly in CodeGen, or resolving a template-dependence context is
+ /// required.
+ Stmt *getTransformedStmt() const;
+
+ /// Return preinits statement.
+ Stmt *getPreInits() const;
+
+ static bool classof(const Stmt *T) {
+ Stmt::StmtClass C = T->getStmtClass();
+ return C == OMPFuseDirectiveClass;
+ }
+};
+
+/// Represents the '#pragma omp fuse' loop transformation directive
+///
+/// \code{c}
+/// #pragma omp fuse
+/// {
+/// for(int i = 0; i < m1; ++i) {...}
+/// for(int j = 0; j < m2; ++j) {...}
+/// ...
+/// }
+/// \endcode
+class OMPFuseDirective final
+ : public OMPCanonicalLoopSequenceTransformationDirective {
+ friend class ASTStmtReader;
+ friend class OMPExecutableDirective;
+
+ // Offsets of child members.
+ enum {
+ PreInitsOffset = 0,
+ TransformedStmtOffset,
+ };
+
+ unsigned NumGeneratedLoopNests = 1;
+
+ explicit OMPFuseDirective(SourceLocation StartLoc, SourceLocation EndLoc,
+ unsigned NumLoopsInSequence)
+ : OMPCanonicalLoopSequenceTransformationDirective(
+ OMPFuseDirectiveClass, llvm::omp::OMPD_fuse, StartLoc, EndLoc,
+ NumLoopsInSequence) {}
+
+ void setPreInits(Stmt *PreInits) {
+ Data->getChildren()[PreInitsOffset] = PreInits;
+ }
+
+ void setTransformedStmt(Stmt *S) {
+ Data->getChildren()[TransformedStmtOffset] = S;
+ }
+
+public:
+ /// Create a new AST node representation for #pragma omp fuse'
+ ///
+ /// \param C Context of the AST
+ /// \param StartLoc Location of the introducer (e.g the 'omp' token)
+ /// \param EndLoc Location of the directive's end (e.g the tok::eod)
+ /// \param Clauses The directive's clauses
+ /// \param NumLoops Total number of loops in the canonical loop sequence.
+ /// \param NumGeneratedTopLevelLoops Number of top-level generated loops.
+ // Typically 1 but looprange clause can
+ // change this.
+ /// \param AssociatedStmt The outermost associated loop
+ /// \param TransformedStmt The loop nest after fusion, or nullptr in
+ /// dependent
+ /// \param PreInits Helper preinits statements for the loop nest
+ static OMPFuseDirective *
+ Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+ ArrayRef<OMPClause *> Clauses, unsigned NumLoops,
+ unsigned NumGeneratedTopLevelLoops, Stmt *AssociatedStmt,
+ Stmt *TransformedStmt, Stmt *PreInits);
+
+ /// Build an empty '#pragma omp fuse' AST node for deserialization
+ ///
+ /// \param C Context of the AST
+ /// \param NumClauses Number of clauses to allocate
+ /// \param NumLoops Number of top level loops to allocate
+ static OMPFuseDirective *CreateEmpty(const ASTContext &C, unsigned NumClauses,
+ unsigned NumLoops);
+
+ /// Gets the associated loops after the transformation. This is the de-sugared
+ /// replacement or nulltpr in dependent contexts.
+ Stmt *getTransformedStmt() const {
+ return Data->getChildren()[TransformedStmtOffset];
+ }
+
+ /// Return preinits statement.
+ Stmt *getPreInits() const { return Data->getChildren()[PreInitsOffset]; }
+
+ static bool classof(const Stmt *T) {
+ return T->getStmtClass() == OMPFuseDirectiveClass;
+ }
+};
+
/// This represents '#pragma omp scan' directive.
///
/// \code
@@ -6596,4 +6745,37 @@ class OMPAssumeDirective final : public OMPExecutableDirective {
} // end namespace clang
+namespace llvm {
+// Allow a Stmt* be casted correctly to an OMPLoopTransformationDirective*.
+// The default routines would just use a C-style cast which won't work well
+// for the multiple inheritance here. We have to use a static cast from the
+// corresponding subclass.
+template <>
+struct CastInfo<clang::OMPLoopTransformationDirective, clang::Stmt *>
+ : public NullableValueCastFailed<clang::OMPLoopTransformationDirective *>,
+ public DefaultDoCastIfPossible<
+ clang::OMPLoopTransformationDirective *, clang::Stmt *,
+ CastInfo<clang::OMPLoopTransformationDirective, clang::Stmt *>> {
+ static bool isPossible(const clang::Stmt *T) {
+ return clang::OMPLoopTransformationDirective::classof(T);
+ }
+
+ static clang::OMPLoopTransformationDirective *doCast(clang::Stmt *T) {
+ if (auto *D =
+ dyn_cast<clang::OMPCanonicalLoopNestTransformationDirective>(T))
+ return static_cast<clang::OMPLoopTransformationDirective *>(D);
+ if (auto *D =
+ dyn_cast<clang::OMPCanonicalLoopSequenceTransformationDirective>(T))
+ return static_cast<clang::OMPLoopTransformationDirective *>(D);
+ llvm_unreachable("unexpected type");
+ }
+};
+template <>
+struct CastInfo<clang::OMPLoopTransformationDirective, const clang::Stmt *>
+ : public ConstStrippingForwardingCast<
+ clang::OMPLoopTransformationDirective, const clang::Stmt *,
+ CastInfo<clang::OMPLoopTransformationDirective, clang::Stmt *>> {};
+
+} // namespace llvm
+
#endif
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 409a8202d8a09..0bab41d30437f 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -11737,6 +11737,18 @@ def note_omp_implicit_dsa : Note<
"implicitly determined as %0">;
def err_omp_loop_var_dsa : Error<
"loop iteration variable in the associated loop of 'omp %1' directive may not be %0, predetermined as %2">;
+def err_omp_not_a_loop_sequence
+ : Error<"statement after '#pragma omp %0' must be a loop sequence "
+ "containing canonical loops or loop-generating constructs">;
+def err_omp_empty_loop_sequence
+ : Error<"loop sequence after '#pragma omp %0' must contain at least 1 "
+ "canonical loop or loop-generating construct">;
+def err_omp_invalid_looprange
+ : Error<"looprange clause selects loops from %1 to %2 but this exceeds the "
+ "number of loops (%3) in the loop sequence">;
+def warn_omp_redundant_fusion : Warning<"looprange clause selects a single "
+ "loop, resulting in redundant fusion">,
+ InGroup<OpenMPClauses>;
def err_omp_not_for : Error<
"%select{statement after '#pragma omp %1' must be a for loop|"
"expected %2 for loops after '#pragma omp %1'%select{|, but found only %4}3}0">;
diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h
index 115af7b19d6e4..2fffff7e27dad 100644
--- a/clang/include/clang/Basic/OpenMPKinds.h
+++ b/clang/include/clang/Basic/OpenMPKinds.h
@@ -383,6 +383,13 @@ bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind);
bool isOpenMPCanonicalLoopNestTransformationDirective(
OpenMPDirectiveKind DKind);
+/// Checks if the specified directive is a loop transformation directive that
+/// applies to a canonical loop sequence.
+/// \param DKind Specified directive.
+/// \return True iff the directive is a loop transformation.
+bool isOpenMPCanonicalLoopSequenceTransformationDirective(
+ OpenMPDirectiveKind DKind);
+
/// Checks if the specified directive is a loop transformation directive.
/// \param DKind Specified directive.
/// \return True iff the directive is a loop transformation.
diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td
index dd1a24405fae7..bf3686bb372d5 100644
--- a/clang/include/clang/Basic/StmtNodes.td
+++ b/clang/include/clang/Basic/StmtNodes.td
@@ -238,6 +238,10 @@ def OMPUnrollDirective : StmtNode<OMPCanonicalLoopNestTransformationDirective>;
def OMPReverseDirective : StmtNode<OMPCanonicalLoopNestTransformationDirective>;
def OMPInterchangeDirective
: StmtNode<OMPCanonicalLoopNestTransformationDirective>;
+def OMPCanonicalLoopSequenceTransformationDirective
+ : StmtNode<OMPExecutableDirective, 1>;
+def OMPFuseDirective
+ : StmtNode<OMPCanonicalLoopSequenceTransformationDirective>;
def OMPForDirective : StmtNode<OMPLoopDirective>;
def OMPForSimdDirective : StmtNode<OMPLoopDirective>;
def OMPSectionsDirective : StmtNode<OMPExecutableDirective>;
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 30edd303e1824..e301cf1080977 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -6767,6 +6767,9 @@ class Parser : public CodeCompletionHandler {
OpenMPClauseKind Kind,
bool ParseOnly);
+ /// Parses the 'looprange' clause of a '#pragma omp fuse' directive.
+ OMPClause *ParseOpenMPLoopRangeClause();
+
/// Parses the 'sizes' clause of a '#pragma omp tile' directive.
OMPClause *ParseOpenMPSizesClause();
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index 23827051ed724..07401b21d5e6c 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -457,6 +457,13 @@ class SemaOpenMP : public SemaBase {
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc);
+
+ /// Called on well-formed '#pragma omp fuse' after parsing of its
+ /// clauses and the associated statement.
+ StmtResult ActOnOpenMPFuseDirective(ArrayRef<OMPClause *> Clauses,
+ Stmt *AStmt, SourceLocation StartLoc,
+ SourceLocation EndLoc);
+
/// Called on well-formed '\#pragma omp for' after parsing
/// of the associated statement.
StmtResult
@@ -915,6 +922,12 @@ class SemaOpenMP : public SemaBase {
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc);
+
+ /// Called on well-form 'looprange' clause after parsing its arguments.
+ OMPClause *
+ ActOnOpenMPLoopRangeClause(Expr *First, Expr *Count, SourceLocation StartLoc,
+ SourceLocation LParenLoc, SourceLocation FirstLoc,
+ SourceLocation CountLoc, SourceLocation EndLoc);
/// Called on well-formed 'ordered' clause.
OMPClause *
ActOnOpenMPOrderedClause(SourceLocation StartLoc, SourceLocation EndLoc,
@@ -1479,7 +1492,83 @@ class SemaOpenMP : public SemaBase {
bool checkTransformableLoopNest(
OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops,
SmallVectorImpl<OMPLoopBasedDirective::HelperExprs> &LoopHelpers,
- Stmt *&Body, SmallVectorImpl<SmallVector<Stmt *, 0>> &OriginalInits);
+ Stmt *&Body, SmallVectorImpl<SmallVector<Stmt *>> &OriginalInits);
+
+ /// Categories of loops encountered during semantic OpenMP loop
+ /// analysis.
+ ///
+ /// This enumeration identifies the structural category of a loop or sequence
+ /// of loops analyzed in the context of OpenMP transformations and directives.
+ /// This categorization helps differentiate between original source loops
+ /// and the structures resulting from applying OpenMP loop transformations.
+ enum class OMPLoopCategory {
+ /// @var OMPLoopCategory::RegularLoop
+ /// Represents a standard canonical loop nest found in the
+ /// original source code or an intact loop after transformations
+ /// (i.e Post/Pre loops of a loopranged fusion).
+ RegularLoop,
+
+ /// @var OMPLoopCategory::TransformSingleLoop
+ /// Represents the resulting loop structure when an OpenMP loop
+ /// transformation, generates a single, top-level loop.
+ TransformSingleLoop,
+ };
+
+ /// Holds the result of the analysis of a (possibly canonical) loop.
+ struct LoopAnalysis {
+ /// Category of the analyzed loop.
+ OMPLoopCategory Category;
+ /// Loop analyses results.
+ OMPLoopBasedDirective::HelperExprs HelperExprs;
+ /// The for-statement of the loop.
+ Stmt *ForStmt;
+ /// Initialization statements before transformations.
+ SmallVector<Stmt *> OriginalInits;
+ /// Initialization statements required after transformation of this loop.
+ SmallVector<Stmt *> TransformsPreInits;
+
+ explicit LoopAnalysis(OMPLoopCategory Category) : Category(Category) {}
+ };
+
+ /// Holds the result of the analysis of a (possibly canonical) loop sequence.
+ struct LoopSequenceAnalysis {
+ /// Number of top level canonical loops.
+ unsigned LoopSeqSize = 0;
+ /// For each loop results of the analysis.
+ SmallVector<LoopAnalysis, 2> Loops;
+ /// Additional code required before entering the transformed loop sequence.
+ SmallVector<Stmt *> LoopSequencePreInits;
+ };
+
+ /// The main recursive process of `checkTransformableLoopSequence` that
+ /// performs grammatical parsing of a canonical loop sequence. It extracts
+ /// key information, such as the number of top-level loops, loop statements,
+ /// helper expressions, and other relevant loop-related data, all in a single
+ /// execution to avoid redundant traversals. This analysis flattens inner
+ /// Loop Sequences
+ ///
+ /// \param LoopSeqStmt The AST of the original statement.
+ /// \param SeqAnalysis [out] Result of the analysis of \p LoopSeqStmt
+ /// \param Context
+ /// \param Kind The loop transformation directive kind.
+ /// \return Whether the original statement is both syntactically and
+ /// semantically correct according to OpenMP 6.0 canonical loop
+ /// sequence definition.
+ bool analyzeLoopSequence(Stmt *LoopSeqStmt, LoopSequenceAnalysis &SeqAnalysis,
+ ASTContext &Context, OpenMPDirectiveKind Kind);
+
+ /// Validates and checks whether a loop sequence can be transformed according
+ /// to the given directive, providing necessary setup and initialization
+ /// (Driver function) before recursion using `analyzeLoopSequence`.
+ ///
+ /// \param Kind The loop transformation directive kind.
+ /// \param AStmt The AST of the original statement
+ /// \param SeqAnalysis [out] Result of the analysis of \p LoopSeqStmt
+ /// \param Context
+ /// \return Whether there was an absence of errors or not
+ bool checkTransformableLoopSequence(OpenMPDirectiveKind Kind, Stmt *AStmt,
+ LoopSequenceAnalysis &SeqAnalysis,
+ ASTContext &Context);
/// Helper to keep information about the current `omp begin/end declare
/// variant` nesting.
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index 441047d64f48c..99864c7373908 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -1951,6 +1951,7 @@ enum StmtCode {
STMT_OMP_UNROLL_DIRECTIVE,
STMT_OMP_REVERSE_DIRECTIVE,
STMT_OMP_INTERCHANGE_DIRECTIVE,
+ STMT_OMP_FUSE_DIRECTIVE,
STMT_OMP_FOR_DIRECTIVE,
STMT_OMP_FOR_SIMD_DIRECTIVE,
STMT_OMP_SECTIONS_DIRECTIVE,
diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index 69d33019c0952..437033b611b87 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -1021,6 +1021,26 @@ OMPPartialClause *OMPPartialClause::CreateEmpty(const ASTContext &C) {
return new (C) OMPPartialClause();
}
+OMPLoopRangeClause *
+OMPLoopRangeClause::Create(const ASTContext &C, SourceLocation StartLoc,
+ SourceLocation LParenLoc, SourceLocation FirstLoc,
+ SourceLocation CountLoc, SourceLocation EndLoc,
+ Expr *First, Expr *Count) {
+ OMPLoopRangeClause *Clause = CreateEmpty(C);
+ Clause->setLocStart(StartLoc);
+ Clause->setLParenLoc(LParenLoc);
+ Clause->setFirstLoc(FirstLoc);
+ Clause->setCountLoc(CountLoc);
+ Clause->setLocEnd(EndLoc);
+ Clause->setFirst(First);
+ Clause->setCount(Count);
+ return Clause;
+}
+
+OMPLoopRangeClause *OMPLoopRangeClause::CreateEmpty(const ASTContext &C) {
+ return new (C) OMPLoopRangeClause();
+}
+
OMPAllocateClause *OMPAllocateClause::Create(
const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
Expr *Allocator, Expr *Alignment, SourceLocation ColonLoc,
@@ -1890,6 +1910,21 @@ void OMPClausePrinter::VisitOMPPartialClause(OMPPartialClause *Node) {
}
}
+void OMPClausePrinter::VisitOMPLoopRangeClause(OMPLoopRangeClause *Node) {
+ OS << "looprange";
+
+ Expr *First = Node->getFirst();
+ Expr *Count = Node->getCount();
+
+ if (First && Count) {
+ OS << "(";
+ First->printPretty(OS, nullptr, Policy, 0);
+ OS << ",";
+ Count->printPretty(OS, nullptr, Policy, 0);
+ OS << ")";
+ }
+}
+
void OMPClausePrinter::VisitOMPAllocatorClause(OMPAllocatorClause *Node) {
OS << "allocator(";
Node->getAllocator()->printPretty(OS, nullptr, Policy, 0);
diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp
index 1f6586f95a9f8..5ba6a52650bd4 100644
--- a/clang/lib/AST/StmtOpenMP.cpp
+++ b/clang/lib/AST/StmtOpenMP.cpp
@@ -125,13 +125,12 @@ OMPLoopBasedDirective::tryToFindNextInnerLoop(Stmt *CurStmt,
bool OMPLoopBasedDirective::doForAllLoops(
Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops,
llvm::function_ref<bool(unsigned, Stmt *)> Callback,
- llvm::function_ref<void(OMPCanonicalLoopNestTransformationDirective *)>
+ llvm::function_ref<void(OMPLoopTransformationDirective *)>
OnTransformationCallback) {
CurStmt = CurStmt->IgnoreContainers();
for (unsigned Cnt = 0; Cnt < NumLoops; ++Cnt) {
while (true) {
- auto *Dir =
- dyn_cast<OMPCanonicalLoopNestTransformationDirective>(CurStmt);
+ auto *Dir = dyn_cast<OMPLoopTransformationDirective>(CurStmt);
if (!Dir)
break;
@@ -371,6 +370,26 @@ OMPForDirective *OMPForDirective::Create(
return Dir;
}
+Stmt *OMPLoopTransformationDirective::getTransformedStmt() const {
+ if (auto *D = dyn_cast<OMPCanonicalLoopNestTransformationDirective>(S)) {
+ return D->getTransformedStmt();
+ }
+ if (auto *D = dyn_cast<OMPCanonicalLoopSequenceTransformationDirective>(S)) {
+ return D->getTransformedStmt();
+ }
+ llvm_unreachable("unexpected object type");
+}
+
+Stmt *OMPLoopTransformationDirective::getPreInits() const {
+ if (auto *D = dyn_cast<OMPCanonicalLoopNestTransformationDirective>(S)) {
+ return D->getPreInits();
+ }
+ if (auto *D = dyn_cast<OMPCanonicalLoopSequenceTransformationDirective>(S)) {
+ return D->getPreInits();
+ }
+ llvm_unreachable("unexpected object type");
+}
+
Stmt *OMPCanonicalLoopNestTransformationDirective::getTransformedStmt() const {
switch (getStmtClass()) {
#define STMT(CLASS, PARENT)
@@ -380,7 +399,7 @@ Stmt *OMPCanonicalLoopNestTransformationDirective::getTransformedStmt() const {
return static_cast<const CLASS *>(this)->getTransformedStmt();
#include "clang/AST/StmtNodes.inc"
default:
- llvm_unreachable("Not a loop transformation");
+ llvm_unreachable("Not a loop transformation for canonical loop nests");
}
}
@@ -393,7 +412,34 @@ Stmt *OMPCanonicalLoopNestTransformationDirective::getPreInits() const {
return static_cast<const CLASS *>(this)->getPreInits();
#include "clang/AST/StmtNodes.inc"
default:
- llvm_unreachable("Not a loop transformation");
+ llvm_unreachable("Not a loop transformation for canonical loop nests");
+ }
+}
+
+Stmt *
+OMPCanonicalLoopSequenceTransformationDirective::getTransformedStmt() const {
+ switch (getStmtClass()) {
+#define STMT(CLASS, PARENT)
+#define ABSTRACT_STMT(CLASS)
+#define OMPCANONICALLOOPSEQUENCETRANSFORMATIONDIRECTIVE(CLASS, PARENT) \
+ case Stmt::CLASS##Class: \
+ return static_cast<const CLASS *>(this)->getTransformedStmt();
+#include "clang/AST/StmtNodes.inc"
+ default:
+ llvm_unreachable("Not a loop transformation for canonical loop sequences");
+ }
+}
+
+Stmt *OMPCanonicalLoopSequenceTransformationDirective::getPreInits() const {
+ switch (getStmtClass()) {
+#define STMT(CLASS, PARENT)
+#define ABSTRACT_STMT(CLASS)
+#define OMPCANONICALLOOPSEQUENCETRANSFORMATIONDIRECTIVE(CLASS, PARENT) \
+ case Stmt::CLASS##Class: \
+ return static_cast<const CLASS *>(this)->getPreInits();
+#include "clang/AST/StmtNodes.inc"
+ default:
+ llvm_unreachable("Not a loop transformation for canonical loop sequences");
}
}
@@ -510,6 +556,31 @@ OMPInterchangeDirective::CreateEmpty(const ASTContext &C, unsigned NumClauses,
SourceLocation(), SourceLocation(), NumLoops);
}
+OMPFuseDirective *
+OMPFuseDirective::Create(const ASTContext &C, SourceLocation StartLoc,
+ SourceLocation EndLoc, ArrayRef<OMPClause *> Clauses,
+ unsigned NumLoops, unsigned NumGeneratedTopLevelLoops,
+ Stmt *AssociatedStmt, Stmt *TransformedStmt,
+ Stmt *PreInits) {
+
+ OMPFuseDirective *Dir = createDirective<OMPFuseDirective>(
+ C, Clauses, AssociatedStmt, TransformedStmtOffset + 1, StartLoc, EndLoc,
+ NumLoops);
+ Dir->setTransformedStmt(TransformedStmt);
+ Dir->setPreInits(PreInits);
+ Dir->setNumGeneratedTopLevelLoops(NumGeneratedTopLevelLoops);
+ return Dir;
+}
+
+OMPFuseDirective *OMPFuseDirective::CreateEmpty(const ASTContext &C,
+ unsigned NumClauses,
+ unsigned NumLoops) {
+ OMPFuseDirective *Dir = createEmptyDirective<OMPFuseDirective>(
+ C, NumClauses, /*HasAssociatedStmt=*/true, TransformedStmtOffset + 1,
+ SourceLocation(), SourceLocation(), NumLoops);
+ return Dir;
+}
+
OMPForSimdDirective *
OMPForSimdDirective::Create(const ASTContext &C, SourceLocation StartLoc,
SourceLocation EndLoc, unsigned CollapsedNum,
diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
index 0030300521128..3bf146f3663ad 100644
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -795,6 +795,11 @@ void StmtPrinter::VisitOMPInterchangeDirective(OMPInterchangeDirective *Node) {
PrintOMPExecutableDirective(Node);
}
+void StmtPrinter::VisitOMPFuseDirective(OMPFuseDirective *Node) {
+ Indent() << "#pragma omp fuse";
+ PrintOMPExecutableDirective(Node);
+}
+
void StmtPrinter::VisitOMPForDirective(OMPForDirective *Node) {
Indent() << "#pragma omp for";
PrintOMPExecutableDirective(Node);
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 37c4d43ec0b2f..1a6607066bbfe 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -510,6 +510,13 @@ void OMPClauseProfiler::VisitOMPPartialClause(const OMPPartialClause *C) {
Profiler->VisitExpr(Factor);
}
+void OMPClauseProfiler::VisitOMPLoopRangeClause(const OMPLoopRangeClause *C) {
+ if (const Expr *First = C->getFirst())
+ Profiler->VisitExpr(First);
+ if (const Expr *Count = C->getCount())
+ Profiler->VisitExpr(Count);
+}
+
void OMPClauseProfiler::VisitOMPAllocatorClause(const OMPAllocatorClause *C) {
if (C->getAllocator())
Profiler->VisitStmt(C->getAllocator());
@@ -1025,6 +1032,15 @@ void StmtProfiler::VisitOMPInterchangeDirective(
VisitOMPCanonicalLoopNestTransformationDirective(S);
}
+void StmtProfiler::VisitOMPCanonicalLoopSequenceTransformationDirective(
+ const OMPCanonicalLoopSequenceTransformationDirective *S) {
+ VisitOMPExecutableDirective(S);
+}
+
+void StmtProfiler::VisitOMPFuseDirective(const OMPFuseDirective *S) {
+ VisitOMPCanonicalLoopSequenceTransformationDirective(S);
+}
+
void StmtProfiler::VisitOMPForDirective(const OMPForDirective *S) {
VisitOMPLoopDirective(S);
}
diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp
index 73daf0f40ef44..eb150fb937f42 100644
--- a/clang/lib/Basic/OpenMPKinds.cpp
+++ b/clang/lib/Basic/OpenMPKinds.cpp
@@ -280,6 +280,7 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str,
case OMPC_affinity:
case OMPC_when:
case OMPC_append_args:
+ case OMPC_looprange:
break;
default:
break;
@@ -624,6 +625,7 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind,
case OMPC_affinity:
case OMPC_when:
case OMPC_append_args:
+ case OMPC_looprange:
break;
default:
break;
@@ -747,9 +749,14 @@ bool clang::isOpenMPCanonicalLoopNestTransformationDirective(
DKind == OMPD_interchange || DKind == OMPD_stripe;
}
+bool clang::isOpenMPCanonicalLoopSequenceTransformationDirective(
+ OpenMPDirectiveKind DKind) {
+ return DKind == OMPD_fuse;
+}
+
bool clang::isOpenMPLoopTransformationDirective(OpenMPDirectiveKind DKind) {
- // FIXME: There will be more cases when we implement 'fuse'.
- return isOpenMPCanonicalLoopNestTransformationDirective(DKind);
+ return isOpenMPCanonicalLoopNestTransformationDirective(DKind) ||
+ isOpenMPCanonicalLoopSequenceTransformationDirective(DKind);
}
bool clang::isOpenMPCombinedParallelADirective(OpenMPDirectiveKind DKind) {
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index aeff73d525c10..79ab728b4e15c 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -234,6 +234,9 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef<const Attr *> Attrs) {
case Stmt::OMPInterchangeDirectiveClass:
EmitOMPInterchangeDirective(cast<OMPInterchangeDirective>(*S));
break;
+ case Stmt::OMPFuseDirectiveClass:
+ EmitOMPFuseDirective(cast<OMPFuseDirective>(*S));
+ break;
case Stmt::OMPForDirectiveClass:
EmitOMPForDirective(cast<OMPForDirective>(*S));
break;
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index d72cd8fbfd608..496db9866f092 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -201,6 +201,24 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope {
} else {
llvm_unreachable("Unknown loop-based directive kind.");
}
+ doEmitPreinits(PreInits);
+ PreCondVars.restore(CGF);
+ }
+
+ void
+ emitPreInitStmt(CodeGenFunction &CGF,
+ const OMPCanonicalLoopSequenceTransformationDirective &S) {
+ const Stmt *PreInits;
+ if (const auto *Fuse = dyn_cast<OMPFuseDirective>(&S)) {
+ PreInits = Fuse->getPreInits();
+ } else {
+ llvm_unreachable(
+ "Unknown canonical loop sequence transform directive kind.");
+ }
+ doEmitPreinits(PreInits);
+ }
+
+ void doEmitPreinits(const Stmt *PreInits) {
if (PreInits) {
// CompoundStmts and DeclStmts are used as lists of PreInit statements and
// declarations. Since declarations must be visible in the the following
@@ -222,7 +240,6 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope {
CGF.EmitStmt(S);
}
}
- PreCondVars.restore(CGF);
}
public:
@@ -230,6 +247,11 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope {
: CodeGenFunction::RunCleanupsScope(CGF) {
emitPreInitStmt(CGF, S);
}
+ OMPLoopScope(CodeGenFunction &CGF,
+ const OMPCanonicalLoopSequenceTransformationDirective &S)
+ : CodeGenFunction::RunCleanupsScope(CGF) {
+ emitPreInitStmt(CGF, S);
+ }
};
class OMPSimdLexicalScope : public CodeGenFunction::LexicalScope {
@@ -1921,6 +1943,15 @@ class OMPTransformDirectiveScopeRAII {
CGSI = new CodeGenFunction::CGCapturedStmtInfo(CR_OpenMP);
CapInfoRAII = new CodeGenFunction::CGCapturedStmtRAII(CGF, CGSI);
}
+ if (const auto *Dir =
+ dyn_cast<OMPCanonicalLoopSequenceTransformationDirective>(S)) {
+ // For simplicity we reuse the loop scope similarly to what we do with
+ // OMPCanonicalLoopNestTransformationDirective do by being a subclass
+ // of OMPLoopBasedDirective.
+ Scope = new OMPLoopScope(CGF, *Dir);
+ CGSI = new CodeGenFunction::CGCapturedStmtInfo(CR_OpenMP);
+ CapInfoRAII = new CodeGenFunction::CGCapturedStmtRAII(CGF, CGSI);
+ }
}
~OMPTransformDirectiveScopeRAII() {
if (!Scope)
@@ -1948,8 +1979,7 @@ static void emitBody(CodeGenFunction &CGF, const Stmt *S, const Stmt *NextLoop,
return;
}
if (SimplifiedS == NextLoop) {
- if (auto *Dir =
- dyn_cast<OMPCanonicalLoopNestTransformationDirective>(SimplifiedS))
+ if (auto *Dir = dyn_cast<OMPLoopTransformationDirective>(SimplifiedS))
SimplifiedS = Dir->getTransformedStmt();
if (const auto *CanonLoop = dyn_cast<OMPCanonicalLoop>(SimplifiedS))
SimplifiedS = CanonLoop->getLoopStmt();
@@ -2944,6 +2974,12 @@ void CodeGenFunction::EmitOMPInterchangeDirective(
EmitStmt(S.getTransformedStmt());
}
+void CodeGenFunction::EmitOMPFuseDirective(const OMPFuseDirective &S) {
+ // Emit the de-sugared statement
+ OMPTransformDirectiveScopeRAII FuseScope(*this, &S);
+ EmitStmt(S.getTransformedStmt());
+}
+
void CodeGenFunction::EmitOMPUnrollDirective(const OMPUnrollDirective &S) {
bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIRBuilder;
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 727487b46054f..f0565c1de04c4 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3861,6 +3861,7 @@ class CodeGenFunction : public CodeGenTypeCache {
void EmitOMPUnrollDirective(const OMPUnrollDirective &S);
void EmitOMPReverseDirective(const OMPReverseDirective &S);
void EmitOMPInterchangeDirective(const OMPInterchangeDirective &S);
+ void EmitOMPFuseDirective(const OMPFuseDirective &S);
void EmitOMPForDirective(const OMPForDirective &S);
void EmitOMPForSimdDirective(const OMPForSimdDirective &S);
void EmitOMPScopeDirective(const OMPScopeDirective &S);
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 7dceb2d208352..f455d188babca 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2937,6 +2937,39 @@ OMPClause *Parser::ParseOpenMPSizesClause() {
OpenLoc, CloseLoc);
}
+OMPClause *Parser::ParseOpenMPLoopRangeClause() {
+ SourceLocation ClauseNameLoc = ConsumeToken();
+ SourceLocation FirstLoc, CountLoc;
+
+ BalancedDelimiterTracker T(*this, tok::l_paren, tok::annot_pragma_openmp_end);
+ if (T.consumeOpen()) {
+ Diag(Tok, diag::err_expected) << tok::l_paren;
+ return nullptr;
+ }
+
+ FirstLoc = Tok.getLocation();
+ ExprResult FirstVal = ParseConstantExpression();
+ if (!FirstVal.isUsable()) {
+ T.skipToEnd();
+ return nullptr;
+ }
+
+ ExpectAndConsume(tok::comma);
+
+ CountLoc = Tok.getLocation();
+ ExprResult CountVal = ParseConstantExpression();
+ if (!CountVal.isUsable()) {
+ T.skipToEnd();
+ return nullptr;
+ }
+
+ T.consumeClose();
+
+ return Actions.OpenMP().ActOnOpenMPLoopRangeClause(
+ FirstVal.get(), CountVal.get(), ClauseNameLoc, T.getOpenLocation(),
+ FirstLoc, CountLoc, T.getCloseLocation());
+}
+
OMPClause *Parser::ParseOpenMPPermutationClause() {
SourceLocation ClauseNameLoc, OpenLoc, CloseLoc;
SmallVector<Expr *> ArgExprs;
@@ -3366,6 +3399,9 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
}
Clause = ParseOpenMPClause(CKind, WrongDirective);
break;
+ case OMPC_looprange:
+ Clause = ParseOpenMPLoopRangeClause();
+ break;
default:
break;
}
diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp
index 552c92996dc2e..a0483c3027199 100644
--- a/clang/lib/Sema/SemaExceptionSpec.cpp
+++ b/clang/lib/Sema/SemaExceptionSpec.cpp
@@ -1493,6 +1493,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) {
case Stmt::OMPUnrollDirectiveClass:
case Stmt::OMPReverseDirectiveClass:
case Stmt::OMPInterchangeDirectiveClass:
+ case Stmt::OMPFuseDirectiveClass:
case Stmt::OMPSingleDirectiveClass:
case Stmt::OMPTargetDataDirectiveClass:
case Stmt::OMPTargetDirectiveClass:
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 58dcea7aad690..fc29acb4e0f14 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -4470,6 +4470,7 @@ void SemaOpenMP::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind,
case OMPD_unroll:
case OMPD_reverse:
case OMPD_interchange:
+ case OMPD_fuse:
case OMPD_assume:
break;
default:
@@ -6311,6 +6312,10 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
Res = ActOnOpenMPInterchangeDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc);
break;
+ case OMPD_fuse:
+ Res =
+ ActOnOpenMPFuseDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc);
+ break;
case OMPD_for:
Res = ActOnOpenMPForDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc,
VarsWithInheritedDSA);
@@ -9388,7 +9393,9 @@ static bool checkOpenMPIterationSpace(
// sharing attributes.
VarsWithImplicitDSA.erase(LCDecl);
- assert(isOpenMPLoopDirective(DKind) && "DSA for non-loop vars");
+ assert((isOpenMPLoopDirective(DKind) ||
+ isOpenMPCanonicalLoopSequenceTransformationDirective(DKind)) &&
+ "DSA for non-loop vars");
// Check test-expr.
HasErrors |= ISC.checkAndSetCond(For ? For->getCond() : CXXFor->getCond());
@@ -9816,7 +9823,8 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
unsigned NumLoops = std::max(OrderedLoopCount, NestedLoopCount);
SmallVector<LoopIterationSpace, 4> IterSpaces(NumLoops);
if (!OMPLoopBasedDirective::doForAllLoops(
- AStmt->IgnoreContainers(!isOpenMPLoopTransformationDirective(DKind)),
+ AStmt->IgnoreContainers(
+ !isOpenMPCanonicalLoopNestTransformationDirective(DKind)),
SupportsNonPerfectlyNested, NumLoops,
[DKind, &SemaRef, &DSA, NumLoops, NestedLoopCount,
CollapseLoopCountExpr, OrderedLoopCountExpr, &VarsWithImplicitDSA,
@@ -9838,8 +9846,7 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
}
return false;
},
- [&SemaRef,
- &Captures](OMPCanonicalLoopNestTransformationDirective *Transform) {
+ [&SemaRef, &Captures](OMPLoopTransformationDirective *Transform) {
Stmt *DependentPreInits = Transform->getPreInits();
if (!DependentPreInits)
return;
@@ -9854,7 +9861,8 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
auto *D = cast<VarDecl>(C);
DeclRefExpr *Ref = buildDeclRefExpr(
SemaRef, D, D->getType().getNonReferenceType(),
- Transform->getBeginLoc());
+ cast<OMPExecutableDirective>(Transform->getDirective())
+ ->getBeginLoc());
Captures[Ref] = Ref;
}
}
@@ -14304,10 +14312,34 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeSimdDirective(
getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
}
+/// Updates OriginalInits by checking Transform against loop transformation
+/// directives and appending their pre-inits if a match is found.
+static void updatePreInits(OMPLoopTransformationDirective *Transform,
+ SmallVectorImpl<Stmt *> &PreInits) {
+ Stmt *Dir = Transform->getDirective();
+ switch (Dir->getStmtClass()) {
+#define STMT(CLASS, PARENT)
+#define ABSTRACT_STMT(CLASS)
+#define COMMON_OMP_LOOP_TRANSFORMATION(CLASS, PARENT) \
+ case Stmt::CLASS##Class: \
+ appendFlattenedStmtList(PreInits, \
+ static_cast<const CLASS *>(Dir)->getPreInits()); \
+ break;
+#define OMPCANONICALLOOPNESTTRANSFORMATIONDIRECTIVE(CLASS, PARENT) \
+ COMMON_OMP_LOOP_TRANSFORMATION(CLASS, PARENT)
+#define OMPCANONICALLOOPSEQUENCETRANSFORMATIONDIRECTIVE(CLASS, PARENT) \
+ COMMON_OMP_LOOP_TRANSFORMATION(CLASS, PARENT)
+#include "clang/AST/StmtNodes.inc"
+#undef COMMON_OMP_LOOP_TRANSFORMATION
+ default:
+ llvm_unreachable("Not a loop transformation");
+ }
+}
+
bool SemaOpenMP::checkTransformableLoopNest(
OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops,
SmallVectorImpl<OMPLoopBasedDirective::HelperExprs> &LoopHelpers,
- Stmt *&Body, SmallVectorImpl<SmallVector<Stmt *, 0>> &OriginalInits) {
+ Stmt *&Body, SmallVectorImpl<SmallVector<Stmt *>> &OriginalInits) {
OriginalInits.emplace_back();
bool Result = OMPLoopBasedDirective::doForAllLoops(
AStmt->IgnoreContainers(), /*TryImperfectlyNestedLoops=*/false, NumLoops,
@@ -14333,29 +14365,278 @@ bool SemaOpenMP::checkTransformableLoopNest(
OriginalInits.emplace_back();
return false;
},
- [&OriginalInits](OMPLoopBasedDirective *Transform) {
- Stmt *DependentPreInits;
- if (auto *Dir = dyn_cast<OMPTileDirective>(Transform))
- DependentPreInits = Dir->getPreInits();
- else if (auto *Dir = dyn_cast<OMPStripeDirective>(Transform))
- DependentPreInits = Dir->getPreInits();
- else if (auto *Dir = dyn_cast<OMPUnrollDirective>(Transform))
- DependentPreInits = Dir->getPreInits();
- else if (auto *Dir = dyn_cast<OMPReverseDirective>(Transform))
- DependentPreInits = Dir->getPreInits();
- else if (auto *Dir = dyn_cast<OMPInterchangeDirective>(Transform))
- DependentPreInits = Dir->getPreInits();
- else
- llvm_unreachable("Unhandled loop transformation");
-
- appendFlattenedStmtList(OriginalInits.back(), DependentPreInits);
+ [&OriginalInits](OMPLoopTransformationDirective *Transform) {
+ updatePreInits(Transform, OriginalInits.back());
});
assert(OriginalInits.back().empty() && "No preinit after innermost loop");
OriginalInits.pop_back();
return Result;
}
-/// Add preinit statements that need to be propageted from the selected loop.
+/// Counts the total number of OpenMP canonical nested loops, including the
+/// outermost loop (the original loop). PRECONDITION of this visitor is that it
+/// must be invoked from the original loop to be analyzed. The traversal stops
+/// for Decl's and Expr's given that they may contain inner loops that must not
+/// be counted.
+///
+/// Example AST structure for the code:
+///
+/// int main() {
+/// #pragma omp fuse
+/// {
+/// for (int i = 0; i < 100; i++) { <-- Outer loop
+/// []() {
+/// for(int j = 0; j < 100; j++) {} <-- NOT A LOOP (1)
+/// };
+/// for(int j = 0; j < 5; ++j) {} <-- Inner loop
+/// }
+/// for (int r = 0; i < 100; i++) { <-- Outer loop
+/// struct LocalClass {
+/// void bar() {
+/// for(int j = 0; j < 100; j++) {} <-- NOT A LOOP (2)
+/// }
+/// };
+/// for(int k = 0; k < 10; ++k) {} <-- Inner loop
+/// {x = 5; for(k = 0; k < 10; ++k) x += k; x}; <-- NOT A LOOP (3)
+/// }
+/// }
+/// }
+/// (1) because in a different function (here: a lambda)
+/// (2) because in a different function (here: class method)
+/// (3) because considered to be intervening-code of non-perfectly nested loop
+/// Result: Loop 'i' contains 2 loops, Loop 'r' also contains 2 loops.
+class NestedLoopCounterVisitor final : public DynamicRecursiveASTVisitor {
+private:
+ unsigned NestedLoopCount = 0;
+
+public:
+ explicit NestedLoopCounterVisitor() = default;
+
+ unsigned getNestedLoopCount() const { return NestedLoopCount; }
+
+ bool VisitForStmt(ForStmt *FS) override {
+ ++NestedLoopCount;
+ return true;
+ }
+
+ bool VisitCXXForRangeStmt(CXXForRangeStmt *FRS) override {
+ ++NestedLoopCount;
+ return true;
+ }
+
+ bool TraverseStmt(Stmt *S) override {
+ if (!S)
+ return true;
+
+ // Skip traversal of all expressions, including special cases like
+ // LambdaExpr, StmtExpr, BlockExpr, and RequiresExpr. These expressions
+ // may contain inner statements (and even loops), but they are not part
+ // of the syntactic body of the surrounding loop structure.
+ // Therefore must not be counted.
+ if (isa<Expr>(S))
+ return true;
+
+ // Only recurse into CompoundStmt (block {}) and loop bodies.
+ if (isa<CompoundStmt, ForStmt, CXXForRangeStmt>(S)) {
+ return DynamicRecursiveASTVisitor::TraverseStmt(S);
+ }
+
+ // Stop traversal of the rest of statements, that break perfect
+ // loop nesting, such as control flow (IfStmt, SwitchStmt...).
+ return true;
+ }
+
+ bool TraverseDecl(Decl *D) override {
+ // Stop in the case of finding a declaration, it is not important
+ // in order to find nested loops (Possible CXXRecordDecl, RecordDecl,
+ // FunctionDecl...).
+ return true;
+ }
+};
+
+bool SemaOpenMP::analyzeLoopSequence(Stmt *LoopSeqStmt,
+ LoopSequenceAnalysis &SeqAnalysis,
+ ASTContext &Context,
+ OpenMPDirectiveKind Kind) {
+ VarsWithInheritedDSAType TmpDSA;
+ // Helper Lambda to handle storing initialization and body statements for
+ // both ForStmt and CXXForRangeStmt.
+ auto StoreLoopStatements = [](LoopAnalysis &Analysis, Stmt *LoopStmt) {
+ if (auto *For = dyn_cast<ForStmt>(LoopStmt)) {
+ Analysis.OriginalInits.push_back(For->getInit());
+ Analysis.ForStmt = For;
+ } else {
+ auto *CXXFor = cast<CXXForRangeStmt>(LoopStmt);
+ Analysis.OriginalInits.push_back(CXXFor->getBeginStmt());
+ Analysis.ForStmt = CXXFor;
+ }
+ };
+
+ // Helper lambda functions to encapsulate the processing of different
+ // derivations of the canonical loop sequence grammar
+ // Modularized code for handling loop generation and transformations.
+ auto AnalyzeLoopGeneration = [&](Stmt *Child) {
+ auto *LoopTransform = cast<OMPLoopTransformationDirective>(Child);
+ Stmt *TransformedStmt = LoopTransform->getTransformedStmt();
+ unsigned NumGeneratedTopLevelLoops =
+ LoopTransform->getNumGeneratedTopLevelLoops();
+ // Handle the case where transformed statement is not available due to
+ // dependent contexts
+ if (!TransformedStmt) {
+ if (NumGeneratedTopLevelLoops > 0) {
+ SeqAnalysis.LoopSeqSize += NumGeneratedTopLevelLoops;
+ return true;
+ }
+ // Unroll full (0 loops produced)
+ Diag(Child->getBeginLoc(), diag::err_omp_not_for)
+ << 0 << getOpenMPDirectiveName(Kind);
+ return false;
+ }
+ // Handle loop transformations with multiple loop nests
+ // Unroll full
+ if (!NumGeneratedTopLevelLoops) {
+ Diag(Child->getBeginLoc(), diag::err_omp_not_for)
+ << 0 << getOpenMPDirectiveName(Kind);
+ return false;
+ }
+ // Loop transformatons such as split or loopranged fuse
+ if (NumGeneratedTopLevelLoops > 1) {
+ // Get the preinits related to this loop sequence generating
+ // loop transformation (i.e loopranged fuse, split...)
+ // These preinits differ slightly from regular inits/pre-inits related
+ // to single loop generating loop transformations (interchange, unroll)
+ // given that they are not bounded to a particular loop nest
+ // so they need to be treated independently
+ updatePreInits(LoopTransform, SeqAnalysis.LoopSequencePreInits);
+ return analyzeLoopSequence(TransformedStmt, SeqAnalysis, Context, Kind);
+ }
+ // Vast majority: (Tile, Unroll, Stripe, Reverse, Interchange, Fuse all)
+ // Process the transformed loop statement
+ LoopAnalysis &NewTransformedSingleLoop =
+ SeqAnalysis.Loops.emplace_back(OMPLoopCategory::TransformSingleLoop);
+ unsigned IsCanonical = checkOpenMPLoop(
+ Kind, nullptr, nullptr, TransformedStmt, SemaRef, *DSAStack, TmpDSA,
+ NewTransformedSingleLoop.HelperExprs);
+
+ if (!IsCanonical)
+ return false;
+
+ StoreLoopStatements(NewTransformedSingleLoop, TransformedStmt);
+ updatePreInits(LoopTransform, NewTransformedSingleLoop.TransformsPreInits);
+
+ SeqAnalysis.LoopSeqSize++;
+ return true;
+ };
+
+ // Modularized code for handling regular canonical loops.
+ auto AnalyzeRegularLoop = [&](Stmt *Child) {
+ LoopAnalysis &NewRegularLoop =
+ SeqAnalysis.Loops.emplace_back(OMPLoopCategory::RegularLoop);
+ unsigned IsCanonical =
+ checkOpenMPLoop(Kind, nullptr, nullptr, Child, SemaRef, *DSAStack,
+ TmpDSA, NewRegularLoop.HelperExprs);
+
+ if (!IsCanonical)
+ return false;
+
+ StoreLoopStatements(NewRegularLoop, Child);
+ NestedLoopCounterVisitor NLCV;
+ NLCV.TraverseStmt(Child);
+ return true;
+ };
+
+ // Helper functions to validate loop sequence grammar derivations.
+ auto IsLoopSequenceDerivation = [](auto *Child) {
+ return isa<ForStmt, CXXForRangeStmt, OMPLoopTransformationDirective>(Child);
+ };
+ // Helper functions to validate loop generating grammar derivations.
+ auto IsLoopGeneratingStmt = [](auto *Child) {
+ return isa<OMPLoopTransformationDirective>(Child);
+ };
+
+ // High level grammar validation.
+ for (Stmt *Child : LoopSeqStmt->children()) {
+ if (!Child)
+ continue;
+ // Skip over non-loop-sequence statements.
+ if (!IsLoopSequenceDerivation(Child)) {
+ Child = Child->IgnoreContainers();
+ // Ignore empty compound statement.
+ if (!Child)
+ continue;
+ // In the case of a nested loop sequence ignoring containers would not
+ // be enough, a recurisve transversal of the loop sequence is required.
+ if (isa<CompoundStmt>(Child)) {
+ if (!analyzeLoopSequence(Child, SeqAnalysis, Context, Kind))
+ return false;
+ // Already been treated, skip this children
+ continue;
+ }
+ }
+ // Regular loop sequence handling.
+ if (IsLoopSequenceDerivation(Child)) {
+ if (IsLoopGeneratingStmt(Child)) {
+ if (!AnalyzeLoopGeneration(Child))
+ return false;
+ // AnalyzeLoopGeneration updates SeqAnalysis.LoopSeqSize accordingly.
+ } else {
+ if (!AnalyzeRegularLoop(Child))
+ return false;
+ SeqAnalysis.LoopSeqSize++;
+ }
+ } else {
+ // Report error for invalid statement inside canonical loop sequence.
+ Diag(Child->getBeginLoc(), diag::err_omp_not_for)
+ << 0 << getOpenMPDirectiveName(Kind);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool SemaOpenMP::checkTransformableLoopSequence(
+ OpenMPDirectiveKind Kind, Stmt *AStmt, LoopSequenceAnalysis &SeqAnalysis,
+ ASTContext &Context) {
+ // Following OpenMP 6.0 API Specification, a Canonical Loop Sequence follows
+ // the grammar:
+ //
+ // canonical-loop-sequence:
+ // {
+ // loop-sequence+
+ // }
+ // where loop-sequence can be any of the following:
+ // 1. canonical-loop-sequence
+ // 2. loop-nest
+ // 3. loop-sequence-generating-construct (i.e OMPLoopTransformationDirective)
+ //
+ // To recognise and traverse this structure the helper function
+ // analyzeLoopSequence serves as the recurisve entry point
+ // and tries to match the input AST to the canonical loop sequence grammar
+ // structure. This function will perform both a semantic and syntactical
+ // analysis of the given statement according to OpenMP 6.0 definition of
+ // the aforementioned canonical loop sequence.
+
+ // We expect an outer compound statement.
+ if (!isa<CompoundStmt>(AStmt)) {
+ Diag(AStmt->getBeginLoc(), diag::err_omp_not_a_loop_sequence)
+ << getOpenMPDirectiveName(Kind);
+ return false;
+ }
+
+ // Recursive entry point to process the main loop sequence
+ if (!analyzeLoopSequence(AStmt, SeqAnalysis, Context, Kind))
+ return false;
+
+ // Diagnose an empty loop sequence.
+ if (SeqAnalysis.LoopSeqSize <= 0) {
+ Diag(AStmt->getBeginLoc(), diag::err_omp_empty_loop_sequence)
+ << getOpenMPDirectiveName(Kind);
+ return false;
+ }
+ return true;
+}
+
+/// Add preinit statements that need to be propagated from the selected loop.
static void addLoopPreInits(ASTContext &Context,
OMPLoopBasedDirective::HelperExprs &LoopHelper,
Stmt *LoopStmt, ArrayRef<Stmt *> OriginalInit,
@@ -14440,7 +14721,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
// Verify and diagnose loop nest.
SmallVector<OMPLoopBasedDirective::HelperExprs, 4> LoopHelpers(NumLoops);
Stmt *Body = nullptr;
- SmallVector<SmallVector<Stmt *, 0>, 4> OriginalInits;
+ SmallVector<SmallVector<Stmt *>, 4> OriginalInits;
if (!checkTransformableLoopNest(OMPD_tile, AStmt, NumLoops, LoopHelpers, Body,
OriginalInits))
return StmtError();
@@ -14717,7 +14998,7 @@ StmtResult SemaOpenMP::ActOnOpenMPStripeDirective(ArrayRef<OMPClause *> Clauses,
// Verify and diagnose loop nest.
SmallVector<OMPLoopBasedDirective::HelperExprs, 4> LoopHelpers(NumLoops);
Stmt *Body = nullptr;
- SmallVector<SmallVector<Stmt *, 0>, 4> OriginalInits;
+ SmallVector<SmallVector<Stmt *>, 4> OriginalInits;
if (!checkTransformableLoopNest(OMPD_stripe, AStmt, NumLoops, LoopHelpers,
Body, OriginalInits))
return StmtError();
@@ -14978,7 +15259,7 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
Stmt *Body = nullptr;
SmallVector<OMPLoopBasedDirective::HelperExprs, NumLoops> LoopHelpers(
NumLoops);
- SmallVector<SmallVector<Stmt *, 0>, NumLoops + 1> OriginalInits;
+ SmallVector<SmallVector<Stmt *>, NumLoops + 1> OriginalInits;
if (!checkTransformableLoopNest(OMPD_unroll, AStmt, NumLoops, LoopHelpers,
Body, OriginalInits))
return StmtError();
@@ -15248,7 +15529,7 @@ StmtResult SemaOpenMP::ActOnOpenMPReverseDirective(Stmt *AStmt,
Stmt *Body = nullptr;
SmallVector<OMPLoopBasedDirective::HelperExprs, NumLoops> LoopHelpers(
NumLoops);
- SmallVector<SmallVector<Stmt *, 0>, NumLoops + 1> OriginalInits;
+ SmallVector<SmallVector<Stmt *>, NumLoops + 1> OriginalInits;
if (!checkTransformableLoopNest(OMPD_reverse, AStmt, NumLoops, LoopHelpers,
Body, OriginalInits))
return StmtError();
@@ -15440,7 +15721,7 @@ StmtResult SemaOpenMP::ActOnOpenMPInterchangeDirective(
// Verify and diagnose loop nest.
SmallVector<OMPLoopBasedDirective::HelperExprs, 4> LoopHelpers(NumLoops);
Stmt *Body = nullptr;
- SmallVector<SmallVector<Stmt *, 0>, 2> OriginalInits;
+ SmallVector<SmallVector<Stmt *>, 2> OriginalInits;
if (!checkTransformableLoopNest(OMPD_interchange, AStmt, NumLoops,
LoopHelpers, Body, OriginalInits))
return StmtError();
@@ -15616,6 +15897,492 @@ StmtResult SemaOpenMP::ActOnOpenMPInterchangeDirective(
buildPreInits(Context, PreInits));
}
+StmtResult SemaOpenMP::ActOnOpenMPFuseDirective(ArrayRef<OMPClause *> Clauses,
+ Stmt *AStmt,
+ SourceLocation StartLoc,
+ SourceLocation EndLoc) {
+
+ ASTContext &Context = getASTContext();
+ DeclContext *CurrContext = SemaRef.CurContext;
+ Scope *CurScope = SemaRef.getCurScope();
+ CaptureVars CopyTransformer(SemaRef);
+
+ // Ensure the structured block is not empty
+ if (!AStmt)
+ return StmtError();
+
+ // Defer transformation in dependent contexts
+ // The NumLoopNests argument is set to a placeholder 1 (even though
+ // using looprange fuse could yield up to 3 top level loop nests)
+ // because a dependent context could prevent determining its true value
+ if (CurrContext->isDependentContext()) {
+ return OMPFuseDirective::Create(Context, StartLoc, EndLoc, Clauses,
+ /* NumLoops */ 1, /* LoopSeqSize */ 1,
+ AStmt, nullptr, nullptr);
+ }
+
+ // Validate that the potential loop sequence is transformable for fusion
+ // Also collect the HelperExprs, Loop Stmts, Inits, and Number of loops
+ LoopSequenceAnalysis SeqAnalysis;
+ if (!checkTransformableLoopSequence(OMPD_fuse, AStmt, SeqAnalysis, Context))
+ return StmtError();
+
+ // SeqAnalysis.LoopSeqSize exists mostly to handle dependent contexts,
+ // otherwise it must be the same as SeqAnalysis.Loops.size().
+ assert(SeqAnalysis.LoopSeqSize == SeqAnalysis.Loops.size());
+
+ // Handle clauses, which can be any of the following: [looprange, apply]
+ const OMPLoopRangeClause *LRC =
+ OMPExecutableDirective::getSingleClause<OMPLoopRangeClause>(Clauses);
+
+ // The clause arguments are invalidated if any error arises
+ // such as non-constant or non-positive arguments
+ if (LRC && (!LRC->getFirst() || !LRC->getCount()))
+ return StmtError();
+
+ // Delayed semantic check of LoopRange constraint
+ // Evaluates the loop range arguments and returns the first and count values
+ auto EvaluateLoopRangeArguments = [&Context](Expr *First, Expr *Count,
+ uint64_t &FirstVal,
+ uint64_t &CountVal) {
+ llvm::APSInt FirstInt = First->EvaluateKnownConstInt(Context);
+ llvm::APSInt CountInt = Count->EvaluateKnownConstInt(Context);
+ FirstVal = FirstInt.getZExtValue();
+ CountVal = CountInt.getZExtValue();
+ };
+
+ // OpenMP [6.0, Restrictions]
+ // first + count - 1 must not evaluate to a value greater than the
+ // loop sequence length of the associated canonical loop sequence.
+ auto ValidLoopRange = [](uint64_t FirstVal, uint64_t CountVal,
+ unsigned NumLoops) -> bool {
+ return FirstVal + CountVal - 1 <= NumLoops;
+ };
+ uint64_t FirstVal = 1, CountVal = 0, LastVal = SeqAnalysis.LoopSeqSize;
+
+ // Validates the loop range after evaluating the semantic information
+ // and ensures that the range is valid for the given loop sequence size.
+ // Expressions are evaluated at compile time to obtain constant values.
+ if (LRC) {
+ EvaluateLoopRangeArguments(LRC->getFirst(), LRC->getCount(), FirstVal,
+ CountVal);
+ if (CountVal == 1)
+ SemaRef.Diag(LRC->getCountLoc(), diag::warn_omp_redundant_fusion)
+ << getOpenMPDirectiveName(OMPD_fuse);
+
+ if (!ValidLoopRange(FirstVal, CountVal, SeqAnalysis.LoopSeqSize)) {
+ SemaRef.Diag(LRC->getFirstLoc(), diag::err_omp_invalid_looprange)
+ << getOpenMPDirectiveName(OMPD_fuse) << FirstVal
+ << (FirstVal + CountVal - 1) << SeqAnalysis.LoopSeqSize;
+ return StmtError();
+ }
+
+ LastVal = FirstVal + CountVal - 1;
+ }
+
+ // Complete fusion generates a single canonical loop nest
+ // However looprange clause may generate several loop nests
+ unsigned NumGeneratedTopLevelLoops =
+ LRC ? SeqAnalysis.LoopSeqSize - CountVal + 1 : 1;
+
+ // Emit a warning for redundant loop fusion when the sequence contains only
+ // one loop.
+ if (SeqAnalysis.LoopSeqSize == 1)
+ SemaRef.Diag(AStmt->getBeginLoc(), diag::warn_omp_redundant_fusion)
+ << getOpenMPDirectiveName(OMPD_fuse);
+
+ // Select the type with the largest bit width among all induction variables
+ QualType IVType =
+ SeqAnalysis.Loops[FirstVal - 1].HelperExprs.IterationVarRef->getType();
+ for (unsigned I = FirstVal; I < LastVal; ++I) {
+ QualType CurrentIVType =
+ SeqAnalysis.Loops[I].HelperExprs.IterationVarRef->getType();
+ if (Context.getTypeSize(CurrentIVType) > Context.getTypeSize(IVType)) {
+ IVType = CurrentIVType;
+ }
+ }
+ uint64_t IVBitWidth = Context.getIntWidth(IVType);
+
+ // Create pre-init declarations for all loops lower bounds, upper bounds,
+ // strides and num-iterations for every top level loop in the fusion
+ SmallVector<VarDecl *, 4> LBVarDecls;
+ SmallVector<VarDecl *, 4> STVarDecls;
+ SmallVector<VarDecl *, 4> NIVarDecls;
+ SmallVector<VarDecl *, 4> UBVarDecls;
+ SmallVector<VarDecl *, 4> IVVarDecls;
+
+ // Helper lambda to create variables for bounds, strides, and other
+ // expressions. Generates both the variable declaration and the corresponding
+ // initialization statement.
+ auto CreateHelperVarAndStmt =
+ [&, &SemaRef = SemaRef](Expr *ExprToCopy, const std::string &BaseName,
+ unsigned I, bool NeedsNewVD = false) {
+ Expr *TransformedExpr =
+ AssertSuccess(CopyTransformer.TransformExpr(ExprToCopy));
+ if (!TransformedExpr)
+ return std::pair<VarDecl *, StmtResult>(nullptr, StmtError());
+
+ auto Name = (Twine(".omp.") + BaseName + std::to_string(I)).str();
+
+ VarDecl *VD;
+ if (NeedsNewVD) {
+ VD = buildVarDecl(SemaRef, SourceLocation(), IVType, Name);
+ SemaRef.AddInitializerToDecl(VD, TransformedExpr, false);
+
+ } else {
+ // Create a unique variable name
+ DeclRefExpr *DRE = cast<DeclRefExpr>(TransformedExpr);
+ VD = cast<VarDecl>(DRE->getDecl());
+ VD->setDeclName(&SemaRef.PP.getIdentifierTable().get(Name));
+ }
+ // Create the corresponding declaration statement
+ StmtResult DeclStmt = new (Context) class DeclStmt(
+ DeclGroupRef(VD), SourceLocation(), SourceLocation());
+ return std::make_pair(VD, DeclStmt);
+ };
+
+ // PreInits hold a sequence of variable declarations that must be executed
+ // before the fused loop begins. These include bounds, strides, and other
+ // helper variables required for the transformation. Other loop transforms
+ // also contain their own preinits
+ SmallVector<Stmt *> PreInits;
+
+ // Update the general preinits using the preinits generated by loop sequence
+ // generating loop transformations. These preinits differ slightly from
+ // single-loop transformation preinits, as they can be detached from a
+ // specific loop inside multiple generated loop nests. This happens
+ // because certain helper variables, like '.omp.fuse.max', are introduced to
+ // handle fused iteration spaces and may not be directly tied to a single
+ // original loop. The preinit structure must ensure that hidden variables
+ // like '.omp.fuse.max' are still properly handled.
+ // Transformations that apply this concept: Loopranged Fuse, Split
+ if (!SeqAnalysis.LoopSequencePreInits.empty()) {
+ llvm::append_range(PreInits, SeqAnalysis.LoopSequencePreInits);
+ }
+
+ // Process each single loop to generate and collect declarations
+ // and statements for all helper expressions related to
+ // particular single loop nests
+
+ // Also In the case of the fused loops, we keep track of their original
+ // inits by appending them to their preinits statement, and in the case of
+ // transformations, also append their preinits (which contain the original
+ // loop initialization statement or other statements)
+
+ // Firstly we need to set TransformIndex to match the begining of the
+ // looprange section
+ unsigned int TransformIndex = 0;
+ for (unsigned I : llvm::seq<unsigned>(FirstVal - 1)) {
+ if (SeqAnalysis.Loops[I].Category == OMPLoopCategory::TransformSingleLoop)
+ ++TransformIndex;
+ }
+
+ for (unsigned int I = FirstVal - 1, J = 0; I < LastVal; ++I, ++J) {
+ if (SeqAnalysis.Loops[I].Category == OMPLoopCategory::RegularLoop) {
+ addLoopPreInits(Context, SeqAnalysis.Loops[I].HelperExprs,
+ SeqAnalysis.Loops[I].ForStmt,
+ SeqAnalysis.Loops[I].OriginalInits, PreInits);
+ } else if (SeqAnalysis.Loops[I].Category ==
+ OMPLoopCategory::TransformSingleLoop) {
+ // For transformed loops, insert both pre-inits and original inits.
+ // Order matters: pre-inits may define variables used in the original
+ // inits such as upper bounds...
+ SmallVector<Stmt *> &TransformPreInit =
+ SeqAnalysis.Loops[TransformIndex++].TransformsPreInits;
+ if (!TransformPreInit.empty())
+ llvm::append_range(PreInits, TransformPreInit);
+
+ addLoopPreInits(Context, SeqAnalysis.Loops[I].HelperExprs,
+ SeqAnalysis.Loops[I].ForStmt,
+ SeqAnalysis.Loops[I].OriginalInits, PreInits);
+ }
+ auto [UBVD, UBDStmt] =
+ CreateHelperVarAndStmt(SeqAnalysis.Loops[I].HelperExprs.UB, "ub", J);
+ auto [LBVD, LBDStmt] =
+ CreateHelperVarAndStmt(SeqAnalysis.Loops[I].HelperExprs.LB, "lb", J);
+ auto [STVD, STDStmt] =
+ CreateHelperVarAndStmt(SeqAnalysis.Loops[I].HelperExprs.ST, "st", J);
+ auto [NIVD, NIDStmt] = CreateHelperVarAndStmt(
+ SeqAnalysis.Loops[I].HelperExprs.NumIterations, "ni", J, true);
+ auto [IVVD, IVDStmt] = CreateHelperVarAndStmt(
+ SeqAnalysis.Loops[I].HelperExprs.IterationVarRef, "iv", J);
+
+ assert(LBVD && STVD && NIVD && IVVD &&
+ "OpenMP Fuse Helper variables creation failed");
+
+ UBVarDecls.push_back(UBVD);
+ LBVarDecls.push_back(LBVD);
+ STVarDecls.push_back(STVD);
+ NIVarDecls.push_back(NIVD);
+ IVVarDecls.push_back(IVVD);
+
+ PreInits.push_back(LBDStmt.get());
+ PreInits.push_back(STDStmt.get());
+ PreInits.push_back(NIDStmt.get());
+ PreInits.push_back(IVDStmt.get());
+ }
+
+ auto MakeVarDeclRef = [&SemaRef = this->SemaRef](VarDecl *VD) {
+ return buildDeclRefExpr(SemaRef, VD, VD->getType(), VD->getLocation(),
+ false);
+ };
+
+ // Following up the creation of the final fused loop will be performed
+ // which has the following shape (considering the selected loops):
+ //
+ // for (fuse.index = 0; fuse.index < max(ni0, ni1..., nik); ++fuse.index) {
+ // if (fuse.index < ni0){
+ // iv0 = lb0 + st0 * fuse.index;
+ // original.index0 = iv0
+ // body(0);
+ // }
+ // if (fuse.index < ni1){
+ // iv1 = lb1 + st1 * fuse.index;
+ // original.index1 = iv1
+ // body(1);
+ // }
+ //
+ // ...
+ //
+ // if (fuse.index < nik){
+ // ivk = lbk + stk * fuse.index;
+ // original.indexk = ivk
+ // body(k); Expr *InitVal = IntegerLiteral::Create(Context,
+ // llvm::APInt(IVWidth, 0),
+ // }
+
+ // 1. Create the initialized fuse index
+ StringRef IndexName = ".omp.fuse.index";
+ Expr *InitVal = IntegerLiteral::Create(Context, llvm::APInt(IVBitWidth, 0),
+ IVType, SourceLocation());
+ VarDecl *IndexDecl =
+ buildVarDecl(SemaRef, {}, IVType, IndexName, nullptr, nullptr);
+ SemaRef.AddInitializerToDecl(IndexDecl, InitVal, false);
+ StmtResult InitStmt = new (Context)
+ DeclStmt(DeclGroupRef(IndexDecl), SourceLocation(), SourceLocation());
+
+ if (!InitStmt.isUsable())
+ return StmtError();
+
+ auto MakeIVRef = [&SemaRef = this->SemaRef, IndexDecl, IVType,
+ Loc = InitVal->getExprLoc()]() {
+ return buildDeclRefExpr(SemaRef, IndexDecl, IVType, Loc, false);
+ };
+
+ // 2. Iteratively compute the max number of logical iterations Max(NI_1, NI_2,
+ // ..., NI_k)
+ //
+ // This loop accumulates the maximum value across multiple expressions,
+ // ensuring each step constructs a unique AST node for correctness. By using
+ // intermediate temporary variables and conditional operators, we maintain
+ // distinct nodes and avoid duplicating subtrees, For instance, max(a,b,c):
+ // omp.temp0 = max(a, b)
+ // omp.temp1 = max(omp.temp0, c)
+ // omp.fuse.max = max(omp.temp1, omp.temp0)
+
+ ExprResult MaxExpr;
+ // I is the true
+ for (unsigned I = FirstVal - 1, J = 0; I < LastVal; ++I, ++J) {
+ DeclRefExpr *NIRef = MakeVarDeclRef(NIVarDecls[J]);
+ QualType NITy = NIRef->getType();
+
+ if (MaxExpr.isUnset()) {
+ // Initialize MaxExpr with the first NI expression
+ MaxExpr = NIRef;
+ } else {
+ // Create a new acummulator variable t_i = MaxExpr
+ std::string TempName = (Twine(".omp.temp.") + Twine(J)).str();
+ VarDecl *TempDecl =
+ buildVarDecl(SemaRef, {}, NITy, TempName, nullptr, nullptr);
+ TempDecl->setInit(MaxExpr.get());
+ DeclRefExpr *TempRef =
+ buildDeclRefExpr(SemaRef, TempDecl, NITy, SourceLocation(), false);
+ DeclRefExpr *TempRef2 =
+ buildDeclRefExpr(SemaRef, TempDecl, NITy, SourceLocation(), false);
+ // Add a DeclStmt to PreInits to ensure the variable is declared.
+ StmtResult TempStmt = new (Context)
+ DeclStmt(DeclGroupRef(TempDecl), SourceLocation(), SourceLocation());
+
+ if (!TempStmt.isUsable())
+ return StmtError();
+ PreInits.push_back(TempStmt.get());
+
+ // Build MaxExpr <-(MaxExpr > NIRef ? MaxExpr : NIRef)
+ ExprResult Comparison =
+ SemaRef.BuildBinOp(nullptr, SourceLocation(), BO_GT, TempRef, NIRef);
+ // Handle any errors in Comparison creation
+ if (!Comparison.isUsable())
+ return StmtError();
+
+ DeclRefExpr *NIRef2 = MakeVarDeclRef(NIVarDecls[J]);
+ // Update MaxExpr using a conditional expression to hold the max value
+ MaxExpr = new (Context) ConditionalOperator(
+ Comparison.get(), SourceLocation(), TempRef2, SourceLocation(),
+ NIRef2->getExprStmt(), NITy, VK_LValue, OK_Ordinary);
+
+ if (!MaxExpr.isUsable())
+ return StmtError();
+ }
+ }
+ if (!MaxExpr.isUsable())
+ return StmtError();
+
+ // 3. Declare the max variable
+ const std::string MaxName = Twine(".omp.fuse.max").str();
+ VarDecl *MaxDecl =
+ buildVarDecl(SemaRef, {}, IVType, MaxName, nullptr, nullptr);
+ MaxDecl->setInit(MaxExpr.get());
+ DeclRefExpr *MaxRef = buildDeclRefExpr(SemaRef, MaxDecl, IVType, {}, false);
+ StmtResult MaxStmt = new (Context)
+ DeclStmt(DeclGroupRef(MaxDecl), SourceLocation(), SourceLocation());
+
+ if (MaxStmt.isInvalid())
+ return StmtError();
+ PreInits.push_back(MaxStmt.get());
+
+ // 4. Create condition Expr: index < n_max
+ ExprResult CondExpr = SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_LT,
+ MakeIVRef(), MaxRef);
+ if (!CondExpr.isUsable())
+ return StmtError();
+
+ // 5. Increment Expr: ++index
+ ExprResult IncrExpr =
+ SemaRef.BuildUnaryOp(CurScope, SourceLocation(), UO_PreInc, MakeIVRef());
+ if (!IncrExpr.isUsable())
+ return StmtError();
+
+ // 6. Build the Fused Loop Body
+ // The final fused loop iterates over the maximum logical range. Inside the
+ // loop, each original loop's index is calculated dynamically, and its body
+ // is executed conditionally.
+ //
+ // Each sub-loop's body is guarded by a conditional statement to ensure
+ // it executes only within its logical iteration range:
+ //
+ // if (fuse.index < ni_k){
+ // iv_k = lb_k + st_k * fuse.index;
+ // original.index = iv_k
+ // body(k);
+ // }
+
+ CompoundStmt *FusedBody = nullptr;
+ SmallVector<Stmt *, 4> FusedBodyStmts;
+ for (unsigned I = FirstVal - 1, J = 0; I < LastVal; ++I, ++J) {
+ // Assingment of the original sub-loop index to compute the logical index
+ // IV_k = LB_k + omp.fuse.index * ST_k
+ ExprResult IdxExpr =
+ SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_Mul,
+ MakeVarDeclRef(STVarDecls[J]), MakeIVRef());
+ if (!IdxExpr.isUsable())
+ return StmtError();
+ IdxExpr = SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_Add,
+ MakeVarDeclRef(LBVarDecls[J]), IdxExpr.get());
+
+ if (!IdxExpr.isUsable())
+ return StmtError();
+ IdxExpr = SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_Assign,
+ MakeVarDeclRef(IVVarDecls[J]), IdxExpr.get());
+ if (!IdxExpr.isUsable())
+ return StmtError();
+
+ // Update the original i_k = IV_k
+ SmallVector<Stmt *, 4> BodyStmts;
+ BodyStmts.push_back(IdxExpr.get());
+ llvm::append_range(BodyStmts, SeqAnalysis.Loops[I].HelperExprs.Updates);
+
+ // If the loop is a CXXForRangeStmt then the iterator variable is needed
+ if (auto *SourceCXXFor =
+ dyn_cast<CXXForRangeStmt>(SeqAnalysis.Loops[I].ForStmt))
+ BodyStmts.push_back(SourceCXXFor->getLoopVarStmt());
+
+ Stmt *Body =
+ (isa<ForStmt>(SeqAnalysis.Loops[I].ForStmt))
+ ? cast<ForStmt>(SeqAnalysis.Loops[I].ForStmt)->getBody()
+ : cast<CXXForRangeStmt>(SeqAnalysis.Loops[I].ForStmt)->getBody();
+ BodyStmts.push_back(Body);
+
+ CompoundStmt *CombinedBody =
+ CompoundStmt::Create(Context, BodyStmts, FPOptionsOverride(),
+ SourceLocation(), SourceLocation());
+ ExprResult Condition =
+ SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_LT, MakeIVRef(),
+ MakeVarDeclRef(NIVarDecls[J]));
+
+ if (!Condition.isUsable())
+ return StmtError();
+
+ IfStmt *IfStatement = IfStmt::Create(
+ Context, SourceLocation(), IfStatementKind::Ordinary, nullptr, nullptr,
+ Condition.get(), SourceLocation(), SourceLocation(), CombinedBody,
+ SourceLocation(), nullptr);
+
+ FusedBodyStmts.push_back(IfStatement);
+ }
+ FusedBody = CompoundStmt::Create(Context, FusedBodyStmts, FPOptionsOverride(),
+ SourceLocation(), SourceLocation());
+
+ // 7. Construct the final fused loop
+ ForStmt *FusedForStmt = new (Context)
+ ForStmt(Context, InitStmt.get(), CondExpr.get(), nullptr, IncrExpr.get(),
+ FusedBody, InitStmt.get()->getBeginLoc(), SourceLocation(),
+ IncrExpr.get()->getEndLoc());
+
+ // In the case of looprange, the result of fuse won't simply
+ // be a single loop (ForStmt), but rather a loop sequence
+ // (CompoundStmt) of 3 parts: the pre-fusion loops, the fused loop
+ // and the post-fusion loops, preserving its original order.
+ //
+ // Note: If looprange clause produces a single fused loop nest then
+ // this compound statement wrapper is unnecessary (Therefore this
+ // treatment is skipped)
+
+ Stmt *FusionStmt = FusedForStmt;
+ if (LRC && CountVal != SeqAnalysis.LoopSeqSize) {
+ SmallVector<Stmt *, 4> FinalLoops;
+
+ // Reset the transform index
+ TransformIndex = 0;
+
+ // Collect all non-fused loops before and after the fused region.
+ // Pre-fusion and post-fusion loops are inserted in order exploiting their
+ // symmetry, along with their corresponding transformation pre-inits if
+ // needed. The fused loop is added between the two regions.
+ for (unsigned I = 0; I < SeqAnalysis.LoopSeqSize; ++I) {
+ if (I >= FirstVal - 1 && I < FirstVal + CountVal - 1) {
+ // Update the Transformation counter to skip already treated
+ // loop transformations
+ if (SeqAnalysis.Loops[I].Category !=
+ OMPLoopCategory::TransformSingleLoop)
+ ++TransformIndex;
+ continue;
+ }
+
+ // No need to handle:
+ // Regular loops: they are kept intact as-is.
+ // Loop-sequence-generating transformations: already handled earlier.
+ // Only TransformSingleLoop requires inserting pre-inits here
+ if (SeqAnalysis.Loops[I].Category ==
+ OMPLoopCategory::TransformSingleLoop) {
+ const auto &TransformPreInit =
+ SeqAnalysis.Loops[TransformIndex++].TransformsPreInits;
+ if (!TransformPreInit.empty())
+ llvm::append_range(PreInits, TransformPreInit);
+ }
+
+ FinalLoops.push_back(SeqAnalysis.Loops[I].ForStmt);
+ }
+
+ FinalLoops.insert(FinalLoops.begin() + (FirstVal - 1), FusedForStmt);
+ FusionStmt = CompoundStmt::Create(Context, FinalLoops, FPOptionsOverride(),
+ SourceLocation(), SourceLocation());
+ }
+ return OMPFuseDirective::Create(Context, StartLoc, EndLoc, Clauses,
+ SeqAnalysis.LoopSeqSize,
+ NumGeneratedTopLevelLoops, AStmt, FusionStmt,
+ buildPreInits(Context, PreInits));
+}
+
OMPClause *SemaOpenMP::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind,
Expr *Expr,
SourceLocation StartLoc,
@@ -16785,6 +17552,31 @@ OMPClause *SemaOpenMP::ActOnOpenMPPartialClause(Expr *FactorExpr,
FactorExpr);
}
+OMPClause *SemaOpenMP::ActOnOpenMPLoopRangeClause(
+ Expr *First, Expr *Count, SourceLocation StartLoc, SourceLocation LParenLoc,
+ SourceLocation FirstLoc, SourceLocation CountLoc, SourceLocation EndLoc) {
+
+ // OpenMP [6.0, Restrictions]
+ // First and Count must be integer expressions with positive value
+ ExprResult FirstVal =
+ VerifyPositiveIntegerConstantInClause(First, OMPC_looprange);
+ if (FirstVal.isInvalid())
+ First = nullptr;
+
+ ExprResult CountVal =
+ VerifyPositiveIntegerConstantInClause(Count, OMPC_looprange);
+ if (CountVal.isInvalid())
+ Count = nullptr;
+
+ // OpenMP [6.0, Restrictions]
+ // first + count - 1 must not evaluate to a value greater than the
+ // loop sequence length of the associated canonical loop sequence.
+ // This check must be performed afterwards due to the delayed
+ // parsing and computation of the associated loop sequence
+ return OMPLoopRangeClause::Create(getASTContext(), StartLoc, LParenLoc,
+ FirstLoc, CountLoc, EndLoc, First, Count);
+}
+
OMPClause *SemaOpenMP::ActOnOpenMPAlignClause(Expr *A, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 242ffb09af006..0ceeb4f9722b4 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -1783,6 +1783,14 @@ class TreeTransform {
LParenLoc, EndLoc);
}
+ OMPClause *
+ RebuildOMPLoopRangeClause(Expr *First, Expr *Count, SourceLocation StartLoc,
+ SourceLocation LParenLoc, SourceLocation FirstLoc,
+ SourceLocation CountLoc, SourceLocation EndLoc) {
+ return getSema().OpenMP().ActOnOpenMPLoopRangeClause(
+ First, Count, StartLoc, LParenLoc, FirstLoc, CountLoc, EndLoc);
+ }
+
/// Build a new OpenMP 'allocator' clause.
///
/// By default, performs semantic analysis to build the new OpenMP clause.
@@ -9607,6 +9615,17 @@ StmtResult TreeTransform<Derived>::TransformOMPInterchangeDirective(
return Res;
}
+template <typename Derived>
+StmtResult
+TreeTransform<Derived>::TransformOMPFuseDirective(OMPFuseDirective *D) {
+ DeclarationNameInfo DirName;
+ getDerived().getSema().OpenMP().StartOpenMPDSABlock(
+ D->getDirectiveKind(), DirName, nullptr, D->getBeginLoc());
+ StmtResult Res = getDerived().TransformOMPExecutableDirective(D);
+ getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get());
+ return Res;
+}
+
template <typename Derived>
StmtResult
TreeTransform<Derived>::TransformOMPForDirective(OMPForDirective *D) {
@@ -10500,6 +10519,31 @@ TreeTransform<Derived>::TransformOMPPartialClause(OMPPartialClause *C) {
C->getEndLoc());
}
+template <typename Derived>
+OMPClause *
+TreeTransform<Derived>::TransformOMPLoopRangeClause(OMPLoopRangeClause *C) {
+ ExprResult F = getDerived().TransformExpr(C->getFirst());
+ if (F.isInvalid())
+ return nullptr;
+
+ ExprResult Cn = getDerived().TransformExpr(C->getCount());
+ if (Cn.isInvalid())
+ return nullptr;
+
+ Expr *First = F.get();
+ Expr *Count = Cn.get();
+
+ bool Changed = (First != C->getFirst()) || (Count != C->getCount());
+
+ // If no changes and AlwaysRebuild() is false, return the original clause
+ if (!Changed && !getDerived().AlwaysRebuild())
+ return C;
+
+ return RebuildOMPLoopRangeClause(First, Count, C->getBeginLoc(),
+ C->getLParenLoc(), C->getFirstLoc(),
+ C->getCountLoc(), C->getEndLoc());
+}
+
template <typename Derived>
OMPClause *
TreeTransform<Derived>::TransformOMPCollapseClause(OMPCollapseClause *C) {
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 9ee8a0fb0f060..c05e428a6fb39 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -11215,6 +11215,9 @@ OMPClause *OMPClauseReader::readClause() {
case llvm::omp::OMPC_partial:
C = OMPPartialClause::CreateEmpty(Context);
break;
+ case llvm::omp::OMPC_looprange:
+ C = OMPLoopRangeClause::CreateEmpty(Context);
+ break;
case llvm::omp::OMPC_allocator:
C = new (Context) OMPAllocatorClause();
break;
@@ -11618,6 +11621,14 @@ void OMPClauseReader::VisitOMPPartialClause(OMPPartialClause *C) {
C->setLParenLoc(Record.readSourceLocation());
}
+void OMPClauseReader::VisitOMPLoopRangeClause(OMPLoopRangeClause *C) {
+ C->setFirst(Record.readSubExpr());
+ C->setCount(Record.readSubExpr());
+ C->setLParenLoc(Record.readSourceLocation());
+ C->setFirstLoc(Record.readSourceLocation());
+ C->setCountLoc(Record.readSourceLocation());
+}
+
void OMPClauseReader::VisitOMPAllocatorClause(OMPAllocatorClause *C) {
C->setAllocator(Record.readExpr());
C->setLParenLoc(Record.readSourceLocation());
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 213c2c2148f64..23e358d68d6d4 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2469,10 +2469,23 @@ void ASTStmtReader::VisitOMPReverseDirective(OMPReverseDirective *D) {
VisitOMPCanonicalLoopNestTransformationDirective(D);
}
+void ASTStmtReader::VisitOMPCanonicalLoopSequenceTransformationDirective(
+ OMPCanonicalLoopSequenceTransformationDirective *D) {
+ VisitStmt(D);
+ // Field NumLoopsInSequence was read in ReadStmtFromStream.
+ Record.skipInts(1);
+ VisitOMPExecutableDirective(D);
+ D->setNumGeneratedTopLevelLoops(Record.readUInt32());
+}
+
void ASTStmtReader::VisitOMPInterchangeDirective(OMPInterchangeDirective *D) {
VisitOMPCanonicalLoopNestTransformationDirective(D);
}
+void ASTStmtReader::VisitOMPFuseDirective(OMPFuseDirective *D) {
+ VisitOMPCanonicalLoopSequenceTransformationDirective(D);
+}
+
void ASTStmtReader::VisitOMPForDirective(OMPForDirective *D) {
VisitOMPLoopDirective(D);
D->setHasCancel(Record.readBool());
@@ -3614,6 +3627,13 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
S = OMPReverseDirective::CreateEmpty(Context, NumLoops);
break;
}
+ case STMT_OMP_FUSE_DIRECTIVE: {
+ unsigned NumLoopsInSequence = Record[ASTStmtReader::NumStmtFields];
+ unsigned NumClauses = Record[ASTStmtReader::NumStmtFields + 1];
+ S = OMPFuseDirective::CreateEmpty(Context, NumClauses,
+ NumLoopsInSequence);
+ break;
+ }
case STMT_OMP_INTERCHANGE_DIRECTIVE: {
unsigned NumLoops = Record[ASTStmtReader::NumStmtFields];
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 2aa77934c08d1..b12cc671d8177 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -7856,6 +7856,14 @@ void OMPClauseWriter::VisitOMPPartialClause(OMPPartialClause *C) {
Record.AddSourceLocation(C->getLParenLoc());
}
+void OMPClauseWriter::VisitOMPLoopRangeClause(OMPLoopRangeClause *C) {
+ Record.AddStmt(C->getFirst());
+ Record.AddStmt(C->getCount());
+ Record.AddSourceLocation(C->getLParenLoc());
+ Record.AddSourceLocation(C->getFirstLoc());
+ Record.AddSourceLocation(C->getCountLoc());
+}
+
void OMPClauseWriter::VisitOMPAllocatorClause(OMPAllocatorClause *C) {
Record.AddStmt(C->getAllocator());
Record.AddSourceLocation(C->getLParenLoc());
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index 21c04ddbc2c7a..99169739cfd5a 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2487,6 +2487,19 @@ void ASTStmtWriter::VisitOMPInterchangeDirective(OMPInterchangeDirective *D) {
Code = serialization::STMT_OMP_INTERCHANGE_DIRECTIVE;
}
+void ASTStmtWriter::VisitOMPCanonicalLoopSequenceTransformationDirective(
+ OMPCanonicalLoopSequenceTransformationDirective *D) {
+ VisitStmt(D);
+ Record.writeUInt32(D->getNumLoopsInSequence());
+ VisitOMPExecutableDirective(D);
+ Record.writeUInt32(D->getNumGeneratedTopLevelLoops());
+}
+
+void ASTStmtWriter::VisitOMPFuseDirective(OMPFuseDirective *D) {
+ VisitOMPCanonicalLoopSequenceTransformationDirective(D);
+ Code = serialization::STMT_OMP_FUSE_DIRECTIVE;
+}
+
void ASTStmtWriter::VisitOMPForDirective(OMPForDirective *D) {
VisitOMPLoopDirective(D);
Record.writeBool(D->hasCancel());
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 785cdfa15bf04..4e472b7fc38b0 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -1814,6 +1814,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
case Stmt::OMPStripeDirectiveClass:
case Stmt::OMPTileDirectiveClass:
case Stmt::OMPInterchangeDirectiveClass:
+ case Stmt::OMPFuseDirectiveClass:
case Stmt::OMPInteropDirectiveClass:
case Stmt::OMPDispatchDirectiveClass:
case Stmt::OMPMaskedDirectiveClass:
diff --git a/clang/test/OpenMP/fuse_ast_print.cpp b/clang/test/OpenMP/fuse_ast_print.cpp
new file mode 100644
index 0000000000000..283f5883c907d
--- /dev/null
+++ b/clang/test/OpenMP/fuse_ast_print.cpp
@@ -0,0 +1,397 @@
+// Check no warnings/errors
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+// Check AST and unparsing
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -ast-dump %s | FileCheck %s --check-prefix=DUMP
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -ast-print %s | FileCheck %s --check-prefix=PRINT
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -include-pch %t -ast-dump-all %s | FileCheck %s --check-prefix=DUMP
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -include-pch %t -ast-print %s | FileCheck %s --check-prefix=PRINT
+
+#ifndef HEADER
+#define HEADER
+
+// placeholder for loop body code
+extern "C" void body(...);
+
+// PRINT-LABEL: void foo1(
+// DUMP-LABEL: FunctionDecl {{.*}} foo1
+void foo1() {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ // PRINT: for (int k = 0; k <= 10; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k <= 10; ++k)
+ // PRINT: body(k)
+ // DUMP: CallExpr
+ body(k);
+
+ }
+
+}
+
+// PRINT-LABEL: void foo2(
+// DUMP-LABEL: FunctionDecl {{.*}} foo2
+void foo2() {
+ // PRINT: #pragma omp unroll partial(4)
+ // DUMP: OMPUnrollDirective
+ // DUMP-NEXT: OMPPartialClause
+ // DUMP-NEXT: ConstantExpr
+ // DUMP-NEXT: value: Int 4
+ // DUMP-NEXT: IntegerLiteral {{.*}} 4
+ #pragma omp unroll partial(4)
+ // PRINT: #pragma omp fuse
+ // DUMP-NEXT: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ }
+
+}
+
+//PRINT-LABEL: void foo3(
+//DUMP-LABEL: FunctionTemplateDecl {{.*}} foo3
+template<int Factor1, int Factor2>
+void foo3() {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: #pragma omp unroll partial(Factor1)
+ // DUMP: OMPUnrollDirective
+ #pragma omp unroll partial(Factor1)
+ // PRINT: for (int i = 0; i < 12; i += 1)
+ // DUMP: ForStmt
+ for (int i = 0; i < 12; i += 1)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: #pragma omp unroll partial(Factor2)
+ // DUMP: OMPUnrollDirective
+ #pragma omp unroll partial(Factor2)
+ // PRINT: for (int k = 0; k <= 10; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k <= 10; ++k)
+ // PRINT: body(k)
+ // DUMP: CallExpr
+ body(k);
+
+ }
+}
+
+// Also test instantiating the template.
+void tfoo3() {
+ foo3<4,2>();
+}
+
+//PRINT-LABEL: void foo4(
+//DUMP-LABEL: FunctionTemplateDecl {{.*}} foo4
+template<typename T, T Step>
+void foo4(int start, int end) {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (T i = start; i < end; i += Step)
+ // DUMP: ForStmt
+ for (T i = start; i < end; i += Step)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+
+ // PRINT: for (T j = end; j > start; j -= Step)
+ // DUMP: ForStmt
+ for (T j = end; j > start; j -= Step) {
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ }
+
+ }
+}
+
+// Also test instantiating the template.
+void tfoo4() {
+ foo4<int, 4>(0, 64);
+}
+
+
+
+// PRINT-LABEL: void foo5(
+// DUMP-LABEL: FunctionDecl {{.*}} foo5
+void foo5() {
+ double arr[128], arr2[128];
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT-NEXT: for (auto &&a : arr)
+ // DUMP-NEXT: CXXForRangeStmt
+ for (auto &&a: arr)
+ // PRINT: body(a)
+ // DUMP: CallExpr
+ body(a);
+ // PRINT: for (double v = 42; auto &&b : arr)
+ // DUMP: CXXForRangeStmt
+ for (double v = 42; auto &&b: arr)
+ // PRINT: body(b, v);
+ // DUMP: CallExpr
+ body(b, v);
+ // PRINT: for (auto &&c : arr2)
+ // DUMP: CXXForRangeStmt
+ for (auto &&c: arr2)
+ // PRINT: body(c)
+ // DUMP: CallExpr
+ body(c);
+
+ }
+
+}
+
+// PRINT-LABEL: void foo6(
+// DUMP-LABEL: FunctionDecl {{.*}} foo6
+void foo6() {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i <= 10; ++i)
+ // DUMP: ForStmt
+ for (int i = 0; i <= 10; ++i)
+ body(i);
+ // PRINT: for (int j = 0; j < 100; ++j)
+ // DUMP: ForStmt
+ for(int j = 0; j < 100; ++j)
+ body(j);
+ }
+ // PRINT: #pragma omp unroll partial(4)
+ // DUMP: OMPUnrollDirective
+ #pragma omp unroll partial(4)
+ // PRINT: for (int k = 0; k < 250; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k < 250; ++k)
+ body(k);
+ }
+}
+
+// PRINT-LABEL: void foo7(
+// DUMP-LABEL: FunctionDecl {{.*}} foo7
+void foo7() {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ }
+ }
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int k = 0; k <= 10; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k <= 10; ++k)
+ // PRINT: body(k)
+ // DUMP: CallExpr
+ body(k);
+ }
+ }
+ }
+ }
+
+}
+
+// PRINT-LABEL: void foo8(
+// DUMP-LABEL: FunctionDecl {{.*}} foo8
+void foo8() {
+ // PRINT: #pragma omp fuse looprange(2,2)
+ // DUMP: OMPFuseDirective
+ // DUMP: OMPLooprangeClause
+ #pragma omp fuse looprange(2,2)
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ // PRINT: for (int k = 0; k <= 10; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k <= 10; ++k)
+ // PRINT: body(k)
+ // DUMP: CallExpr
+ body(k);
+
+ }
+
+}
+
+//PRINT-LABEL: void foo9(
+//DUMP-LABEL: FunctionTemplateDecl {{.*}} foo9
+//DUMP-LABEL: NonTypeTemplateParmDecl {{.*}} F
+//DUMP-LABEL: NonTypeTemplateParmDecl {{.*}} C
+template<int F, int C>
+void foo9() {
+ // PRINT: #pragma omp fuse looprange(F,C)
+ // DUMP: OMPFuseDirective
+ // DUMP: OMPLooprangeClause
+ #pragma omp fuse looprange(F,C)
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+
+ }
+}
+
+// Also test instantiating the template.
+void tfoo9() {
+ foo9<1, 2>();
+}
+
+// PRINT-LABEL: void foo10(
+// DUMP-LABEL: FunctionDecl {{.*}} foo10
+void foo10() {
+ // PRINT: #pragma omp fuse looprange(2,2)
+ // DUMP: OMPFuseDirective
+ // DUMP: OMPLooprangeClause
+ #pragma omp fuse looprange(2,2)
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int ii = 0; ii < 10; ii += 2)
+ // DUMP: ForStmt
+ for (int ii = 0; ii < 10; ii += 2)
+ // PRINT: body(ii)
+ // DUMP: CallExpr
+ body(ii);
+ // PRINT: #pragma omp fuse looprange(2,2)
+ // DUMP: OMPFuseDirective
+ // DUMP: OMPLooprangeClause
+ #pragma omp fuse looprange(2,2)
+ {
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ // PRINT: for (int jj = 10; jj > 0; --jj)
+ // DUMP: ForStmt
+ for (int jj = 10; jj > 0; --jj)
+ // PRINT: body(jj)
+ // DUMP: CallExpr
+ body(jj);
+ // PRINT: for (int k = 0; k <= 10; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k <= 10; ++k)
+ // PRINT: body(k)
+ // DUMP: CallExpr
+ body(k);
+ // PRINT: for (int kk = 0; kk <= 10; ++kk)
+ // DUMP: ForStmt
+ for (int kk = 0; kk <= 10; ++kk)
+ // PRINT: body(kk)
+ // DUMP: CallExpr
+ body(kk);
+ }
+ }
+
+}
+
+#endif
diff --git a/clang/test/OpenMP/fuse_codegen.cpp b/clang/test/OpenMP/fuse_codegen.cpp
new file mode 100644
index 0000000000000..742c280ed0172
--- /dev/null
+++ b/clang/test/OpenMP/fuse_codegen.cpp
@@ -0,0 +1,2328 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 5
+// expected-no-diagnostics
+
+// Check code generation
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -emit-pch -o %t %s
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK2
+
+#ifndef HEADER
+#define HEADER
+
+//placeholder for loop body code.
+extern "C" void body(...) {}
+
+extern "C" void foo1(int start1, int end1, int step1, int start2, int end2, int step2) {
+ int i,j;
+ #pragma omp fuse
+ {
+ for(i = start1; i < end1; i += step1) body(i);
+ for(j = start2; j < end2; j += step2) body(j);
+ }
+
+}
+
+template <typename T>
+void foo2(T start, T end, T step){
+ T i,j,k;
+ #pragma omp fuse
+ {
+ for(i = start; i < end; i += step) body(i);
+ for(j = end; j > start; j -= step) body(j);
+ for(k = start+step; k < end+step; k += step) body(k);
+ }
+}
+
+extern "C" void tfoo2() {
+ foo2<int>(0, 64, 4);
+}
+
+extern "C" void foo3() {
+ double arr[256];
+ #pragma omp fuse
+ {
+ #pragma omp fuse
+ {
+ for(int i = 0; i < 128; ++i) body(i);
+ for(int j = 0; j < 256; j+=2) body(j);
+ }
+ for(int c = 42; auto &&v: arr) body(c,v);
+ for(int cc = 37; auto &&vv: arr) body(cc, vv);
+ }
+}
+
+extern "C" void foo4() {
+ double arr[256];
+
+ #pragma omp fuse looprange(2,2)
+ {
+ for(int i = 0; i < 128; ++i) body(i);
+ for(int j = 0; j < 256; j+=2) body(j);
+ for(int k = 0; k < 64; ++k) body(k);
+ for(int c = 42; auto &&v: arr) body(c,v);
+ }
+}
+
+// This exemplifies the usage of loop transformations that generate
+// more than top level canonical loop nests (e.g split, loopranged fuse...)
+extern "C" void foo5() {
+ double arr[256];
+ #pragma omp fuse looprange(2,2)
+ {
+ #pragma omp fuse looprange(2,2)
+ {
+ for(int i = 0; i < 128; ++i) body(i);
+ for(int j = 0; j < 256; j+=2) body(j);
+ for(int k = 0; k < 512; ++k) body(k);
+ }
+ for(int c = 42; auto &&v: arr) body(c,v);
+ for(int cc = 37; auto &&vv: arr) body(cc, vv);
+ }
+}
+
+
+#endif
+// CHECK1-LABEL: define dso_local void @body(
+// CHECK1-SAME: ...) #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @foo1(
+// CHECK1-SAME: i32 noundef [[START1:%.*]], i32 noundef [[END1:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[START2:%.*]], i32 noundef [[END2:%.*]], i32 noundef [[STEP2:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[START1_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[END1_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[STEP1_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[START2_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[END2_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[STEP2_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_7:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP8:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 [[START1]], ptr [[START1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[END1]], ptr [[END1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[STEP1]], ptr [[STEP1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[START2]], ptr [[START2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[END2]], ptr [[END2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[STEP2]], ptr [[STEP2_ADDR]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[END1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
+// CHECK1-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP9]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[END2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[STEP2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[SUB10:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[ADD12:%.*]] = add i32 [[SUB11]], [[TMP15]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[DIV13:%.*]] = udiv i32 [[ADD12]], [[TMP16]]
+// CHECK1-NEXT: [[SUB14:%.*]] = sub i32 [[DIV13]], 1
+// CHECK1-NEXT: store i32 [[SUB14]], ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK1-NEXT: [[ADD15:%.*]] = add i32 [[TMP17]], 1
+// CHECK1-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP18]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP19]], [[TMP20]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP21]], %[[COND_TRUE]] ], [ [[TMP22]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: [[CMP16:%.*]] = icmp ult i32 [[TMP23]], [[TMP24]]
+// CHECK1-NEXT: br i1 [[CMP16]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP17:%.*]] = icmp ult i32 [[TMP25]], [[TMP26]]
+// CHECK1-NEXT: br i1 [[CMP17]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP28]], [[TMP29]]
+// CHECK1-NEXT: [[ADD18:%.*]] = add i32 [[TMP27]], [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[MUL19:%.*]] = mul i32 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: [[ADD20:%.*]] = add i32 [[TMP30]], [[MUL19]]
+// CHECK1-NEXT: store i32 [[ADD20]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP33]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP21:%.*]] = icmp ult i32 [[TMP34]], [[TMP35]]
+// CHECK1-NEXT: br i1 [[CMP21]], label %[[IF_THEN22:.*]], label %[[IF_END27:.*]]
+// CHECK1: [[IF_THEN22]]:
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL23:%.*]] = mul i32 [[TMP37]], [[TMP38]]
+// CHECK1-NEXT: [[ADD24:%.*]] = add i32 [[TMP36]], [[MUL23]]
+// CHECK1-NEXT: store i32 [[ADD24]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[MUL25:%.*]] = mul i32 [[TMP40]], [[TMP41]]
+// CHECK1-NEXT: [[ADD26:%.*]] = add i32 [[TMP39]], [[MUL25]]
+// CHECK1-NEXT: store i32 [[ADD26]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP42]])
+// CHECK1-NEXT: br label %[[IF_END27]]
+// CHECK1: [[IF_END27]]:
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add i32 [[TMP43]], 1
+// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @tfoo2(
+// CHECK1-SAME: ) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: call void @_Z4foo2IiEvT_S0_S0_(i32 noundef 0, i32 noundef 64, i32 noundef 4)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define linkonce_odr void @_Z4foo2IiEvT_S0_S0_(
+// CHECK1-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] comdat {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[STEP_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_7:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP8:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_17:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_19:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP21:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_22:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
+// CHECK1-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP9]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK1-NEXT: [[SUB10:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[ADD12:%.*]] = add i32 [[SUB11]], [[TMP15]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[DIV13:%.*]] = udiv i32 [[ADD12]], [[TMP16]]
+// CHECK1-NEXT: [[SUB14:%.*]] = sub i32 [[DIV13]], 1
+// CHECK1-NEXT: store i32 [[SUB14]], ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK1-NEXT: [[ADD15:%.*]] = add i32 [[TMP17]], 1
+// CHECK1-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK1-NEXT: store i32 [[ADD16]], ptr [[K]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: [[ADD18:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: [[ADD20:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP24]], ptr [[DOTNEW_STEP21]], align 4
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK1-NEXT: [[SUB23:%.*]] = sub i32 [[TMP25]], [[TMP26]]
+// CHECK1-NEXT: [[SUB24:%.*]] = sub i32 [[SUB23]], 1
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK1-NEXT: [[ADD25:%.*]] = add i32 [[SUB24]], [[TMP27]]
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK1-NEXT: [[DIV26:%.*]] = udiv i32 [[ADD25]], [[TMP28]]
+// CHECK1-NEXT: [[SUB27:%.*]] = sub i32 [[DIV26]], 1
+// CHECK1-NEXT: store i32 [[SUB27]], ptr [[DOTCAPTURE_EXPR_22]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB2]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST2]], align 4
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_22]], align 4
+// CHECK1-NEXT: [[ADD28:%.*]] = add i32 [[TMP29]], 1
+// CHECK1-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_NI2]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP30]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP33]], %[[COND_TRUE]] ], [ [[TMP34]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK1-NEXT: [[CMP29:%.*]] = icmp ugt i32 [[TMP35]], [[TMP36]]
+// CHECK1-NEXT: br i1 [[CMP29]], label %[[COND_TRUE30:.*]], label %[[COND_FALSE31:.*]]
+// CHECK1: [[COND_TRUE30]]:
+// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK1-NEXT: br label %[[COND_END32:.*]]
+// CHECK1: [[COND_FALSE31]]:
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK1-NEXT: br label %[[COND_END32]]
+// CHECK1: [[COND_END32]]:
+// CHECK1-NEXT: [[COND33:%.*]] = phi i32 [ [[TMP37]], %[[COND_TRUE30]] ], [ [[TMP38]], %[[COND_FALSE31]] ]
+// CHECK1-NEXT: store i32 [[COND33]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: [[CMP34:%.*]] = icmp ult i32 [[TMP39]], [[TMP40]]
+// CHECK1-NEXT: br i1 [[CMP34]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP35:%.*]] = icmp ult i32 [[TMP41]], [[TMP42]]
+// CHECK1-NEXT: br i1 [[CMP35]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP45:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP44]], [[TMP45]]
+// CHECK1-NEXT: [[ADD36:%.*]] = add i32 [[TMP43]], [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD36]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[MUL37:%.*]] = mul i32 [[TMP47]], [[TMP48]]
+// CHECK1-NEXT: [[ADD38:%.*]] = add i32 [[TMP46]], [[MUL37]]
+// CHECK1-NEXT: store i32 [[ADD38]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP49:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP49]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP39:%.*]] = icmp ult i32 [[TMP50]], [[TMP51]]
+// CHECK1-NEXT: br i1 [[CMP39]], label %[[IF_THEN40:.*]], label %[[IF_END45:.*]]
+// CHECK1: [[IF_THEN40]]:
+// CHECK1-NEXT: [[TMP52:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP53:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP54:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL41:%.*]] = mul i32 [[TMP53]], [[TMP54]]
+// CHECK1-NEXT: [[ADD42:%.*]] = add i32 [[TMP52]], [[MUL41]]
+// CHECK1-NEXT: store i32 [[ADD42]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[MUL43:%.*]] = mul i32 [[TMP56]], [[TMP57]]
+// CHECK1-NEXT: [[SUB44:%.*]] = sub i32 [[TMP55]], [[MUL43]]
+// CHECK1-NEXT: store i32 [[SUB44]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP58:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP58]])
+// CHECK1-NEXT: br label %[[IF_END45]]
+// CHECK1: [[IF_END45]]:
+// CHECK1-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK1-NEXT: [[CMP46:%.*]] = icmp ult i32 [[TMP59]], [[TMP60]]
+// CHECK1-NEXT: br i1 [[CMP46]], label %[[IF_THEN47:.*]], label %[[IF_END52:.*]]
+// CHECK1: [[IF_THEN47]]:
+// CHECK1-NEXT: [[TMP61:%.*]] = load i32, ptr [[DOTOMP_LB2]], align 4
+// CHECK1-NEXT: [[TMP62:%.*]] = load i32, ptr [[DOTOMP_ST2]], align 4
+// CHECK1-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL48:%.*]] = mul i32 [[TMP62]], [[TMP63]]
+// CHECK1-NEXT: [[ADD49:%.*]] = add i32 [[TMP61]], [[MUL48]]
+// CHECK1-NEXT: store i32 [[ADD49]], ptr [[DOTOMP_IV2]], align 4
+// CHECK1-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK1-NEXT: [[TMP65:%.*]] = load i32, ptr [[DOTOMP_IV2]], align 4
+// CHECK1-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK1-NEXT: [[MUL50:%.*]] = mul i32 [[TMP65]], [[TMP66]]
+// CHECK1-NEXT: [[ADD51:%.*]] = add i32 [[TMP64]], [[MUL50]]
+// CHECK1-NEXT: store i32 [[ADD51]], ptr [[K]], align 4
+// CHECK1-NEXT: [[TMP67:%.*]] = load i32, ptr [[K]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP67]])
+// CHECK1-NEXT: br label %[[IF_END52]]
+// CHECK1: [[IF_END52]]:
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP68:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add i32 [[TMP68]], 1
+// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @foo3(
+// CHECK1-SAME: ) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB03:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST04:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI05:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV06:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_LB116:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_ST117:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_NI118:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV120:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CC:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE221:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END222:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN225:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_27:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_29:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_30:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_LB2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_ST2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_NI2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_TEMP_140:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_TEMP_2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX46:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX52:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[VV:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: store i32 128, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB03]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST04]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK1-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64
+// CHECK1-NEXT: store i64 [[CONV]], ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP8]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP9]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY7]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY9:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP10]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY9]], ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT: store ptr [[TMP11]], ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK1-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP13]] to i64
+// CHECK1-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK1-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK1-NEXT: [[SUB12:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i64 [[SUB12]], 1
+// CHECK1-NEXT: [[DIV14:%.*]] = sdiv i64 [[ADD13]], 1
+// CHECK1-NEXT: [[SUB15:%.*]] = sub nsw i64 [[DIV14]], 1
+// CHECK1-NEXT: store i64 [[SUB15]], ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_LB116]], align 8
+// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_ST117]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK1-NEXT: [[ADD19:%.*]] = add nsw i64 [[TMP14]], 1
+// CHECK1-NEXT: store i64 [[ADD19]], ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: store i32 37, ptr [[CC]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE221]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY23:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP15]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR24:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY23]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR24]], ptr [[__END222]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY26:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP16]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY26]], ptr [[__BEGIN225]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY28:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP17]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY28]], ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__END222]], align 8
+// CHECK1-NEXT: store ptr [[TMP18]], ptr [[DOTCAPTURE_EXPR_29]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_29]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK1-NEXT: [[SUB_PTR_LHS_CAST31:%.*]] = ptrtoint ptr [[TMP19]] to i64
+// CHECK1-NEXT: [[SUB_PTR_RHS_CAST32:%.*]] = ptrtoint ptr [[TMP20]] to i64
+// CHECK1-NEXT: [[SUB_PTR_SUB33:%.*]] = sub i64 [[SUB_PTR_LHS_CAST31]], [[SUB_PTR_RHS_CAST32]]
+// CHECK1-NEXT: [[SUB_PTR_DIV34:%.*]] = sdiv exact i64 [[SUB_PTR_SUB33]], 8
+// CHECK1-NEXT: [[SUB35:%.*]] = sub nsw i64 [[SUB_PTR_DIV34]], 1
+// CHECK1-NEXT: [[ADD36:%.*]] = add nsw i64 [[SUB35]], 1
+// CHECK1-NEXT: [[DIV37:%.*]] = sdiv i64 [[ADD36]], 1
+// CHECK1-NEXT: [[SUB38:%.*]] = sub nsw i64 [[DIV37]], 1
+// CHECK1-NEXT: store i64 [[SUB38]], ptr [[DOTCAPTURE_EXPR_30]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_LB2]], align 8
+// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_ST2]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_30]], align 8
+// CHECK1-NEXT: [[ADD39:%.*]] = add nsw i64 [[TMP21]], 1
+// CHECK1-NEXT: store i64 [[ADD39]], ptr [[DOTOMP_NI2]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: store i64 [[TMP22]], ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK1-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK1-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[CMP41:%.*]] = icmp sgt i64 [[TMP23]], [[TMP24]]
+// CHECK1-NEXT: br i1 [[CMP41]], label %[[COND_TRUE42:.*]], label %[[COND_FALSE43:.*]]
+// CHECK1: [[COND_TRUE42]]:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK1-NEXT: br label %[[COND_END44:.*]]
+// CHECK1: [[COND_FALSE43]]:
+// CHECK1-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: br label %[[COND_END44]]
+// CHECK1: [[COND_END44]]:
+// CHECK1-NEXT: [[COND45:%.*]] = phi i64 [ [[TMP25]], %[[COND_TRUE42]] ], [ [[TMP26]], %[[COND_FALSE43]] ]
+// CHECK1-NEXT: store i64 [[COND45]], ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK1-NEXT: [[TMP27:%.*]] = load i64, ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK1-NEXT: [[CMP47:%.*]] = icmp sgt i64 [[TMP27]], [[TMP28]]
+// CHECK1-NEXT: br i1 [[CMP47]], label %[[COND_TRUE48:.*]], label %[[COND_FALSE49:.*]]
+// CHECK1: [[COND_TRUE48]]:
+// CHECK1-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK1-NEXT: br label %[[COND_END50:.*]]
+// CHECK1: [[COND_FALSE49]]:
+// CHECK1-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK1-NEXT: br label %[[COND_END50]]
+// CHECK1: [[COND_END50]]:
+// CHECK1-NEXT: [[COND51:%.*]] = phi i64 [ [[TMP29]], %[[COND_TRUE48]] ], [ [[TMP30]], %[[COND_FALSE49]] ]
+// CHECK1-NEXT: store i64 [[COND51]], ptr [[DOTOMP_FUSE_MAX46]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP31:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[TMP32:%.*]] = load i64, ptr [[DOTOMP_FUSE_MAX46]], align 8
+// CHECK1-NEXT: [[CMP53:%.*]] = icmp slt i64 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: br i1 [[CMP53]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP33:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[TMP34:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: [[CMP54:%.*]] = icmp slt i64 [[TMP33]], [[TMP34]]
+// CHECK1-NEXT: br i1 [[CMP54]], label %[[IF_THEN:.*]], label %[[IF_END74:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_LB03]], align 4
+// CHECK1-NEXT: [[CONV55:%.*]] = sext i32 [[TMP35]] to i64
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_ST04]], align 4
+// CHECK1-NEXT: [[CONV56:%.*]] = sext i32 [[TMP36]] to i64
+// CHECK1-NEXT: [[TMP37:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV56]], [[TMP37]]
+// CHECK1-NEXT: [[ADD57:%.*]] = add nsw i64 [[CONV55]], [[MUL]]
+// CHECK1-NEXT: [[CONV58:%.*]] = trunc i64 [[ADD57]] to i32
+// CHECK1-NEXT: store i32 [[CONV58]], ptr [[DOTOMP_IV06]], align 4
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_IV06]], align 4
+// CHECK1-NEXT: [[MUL59:%.*]] = mul nsw i32 [[TMP38]], 1
+// CHECK1-NEXT: [[ADD60:%.*]] = add nsw i32 0, [[MUL59]]
+// CHECK1-NEXT: store i32 [[ADD60]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP61:%.*]] = icmp slt i32 [[TMP39]], [[TMP40]]
+// CHECK1-NEXT: br i1 [[CMP61]], label %[[IF_THEN62:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN62]]:
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL63:%.*]] = mul nsw i32 [[TMP42]], [[TMP43]]
+// CHECK1-NEXT: [[ADD64:%.*]] = add nsw i32 [[TMP41]], [[MUL63]]
+// CHECK1-NEXT: store i32 [[ADD64]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[MUL65:%.*]] = mul nsw i32 [[TMP44]], 1
+// CHECK1-NEXT: [[ADD66:%.*]] = add nsw i32 0, [[MUL65]]
+// CHECK1-NEXT: store i32 [[ADD66]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP45:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP45]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP67:%.*]] = icmp slt i32 [[TMP46]], [[TMP47]]
+// CHECK1-NEXT: br i1 [[CMP67]], label %[[IF_THEN68:.*]], label %[[IF_END73:.*]]
+// CHECK1: [[IF_THEN68]]:
+// CHECK1-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP49:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL69:%.*]] = mul nsw i32 [[TMP49]], [[TMP50]]
+// CHECK1-NEXT: [[ADD70:%.*]] = add nsw i32 [[TMP48]], [[MUL69]]
+// CHECK1-NEXT: store i32 [[ADD70]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[MUL71:%.*]] = mul nsw i32 [[TMP51]], 2
+// CHECK1-NEXT: [[ADD72:%.*]] = add nsw i32 0, [[MUL71]]
+// CHECK1-NEXT: store i32 [[ADD72]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP52:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP52]])
+// CHECK1-NEXT: br label %[[IF_END73]]
+// CHECK1: [[IF_END73]]:
+// CHECK1-NEXT: br label %[[IF_END74]]
+// CHECK1: [[IF_END74]]:
+// CHECK1-NEXT: [[TMP53:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[TMP54:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[CMP75:%.*]] = icmp slt i64 [[TMP53]], [[TMP54]]
+// CHECK1-NEXT: br i1 [[CMP75]], label %[[IF_THEN76:.*]], label %[[IF_END81:.*]]
+// CHECK1: [[IF_THEN76]]:
+// CHECK1-NEXT: [[TMP55:%.*]] = load i64, ptr [[DOTOMP_LB116]], align 8
+// CHECK1-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTOMP_ST117]], align 8
+// CHECK1-NEXT: [[TMP57:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[MUL77:%.*]] = mul nsw i64 [[TMP56]], [[TMP57]]
+// CHECK1-NEXT: [[ADD78:%.*]] = add nsw i64 [[TMP55]], [[MUL77]]
+// CHECK1-NEXT: store i64 [[ADD78]], ptr [[DOTOMP_IV120]], align 8
+// CHECK1-NEXT: [[TMP58:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[TMP59:%.*]] = load i64, ptr [[DOTOMP_IV120]], align 8
+// CHECK1-NEXT: [[MUL79:%.*]] = mul nsw i64 [[TMP59]], 1
+// CHECK1-NEXT: [[ADD_PTR80:%.*]] = getelementptr inbounds double, ptr [[TMP58]], i64 [[MUL79]]
+// CHECK1-NEXT: store ptr [[ADD_PTR80]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: store ptr [[TMP60]], ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP61:%.*]] = load i32, ptr [[C]], align 4
+// CHECK1-NEXT: [[TMP62:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP63:%.*]] = load double, ptr [[TMP62]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP61]], double noundef [[TMP63]])
+// CHECK1-NEXT: br label %[[IF_END81]]
+// CHECK1: [[IF_END81]]:
+// CHECK1-NEXT: [[TMP64:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[TMP65:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK1-NEXT: [[CMP82:%.*]] = icmp slt i64 [[TMP64]], [[TMP65]]
+// CHECK1-NEXT: br i1 [[CMP82]], label %[[IF_THEN83:.*]], label %[[IF_END88:.*]]
+// CHECK1: [[IF_THEN83]]:
+// CHECK1-NEXT: [[TMP66:%.*]] = load i64, ptr [[DOTOMP_LB2]], align 8
+// CHECK1-NEXT: [[TMP67:%.*]] = load i64, ptr [[DOTOMP_ST2]], align 8
+// CHECK1-NEXT: [[TMP68:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[MUL84:%.*]] = mul nsw i64 [[TMP67]], [[TMP68]]
+// CHECK1-NEXT: [[ADD85:%.*]] = add nsw i64 [[TMP66]], [[MUL84]]
+// CHECK1-NEXT: store i64 [[ADD85]], ptr [[DOTOMP_IV2]], align 8
+// CHECK1-NEXT: [[TMP69:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK1-NEXT: [[TMP70:%.*]] = load i64, ptr [[DOTOMP_IV2]], align 8
+// CHECK1-NEXT: [[MUL86:%.*]] = mul nsw i64 [[TMP70]], 1
+// CHECK1-NEXT: [[ADD_PTR87:%.*]] = getelementptr inbounds double, ptr [[TMP69]], i64 [[MUL86]]
+// CHECK1-NEXT: store ptr [[ADD_PTR87]], ptr [[__BEGIN225]], align 8
+// CHECK1-NEXT: [[TMP71:%.*]] = load ptr, ptr [[__BEGIN225]], align 8
+// CHECK1-NEXT: store ptr [[TMP71]], ptr [[VV]], align 8
+// CHECK1-NEXT: [[TMP72:%.*]] = load i32, ptr [[CC]], align 4
+// CHECK1-NEXT: [[TMP73:%.*]] = load ptr, ptr [[VV]], align 8
+// CHECK1-NEXT: [[TMP74:%.*]] = load double, ptr [[TMP73]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP72]], double noundef [[TMP74]])
+// CHECK1-NEXT: br label %[[IF_END88]]
+// CHECK1: [[IF_END88]]:
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP75:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i64 [[TMP75]], 1
+// CHECK1-NEXT: store i64 [[INC]], ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @foo4(
+// CHECK1-SAME: ) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[K]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: store i32 64, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 128
+// CHECK1-NEXT: br i1 [[CMP1]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP6]])
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK1-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND2:.*]]
+// CHECK1: [[FOR_COND2]]:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: [[CMP3:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: br i1 [[CMP3]], label %[[FOR_BODY4:.*]], label %[[FOR_END17:.*]]
+// CHECK1: [[FOR_BODY4]]:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP10]], [[TMP11]]
+// CHECK1-NEXT: br i1 [[CMP5]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP15]], 2
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP16]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP8:%.*]] = icmp slt i32 [[TMP17]], [[TMP18]]
+// CHECK1-NEXT: br i1 [[CMP8]], label %[[IF_THEN9:.*]], label %[[IF_END14:.*]]
+// CHECK1: [[IF_THEN9]]:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL10:%.*]] = mul nsw i32 [[TMP20]], [[TMP21]]
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP19]], [[MUL10]]
+// CHECK1-NEXT: store i32 [[ADD11]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[MUL12:%.*]] = mul nsw i32 [[TMP22]], 1
+// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]]
+// CHECK1-NEXT: store i32 [[ADD13]], ptr [[K]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[K]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP23]])
+// CHECK1-NEXT: br label %[[IF_END14]]
+// CHECK1: [[IF_END14]]:
+// CHECK1-NEXT: br label %[[FOR_INC15:.*]]
+// CHECK1: [[FOR_INC15]]:
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[INC16:%.*]] = add nsw i32 [[TMP24]], 1
+// CHECK1-NEXT: store i32 [[INC16]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND2]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK1: [[FOR_END17]]:
+// CHECK1-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[TMP25:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP25]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY18:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP26]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY18]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND19:.*]]
+// CHECK1: [[FOR_COND19]]:
+// CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT: [[CMP20:%.*]] = icmp ne ptr [[TMP27]], [[TMP28]]
+// CHECK1-NEXT: br i1 [[CMP20]], label %[[FOR_BODY21:.*]], label %[[FOR_END23:.*]]
+// CHECK1: [[FOR_BODY21]]:
+// CHECK1-NEXT: [[TMP29:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: store ptr [[TMP29]], ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[C]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP32:%.*]] = load double, ptr [[TMP31]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP30]], double noundef [[TMP32]])
+// CHECK1-NEXT: br label %[[FOR_INC22:.*]]
+// CHECK1: [[FOR_INC22]]:
+// CHECK1-NEXT: [[TMP33:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP33]], i32 1
+// CHECK1-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND19]]
+// CHECK1: [[FOR_END23]]:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @foo5(
+// CHECK1-SAME: ) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB03:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST04:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI05:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV06:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_LB116:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_ST117:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_NI118:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV120:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_TEMP_121:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX22:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX29:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[CC:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE264:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN265:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END267:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[VV:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[K]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: store i32 512, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB03]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST04]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK1-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64
+// CHECK1-NEXT: store i64 [[CONV]], ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP8]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP9]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY7]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY9:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP10]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY9]], ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT: store ptr [[TMP11]], ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK1-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP13]] to i64
+// CHECK1-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK1-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK1-NEXT: [[SUB12:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i64 [[SUB12]], 1
+// CHECK1-NEXT: [[DIV14:%.*]] = sdiv i64 [[ADD13]], 1
+// CHECK1-NEXT: [[SUB15:%.*]] = sub nsw i64 [[DIV14]], 1
+// CHECK1-NEXT: store i64 [[SUB15]], ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_LB116]], align 8
+// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_ST117]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK1-NEXT: [[ADD19:%.*]] = add nsw i64 [[TMP14]], 1
+// CHECK1-NEXT: store i64 [[ADD19]], ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT: br i1 [[CMP23]], label %[[COND_TRUE24:.*]], label %[[COND_FALSE25:.*]]
+// CHECK1: [[COND_TRUE24]]:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK1-NEXT: br label %[[COND_END26:.*]]
+// CHECK1: [[COND_FALSE25]]:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: br label %[[COND_END26]]
+// CHECK1: [[COND_END26]]:
+// CHECK1-NEXT: [[COND27:%.*]] = phi i64 [ [[TMP18]], %[[COND_TRUE24]] ], [ [[TMP19]], %[[COND_FALSE25]] ]
+// CHECK1-NEXT: store i64 [[COND27]], ptr [[DOTOMP_FUSE_MAX22]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[CMP28:%.*]] = icmp slt i32 [[TMP20]], 128
+// CHECK1-NEXT: br i1 [[CMP28]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP21]])
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP22]], 1
+// CHECK1-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND30:.*]]
+// CHECK1: [[FOR_COND30]]:
+// CHECK1-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_FUSE_MAX22]], align 8
+// CHECK1-NEXT: [[CMP31:%.*]] = icmp slt i64 [[TMP23]], [[TMP24]]
+// CHECK1-NEXT: br i1 [[CMP31]], label %[[FOR_BODY32:.*]], label %[[FOR_END63:.*]]
+// CHECK1: [[FOR_BODY32]]:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: [[CMP33:%.*]] = icmp slt i64 [[TMP25]], [[TMP26]]
+// CHECK1-NEXT: br i1 [[CMP33]], label %[[IF_THEN:.*]], label %[[IF_END53:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB03]], align 4
+// CHECK1-NEXT: [[CONV34:%.*]] = sext i32 [[TMP27]] to i64
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_ST04]], align 4
+// CHECK1-NEXT: [[CONV35:%.*]] = sext i32 [[TMP28]] to i64
+// CHECK1-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV35]], [[TMP29]]
+// CHECK1-NEXT: [[ADD36:%.*]] = add nsw i64 [[CONV34]], [[MUL]]
+// CHECK1-NEXT: [[CONV37:%.*]] = trunc i64 [[ADD36]] to i32
+// CHECK1-NEXT: store i32 [[CONV37]], ptr [[DOTOMP_IV06]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV06]], align 4
+// CHECK1-NEXT: [[MUL38:%.*]] = mul nsw i32 [[TMP30]], 1
+// CHECK1-NEXT: [[ADD39:%.*]] = add nsw i32 0, [[MUL38]]
+// CHECK1-NEXT: store i32 [[ADD39]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP40:%.*]] = icmp slt i32 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: br i1 [[CMP40]], label %[[IF_THEN41:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN41]]:
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL42:%.*]] = mul nsw i32 [[TMP34]], [[TMP35]]
+// CHECK1-NEXT: [[ADD43:%.*]] = add nsw i32 [[TMP33]], [[MUL42]]
+// CHECK1-NEXT: store i32 [[ADD43]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[MUL44:%.*]] = mul nsw i32 [[TMP36]], 2
+// CHECK1-NEXT: [[ADD45:%.*]] = add nsw i32 0, [[MUL44]]
+// CHECK1-NEXT: store i32 [[ADD45]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP37]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP46:%.*]] = icmp slt i32 [[TMP38]], [[TMP39]]
+// CHECK1-NEXT: br i1 [[CMP46]], label %[[IF_THEN47:.*]], label %[[IF_END52:.*]]
+// CHECK1: [[IF_THEN47]]:
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL48:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]]
+// CHECK1-NEXT: [[ADD49:%.*]] = add nsw i32 [[TMP40]], [[MUL48]]
+// CHECK1-NEXT: store i32 [[ADD49]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[MUL50:%.*]] = mul nsw i32 [[TMP43]], 1
+// CHECK1-NEXT: [[ADD51:%.*]] = add nsw i32 0, [[MUL50]]
+// CHECK1-NEXT: store i32 [[ADD51]], ptr [[K]], align 4
+// CHECK1-NEXT: [[TMP44:%.*]] = load i32, ptr [[K]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP44]])
+// CHECK1-NEXT: br label %[[IF_END52]]
+// CHECK1: [[IF_END52]]:
+// CHECK1-NEXT: br label %[[IF_END53]]
+// CHECK1: [[IF_END53]]:
+// CHECK1-NEXT: [[TMP45:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[TMP46:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[CMP54:%.*]] = icmp slt i64 [[TMP45]], [[TMP46]]
+// CHECK1-NEXT: br i1 [[CMP54]], label %[[IF_THEN55:.*]], label %[[IF_END60:.*]]
+// CHECK1: [[IF_THEN55]]:
+// CHECK1-NEXT: [[TMP47:%.*]] = load i64, ptr [[DOTOMP_LB116]], align 8
+// CHECK1-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTOMP_ST117]], align 8
+// CHECK1-NEXT: [[TMP49:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[MUL56:%.*]] = mul nsw i64 [[TMP48]], [[TMP49]]
+// CHECK1-NEXT: [[ADD57:%.*]] = add nsw i64 [[TMP47]], [[MUL56]]
+// CHECK1-NEXT: store i64 [[ADD57]], ptr [[DOTOMP_IV120]], align 8
+// CHECK1-NEXT: [[TMP50:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[TMP51:%.*]] = load i64, ptr [[DOTOMP_IV120]], align 8
+// CHECK1-NEXT: [[MUL58:%.*]] = mul nsw i64 [[TMP51]], 1
+// CHECK1-NEXT: [[ADD_PTR59:%.*]] = getelementptr inbounds double, ptr [[TMP50]], i64 [[MUL58]]
+// CHECK1-NEXT: store ptr [[ADD_PTR59]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP52:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: store ptr [[TMP52]], ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP53:%.*]] = load i32, ptr [[C]], align 4
+// CHECK1-NEXT: [[TMP54:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP55:%.*]] = load double, ptr [[TMP54]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP53]], double noundef [[TMP55]])
+// CHECK1-NEXT: br label %[[IF_END60]]
+// CHECK1: [[IF_END60]]:
+// CHECK1-NEXT: br label %[[FOR_INC61:.*]]
+// CHECK1: [[FOR_INC61]]:
+// CHECK1-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[INC62:%.*]] = add nsw i64 [[TMP56]], 1
+// CHECK1-NEXT: store i64 [[INC62]], ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND30]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK1: [[FOR_END63]]:
+// CHECK1-NEXT: store i32 37, ptr [[CC]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE264]], align 8
+// CHECK1-NEXT: [[TMP57:%.*]] = load ptr, ptr [[__RANGE264]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY66:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP57]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY66]], ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: [[TMP58:%.*]] = load ptr, ptr [[__RANGE264]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY68:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP58]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR69:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY68]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR69]], ptr [[__END267]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND70:.*]]
+// CHECK1: [[FOR_COND70]]:
+// CHECK1-NEXT: [[TMP59:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__END267]], align 8
+// CHECK1-NEXT: [[CMP71:%.*]] = icmp ne ptr [[TMP59]], [[TMP60]]
+// CHECK1-NEXT: br i1 [[CMP71]], label %[[FOR_BODY72:.*]], label %[[FOR_END74:.*]]
+// CHECK1: [[FOR_BODY72]]:
+// CHECK1-NEXT: [[TMP61:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: store ptr [[TMP61]], ptr [[VV]], align 8
+// CHECK1-NEXT: [[TMP62:%.*]] = load i32, ptr [[CC]], align 4
+// CHECK1-NEXT: [[TMP63:%.*]] = load ptr, ptr [[VV]], align 8
+// CHECK1-NEXT: [[TMP64:%.*]] = load double, ptr [[TMP63]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP62]], double noundef [[TMP64]])
+// CHECK1-NEXT: br label %[[FOR_INC73:.*]]
+// CHECK1: [[FOR_INC73]]:
+// CHECK1-NEXT: [[TMP65:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP65]], i32 1
+// CHECK1-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND70]]
+// CHECK1: [[FOR_END74]]:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @body(
+// CHECK2-SAME: ...) #[[ATTR0:[0-9]+]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo1(
+// CHECK2-SAME: i32 noundef [[START1:%.*]], i32 noundef [[END1:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[START2:%.*]], i32 noundef [[END2:%.*]], i32 noundef [[STEP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[START1_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[END1_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[STEP1_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[START2_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[END2_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[STEP2_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_7:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP8:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 [[START1]], ptr [[START1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[END1]], ptr [[END1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[STEP1]], ptr [[STEP1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[START2]], ptr [[START2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[END2]], ptr [[END2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[STEP2]], ptr [[STEP2_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[END1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK2-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK2-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
+// CHECK2-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP9]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[END2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[STEP2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP12]], ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[SUB10:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+// CHECK2-NEXT: [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[ADD12:%.*]] = add i32 [[SUB11]], [[TMP15]]
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[DIV13:%.*]] = udiv i32 [[ADD12]], [[TMP16]]
+// CHECK2-NEXT: [[SUB14:%.*]] = sub i32 [[DIV13]], 1
+// CHECK2-NEXT: store i32 [[SUB14]], ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK2-NEXT: [[ADD15:%.*]] = add i32 [[TMP17]], 1
+// CHECK2-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP18]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP19]], [[TMP20]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP21]], %[[COND_TRUE]] ], [ [[TMP22]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: [[CMP16:%.*]] = icmp ult i32 [[TMP23]], [[TMP24]]
+// CHECK2-NEXT: br i1 [[CMP16]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP17:%.*]] = icmp ult i32 [[TMP25]], [[TMP26]]
+// CHECK2-NEXT: br i1 [[CMP17]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul i32 [[TMP28]], [[TMP29]]
+// CHECK2-NEXT: [[ADD18:%.*]] = add i32 [[TMP27]], [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[MUL19:%.*]] = mul i32 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: [[ADD20:%.*]] = add i32 [[TMP30]], [[MUL19]]
+// CHECK2-NEXT: store i32 [[ADD20]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP33]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP21:%.*]] = icmp ult i32 [[TMP34]], [[TMP35]]
+// CHECK2-NEXT: br i1 [[CMP21]], label %[[IF_THEN22:.*]], label %[[IF_END27:.*]]
+// CHECK2: [[IF_THEN22]]:
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL23:%.*]] = mul i32 [[TMP37]], [[TMP38]]
+// CHECK2-NEXT: [[ADD24:%.*]] = add i32 [[TMP36]], [[MUL23]]
+// CHECK2-NEXT: store i32 [[ADD24]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[MUL25:%.*]] = mul i32 [[TMP40]], [[TMP41]]
+// CHECK2-NEXT: [[ADD26:%.*]] = add i32 [[TMP39]], [[MUL25]]
+// CHECK2-NEXT: store i32 [[ADD26]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP42]])
+// CHECK2-NEXT: br label %[[IF_END27]]
+// CHECK2: [[IF_END27]]:
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add i32 [[TMP43]], 1
+// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo3(
+// CHECK2-SAME: ) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB03:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST04:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI05:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV06:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_LB116:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_ST117:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_NI118:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV120:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[CC:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE221:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END222:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN225:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_27:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_29:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_30:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_LB2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_ST2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_NI2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_TEMP_140:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_TEMP_2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX46:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX52:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[VV:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB03]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST04]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK2-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64
+// CHECK2-NEXT: store i64 [[CONV]], ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP8]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP9]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY7]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY9:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP10]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY9]], ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT: store ptr [[TMP11]], ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK2-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP13]] to i64
+// CHECK2-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK2-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK2-NEXT: [[SUB12:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i64 [[SUB12]], 1
+// CHECK2-NEXT: [[DIV14:%.*]] = sdiv i64 [[ADD13]], 1
+// CHECK2-NEXT: [[SUB15:%.*]] = sub nsw i64 [[DIV14]], 1
+// CHECK2-NEXT: store i64 [[SUB15]], ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_LB116]], align 8
+// CHECK2-NEXT: store i64 1, ptr [[DOTOMP_ST117]], align 8
+// CHECK2-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK2-NEXT: [[ADD19:%.*]] = add nsw i64 [[TMP14]], 1
+// CHECK2-NEXT: store i64 [[ADD19]], ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: store i32 37, ptr [[CC]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE221]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY23:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP15]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR24:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY23]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR24]], ptr [[__END222]], align 8
+// CHECK2-NEXT: [[TMP16:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY26:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP16]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY26]], ptr [[__BEGIN225]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY28:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP17]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY28]], ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK2-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__END222]], align 8
+// CHECK2-NEXT: store ptr [[TMP18]], ptr [[DOTCAPTURE_EXPR_29]], align 8
+// CHECK2-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_29]], align 8
+// CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK2-NEXT: [[SUB_PTR_LHS_CAST31:%.*]] = ptrtoint ptr [[TMP19]] to i64
+// CHECK2-NEXT: [[SUB_PTR_RHS_CAST32:%.*]] = ptrtoint ptr [[TMP20]] to i64
+// CHECK2-NEXT: [[SUB_PTR_SUB33:%.*]] = sub i64 [[SUB_PTR_LHS_CAST31]], [[SUB_PTR_RHS_CAST32]]
+// CHECK2-NEXT: [[SUB_PTR_DIV34:%.*]] = sdiv exact i64 [[SUB_PTR_SUB33]], 8
+// CHECK2-NEXT: [[SUB35:%.*]] = sub nsw i64 [[SUB_PTR_DIV34]], 1
+// CHECK2-NEXT: [[ADD36:%.*]] = add nsw i64 [[SUB35]], 1
+// CHECK2-NEXT: [[DIV37:%.*]] = sdiv i64 [[ADD36]], 1
+// CHECK2-NEXT: [[SUB38:%.*]] = sub nsw i64 [[DIV37]], 1
+// CHECK2-NEXT: store i64 [[SUB38]], ptr [[DOTCAPTURE_EXPR_30]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_LB2]], align 8
+// CHECK2-NEXT: store i64 1, ptr [[DOTOMP_ST2]], align 8
+// CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_30]], align 8
+// CHECK2-NEXT: [[ADD39:%.*]] = add nsw i64 [[TMP21]], 1
+// CHECK2-NEXT: store i64 [[ADD39]], ptr [[DOTOMP_NI2]], align 8
+// CHECK2-NEXT: [[TMP22:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: store i64 [[TMP22]], ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK2-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK2-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[CMP41:%.*]] = icmp sgt i64 [[TMP23]], [[TMP24]]
+// CHECK2-NEXT: br i1 [[CMP41]], label %[[COND_TRUE42:.*]], label %[[COND_FALSE43:.*]]
+// CHECK2: [[COND_TRUE42]]:
+// CHECK2-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK2-NEXT: br label %[[COND_END44:.*]]
+// CHECK2: [[COND_FALSE43]]:
+// CHECK2-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: br label %[[COND_END44]]
+// CHECK2: [[COND_END44]]:
+// CHECK2-NEXT: [[COND45:%.*]] = phi i64 [ [[TMP25]], %[[COND_TRUE42]] ], [ [[TMP26]], %[[COND_FALSE43]] ]
+// CHECK2-NEXT: store i64 [[COND45]], ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK2-NEXT: [[TMP27:%.*]] = load i64, ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK2-NEXT: [[TMP28:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK2-NEXT: [[CMP47:%.*]] = icmp sgt i64 [[TMP27]], [[TMP28]]
+// CHECK2-NEXT: br i1 [[CMP47]], label %[[COND_TRUE48:.*]], label %[[COND_FALSE49:.*]]
+// CHECK2: [[COND_TRUE48]]:
+// CHECK2-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK2-NEXT: br label %[[COND_END50:.*]]
+// CHECK2: [[COND_FALSE49]]:
+// CHECK2-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK2-NEXT: br label %[[COND_END50]]
+// CHECK2: [[COND_END50]]:
+// CHECK2-NEXT: [[COND51:%.*]] = phi i64 [ [[TMP29]], %[[COND_TRUE48]] ], [ [[TMP30]], %[[COND_FALSE49]] ]
+// CHECK2-NEXT: store i64 [[COND51]], ptr [[DOTOMP_FUSE_MAX46]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP31:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[TMP32:%.*]] = load i64, ptr [[DOTOMP_FUSE_MAX46]], align 8
+// CHECK2-NEXT: [[CMP53:%.*]] = icmp slt i64 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: br i1 [[CMP53]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP33:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[TMP34:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: [[CMP54:%.*]] = icmp slt i64 [[TMP33]], [[TMP34]]
+// CHECK2-NEXT: br i1 [[CMP54]], label %[[IF_THEN:.*]], label %[[IF_END74:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_LB03]], align 4
+// CHECK2-NEXT: [[CONV55:%.*]] = sext i32 [[TMP35]] to i64
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_ST04]], align 4
+// CHECK2-NEXT: [[CONV56:%.*]] = sext i32 [[TMP36]] to i64
+// CHECK2-NEXT: [[TMP37:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV56]], [[TMP37]]
+// CHECK2-NEXT: [[ADD57:%.*]] = add nsw i64 [[CONV55]], [[MUL]]
+// CHECK2-NEXT: [[CONV58:%.*]] = trunc i64 [[ADD57]] to i32
+// CHECK2-NEXT: store i32 [[CONV58]], ptr [[DOTOMP_IV06]], align 4
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_IV06]], align 4
+// CHECK2-NEXT: [[MUL59:%.*]] = mul nsw i32 [[TMP38]], 1
+// CHECK2-NEXT: [[ADD60:%.*]] = add nsw i32 0, [[MUL59]]
+// CHECK2-NEXT: store i32 [[ADD60]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP61:%.*]] = icmp slt i32 [[TMP39]], [[TMP40]]
+// CHECK2-NEXT: br i1 [[CMP61]], label %[[IF_THEN62:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN62]]:
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL63:%.*]] = mul nsw i32 [[TMP42]], [[TMP43]]
+// CHECK2-NEXT: [[ADD64:%.*]] = add nsw i32 [[TMP41]], [[MUL63]]
+// CHECK2-NEXT: store i32 [[ADD64]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[MUL65:%.*]] = mul nsw i32 [[TMP44]], 1
+// CHECK2-NEXT: [[ADD66:%.*]] = add nsw i32 0, [[MUL65]]
+// CHECK2-NEXT: store i32 [[ADD66]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP45:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP45]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP67:%.*]] = icmp slt i32 [[TMP46]], [[TMP47]]
+// CHECK2-NEXT: br i1 [[CMP67]], label %[[IF_THEN68:.*]], label %[[IF_END73:.*]]
+// CHECK2: [[IF_THEN68]]:
+// CHECK2-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP49:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL69:%.*]] = mul nsw i32 [[TMP49]], [[TMP50]]
+// CHECK2-NEXT: [[ADD70:%.*]] = add nsw i32 [[TMP48]], [[MUL69]]
+// CHECK2-NEXT: store i32 [[ADD70]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[MUL71:%.*]] = mul nsw i32 [[TMP51]], 2
+// CHECK2-NEXT: [[ADD72:%.*]] = add nsw i32 0, [[MUL71]]
+// CHECK2-NEXT: store i32 [[ADD72]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP52:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP52]])
+// CHECK2-NEXT: br label %[[IF_END73]]
+// CHECK2: [[IF_END73]]:
+// CHECK2-NEXT: br label %[[IF_END74]]
+// CHECK2: [[IF_END74]]:
+// CHECK2-NEXT: [[TMP53:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[TMP54:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[CMP75:%.*]] = icmp slt i64 [[TMP53]], [[TMP54]]
+// CHECK2-NEXT: br i1 [[CMP75]], label %[[IF_THEN76:.*]], label %[[IF_END81:.*]]
+// CHECK2: [[IF_THEN76]]:
+// CHECK2-NEXT: [[TMP55:%.*]] = load i64, ptr [[DOTOMP_LB116]], align 8
+// CHECK2-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTOMP_ST117]], align 8
+// CHECK2-NEXT: [[TMP57:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[MUL77:%.*]] = mul nsw i64 [[TMP56]], [[TMP57]]
+// CHECK2-NEXT: [[ADD78:%.*]] = add nsw i64 [[TMP55]], [[MUL77]]
+// CHECK2-NEXT: store i64 [[ADD78]], ptr [[DOTOMP_IV120]], align 8
+// CHECK2-NEXT: [[TMP58:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[TMP59:%.*]] = load i64, ptr [[DOTOMP_IV120]], align 8
+// CHECK2-NEXT: [[MUL79:%.*]] = mul nsw i64 [[TMP59]], 1
+// CHECK2-NEXT: [[ADD_PTR80:%.*]] = getelementptr inbounds double, ptr [[TMP58]], i64 [[MUL79]]
+// CHECK2-NEXT: store ptr [[ADD_PTR80]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: store ptr [[TMP60]], ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP61:%.*]] = load i32, ptr [[C]], align 4
+// CHECK2-NEXT: [[TMP62:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP63:%.*]] = load double, ptr [[TMP62]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP61]], double noundef [[TMP63]])
+// CHECK2-NEXT: br label %[[IF_END81]]
+// CHECK2: [[IF_END81]]:
+// CHECK2-NEXT: [[TMP64:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[TMP65:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK2-NEXT: [[CMP82:%.*]] = icmp slt i64 [[TMP64]], [[TMP65]]
+// CHECK2-NEXT: br i1 [[CMP82]], label %[[IF_THEN83:.*]], label %[[IF_END88:.*]]
+// CHECK2: [[IF_THEN83]]:
+// CHECK2-NEXT: [[TMP66:%.*]] = load i64, ptr [[DOTOMP_LB2]], align 8
+// CHECK2-NEXT: [[TMP67:%.*]] = load i64, ptr [[DOTOMP_ST2]], align 8
+// CHECK2-NEXT: [[TMP68:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[MUL84:%.*]] = mul nsw i64 [[TMP67]], [[TMP68]]
+// CHECK2-NEXT: [[ADD85:%.*]] = add nsw i64 [[TMP66]], [[MUL84]]
+// CHECK2-NEXT: store i64 [[ADD85]], ptr [[DOTOMP_IV2]], align 8
+// CHECK2-NEXT: [[TMP69:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK2-NEXT: [[TMP70:%.*]] = load i64, ptr [[DOTOMP_IV2]], align 8
+// CHECK2-NEXT: [[MUL86:%.*]] = mul nsw i64 [[TMP70]], 1
+// CHECK2-NEXT: [[ADD_PTR87:%.*]] = getelementptr inbounds double, ptr [[TMP69]], i64 [[MUL86]]
+// CHECK2-NEXT: store ptr [[ADD_PTR87]], ptr [[__BEGIN225]], align 8
+// CHECK2-NEXT: [[TMP71:%.*]] = load ptr, ptr [[__BEGIN225]], align 8
+// CHECK2-NEXT: store ptr [[TMP71]], ptr [[VV]], align 8
+// CHECK2-NEXT: [[TMP72:%.*]] = load i32, ptr [[CC]], align 4
+// CHECK2-NEXT: [[TMP73:%.*]] = load ptr, ptr [[VV]], align 8
+// CHECK2-NEXT: [[TMP74:%.*]] = load double, ptr [[TMP73]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP72]], double noundef [[TMP74]])
+// CHECK2-NEXT: br label %[[IF_END88]]
+// CHECK2: [[IF_END88]]:
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP75:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i64 [[TMP75]], 1
+// CHECK2-NEXT: store i64 [[INC]], ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo4(
+// CHECK2-SAME: ) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[K]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: store i32 64, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 128
+// CHECK2-NEXT: br i1 [[CMP1]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP6]])
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND2:.*]]
+// CHECK2: [[FOR_COND2]]:
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: [[CMP3:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]]
+// CHECK2-NEXT: br i1 [[CMP3]], label %[[FOR_BODY4:.*]], label %[[FOR_END17:.*]]
+// CHECK2: [[FOR_BODY4]]:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP10]], [[TMP11]]
+// CHECK2-NEXT: br i1 [[CMP5]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP13]], [[TMP14]]
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP15]], 2
+// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK2-NEXT: store i32 [[ADD7]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP16]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP8:%.*]] = icmp slt i32 [[TMP17]], [[TMP18]]
+// CHECK2-NEXT: br i1 [[CMP8]], label %[[IF_THEN9:.*]], label %[[IF_END14:.*]]
+// CHECK2: [[IF_THEN9]]:
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL10:%.*]] = mul nsw i32 [[TMP20]], [[TMP21]]
+// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP19]], [[MUL10]]
+// CHECK2-NEXT: store i32 [[ADD11]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[MUL12:%.*]] = mul nsw i32 [[TMP22]], 1
+// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]]
+// CHECK2-NEXT: store i32 [[ADD13]], ptr [[K]], align 4
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[K]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP23]])
+// CHECK2-NEXT: br label %[[IF_END14]]
+// CHECK2: [[IF_END14]]:
+// CHECK2-NEXT: br label %[[FOR_INC15:.*]]
+// CHECK2: [[FOR_INC15]]:
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[INC16:%.*]] = add nsw i32 [[TMP24]], 1
+// CHECK2-NEXT: store i32 [[INC16]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND2]], !llvm.loop [[LOOP7:![0-9]+]]
+// CHECK2: [[FOR_END17]]:
+// CHECK2-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[TMP25:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP25]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP26:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY18:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP26]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY18]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND19:.*]]
+// CHECK2: [[FOR_COND19]]:
+// CHECK2-NEXT: [[TMP27:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP28:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT: [[CMP20:%.*]] = icmp ne ptr [[TMP27]], [[TMP28]]
+// CHECK2-NEXT: br i1 [[CMP20]], label %[[FOR_BODY21:.*]], label %[[FOR_END23:.*]]
+// CHECK2: [[FOR_BODY21]]:
+// CHECK2-NEXT: [[TMP29:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: store ptr [[TMP29]], ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[C]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP32:%.*]] = load double, ptr [[TMP31]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP30]], double noundef [[TMP32]])
+// CHECK2-NEXT: br label %[[FOR_INC22:.*]]
+// CHECK2: [[FOR_INC22]]:
+// CHECK2-NEXT: [[TMP33:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP33]], i32 1
+// CHECK2-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND19]]
+// CHECK2: [[FOR_END23]]:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo5(
+// CHECK2-SAME: ) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB03:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST04:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI05:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV06:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_LB116:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_ST117:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_NI118:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV120:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_TEMP_121:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX22:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX29:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[CC:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE264:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN265:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END267:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[VV:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[K]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: store i32 512, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB03]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST04]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK2-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64
+// CHECK2-NEXT: store i64 [[CONV]], ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP8]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP9]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY7]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY9:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP10]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY9]], ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT: store ptr [[TMP11]], ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK2-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP13]] to i64
+// CHECK2-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK2-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK2-NEXT: [[SUB12:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i64 [[SUB12]], 1
+// CHECK2-NEXT: [[DIV14:%.*]] = sdiv i64 [[ADD13]], 1
+// CHECK2-NEXT: [[SUB15:%.*]] = sub nsw i64 [[DIV14]], 1
+// CHECK2-NEXT: store i64 [[SUB15]], ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_LB116]], align 8
+// CHECK2-NEXT: store i64 1, ptr [[DOTOMP_ST117]], align 8
+// CHECK2-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK2-NEXT: [[ADD19:%.*]] = add nsw i64 [[TMP14]], 1
+// CHECK2-NEXT: store i64 [[ADD19]], ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK2-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[TMP16]], [[TMP17]]
+// CHECK2-NEXT: br i1 [[CMP23]], label %[[COND_TRUE24:.*]], label %[[COND_FALSE25:.*]]
+// CHECK2: [[COND_TRUE24]]:
+// CHECK2-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK2-NEXT: br label %[[COND_END26:.*]]
+// CHECK2: [[COND_FALSE25]]:
+// CHECK2-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: br label %[[COND_END26]]
+// CHECK2: [[COND_END26]]:
+// CHECK2-NEXT: [[COND27:%.*]] = phi i64 [ [[TMP18]], %[[COND_TRUE24]] ], [ [[TMP19]], %[[COND_FALSE25]] ]
+// CHECK2-NEXT: store i64 [[COND27]], ptr [[DOTOMP_FUSE_MAX22]], align 8
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[CMP28:%.*]] = icmp slt i32 [[TMP20]], 128
+// CHECK2-NEXT: br i1 [[CMP28]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP21]])
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP22]], 1
+// CHECK2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND30:.*]]
+// CHECK2: [[FOR_COND30]]:
+// CHECK2-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_FUSE_MAX22]], align 8
+// CHECK2-NEXT: [[CMP31:%.*]] = icmp slt i64 [[TMP23]], [[TMP24]]
+// CHECK2-NEXT: br i1 [[CMP31]], label %[[FOR_BODY32:.*]], label %[[FOR_END63:.*]]
+// CHECK2: [[FOR_BODY32]]:
+// CHECK2-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: [[CMP33:%.*]] = icmp slt i64 [[TMP25]], [[TMP26]]
+// CHECK2-NEXT: br i1 [[CMP33]], label %[[IF_THEN:.*]], label %[[IF_END53:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB03]], align 4
+// CHECK2-NEXT: [[CONV34:%.*]] = sext i32 [[TMP27]] to i64
+// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_ST04]], align 4
+// CHECK2-NEXT: [[CONV35:%.*]] = sext i32 [[TMP28]] to i64
+// CHECK2-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV35]], [[TMP29]]
+// CHECK2-NEXT: [[ADD36:%.*]] = add nsw i64 [[CONV34]], [[MUL]]
+// CHECK2-NEXT: [[CONV37:%.*]] = trunc i64 [[ADD36]] to i32
+// CHECK2-NEXT: store i32 [[CONV37]], ptr [[DOTOMP_IV06]], align 4
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV06]], align 4
+// CHECK2-NEXT: [[MUL38:%.*]] = mul nsw i32 [[TMP30]], 1
+// CHECK2-NEXT: [[ADD39:%.*]] = add nsw i32 0, [[MUL38]]
+// CHECK2-NEXT: store i32 [[ADD39]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP40:%.*]] = icmp slt i32 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: br i1 [[CMP40]], label %[[IF_THEN41:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN41]]:
+// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL42:%.*]] = mul nsw i32 [[TMP34]], [[TMP35]]
+// CHECK2-NEXT: [[ADD43:%.*]] = add nsw i32 [[TMP33]], [[MUL42]]
+// CHECK2-NEXT: store i32 [[ADD43]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[MUL44:%.*]] = mul nsw i32 [[TMP36]], 2
+// CHECK2-NEXT: [[ADD45:%.*]] = add nsw i32 0, [[MUL44]]
+// CHECK2-NEXT: store i32 [[ADD45]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP37]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP46:%.*]] = icmp slt i32 [[TMP38]], [[TMP39]]
+// CHECK2-NEXT: br i1 [[CMP46]], label %[[IF_THEN47:.*]], label %[[IF_END52:.*]]
+// CHECK2: [[IF_THEN47]]:
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL48:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]]
+// CHECK2-NEXT: [[ADD49:%.*]] = add nsw i32 [[TMP40]], [[MUL48]]
+// CHECK2-NEXT: store i32 [[ADD49]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[MUL50:%.*]] = mul nsw i32 [[TMP43]], 1
+// CHECK2-NEXT: [[ADD51:%.*]] = add nsw i32 0, [[MUL50]]
+// CHECK2-NEXT: store i32 [[ADD51]], ptr [[K]], align 4
+// CHECK2-NEXT: [[TMP44:%.*]] = load i32, ptr [[K]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP44]])
+// CHECK2-NEXT: br label %[[IF_END52]]
+// CHECK2: [[IF_END52]]:
+// CHECK2-NEXT: br label %[[IF_END53]]
+// CHECK2: [[IF_END53]]:
+// CHECK2-NEXT: [[TMP45:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[TMP46:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[CMP54:%.*]] = icmp slt i64 [[TMP45]], [[TMP46]]
+// CHECK2-NEXT: br i1 [[CMP54]], label %[[IF_THEN55:.*]], label %[[IF_END60:.*]]
+// CHECK2: [[IF_THEN55]]:
+// CHECK2-NEXT: [[TMP47:%.*]] = load i64, ptr [[DOTOMP_LB116]], align 8
+// CHECK2-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTOMP_ST117]], align 8
+// CHECK2-NEXT: [[TMP49:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[MUL56:%.*]] = mul nsw i64 [[TMP48]], [[TMP49]]
+// CHECK2-NEXT: [[ADD57:%.*]] = add nsw i64 [[TMP47]], [[MUL56]]
+// CHECK2-NEXT: store i64 [[ADD57]], ptr [[DOTOMP_IV120]], align 8
+// CHECK2-NEXT: [[TMP50:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[TMP51:%.*]] = load i64, ptr [[DOTOMP_IV120]], align 8
+// CHECK2-NEXT: [[MUL58:%.*]] = mul nsw i64 [[TMP51]], 1
+// CHECK2-NEXT: [[ADD_PTR59:%.*]] = getelementptr inbounds double, ptr [[TMP50]], i64 [[MUL58]]
+// CHECK2-NEXT: store ptr [[ADD_PTR59]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP52:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: store ptr [[TMP52]], ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP53:%.*]] = load i32, ptr [[C]], align 4
+// CHECK2-NEXT: [[TMP54:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP55:%.*]] = load double, ptr [[TMP54]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP53]], double noundef [[TMP55]])
+// CHECK2-NEXT: br label %[[IF_END60]]
+// CHECK2: [[IF_END60]]:
+// CHECK2-NEXT: br label %[[FOR_INC61:.*]]
+// CHECK2: [[FOR_INC61]]:
+// CHECK2-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[INC62:%.*]] = add nsw i64 [[TMP56]], 1
+// CHECK2-NEXT: store i64 [[INC62]], ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND30]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK2: [[FOR_END63]]:
+// CHECK2-NEXT: store i32 37, ptr [[CC]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE264]], align 8
+// CHECK2-NEXT: [[TMP57:%.*]] = load ptr, ptr [[__RANGE264]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY66:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP57]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY66]], ptr [[__BEGIN265]], align 8
+// CHECK2-NEXT: [[TMP58:%.*]] = load ptr, ptr [[__RANGE264]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY68:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP58]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR69:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY68]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR69]], ptr [[__END267]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND70:.*]]
+// CHECK2: [[FOR_COND70]]:
+// CHECK2-NEXT: [[TMP59:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK2-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__END267]], align 8
+// CHECK2-NEXT: [[CMP71:%.*]] = icmp ne ptr [[TMP59]], [[TMP60]]
+// CHECK2-NEXT: br i1 [[CMP71]], label %[[FOR_BODY72:.*]], label %[[FOR_END74:.*]]
+// CHECK2: [[FOR_BODY72]]:
+// CHECK2-NEXT: [[TMP61:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK2-NEXT: store ptr [[TMP61]], ptr [[VV]], align 8
+// CHECK2-NEXT: [[TMP62:%.*]] = load i32, ptr [[CC]], align 4
+// CHECK2-NEXT: [[TMP63:%.*]] = load ptr, ptr [[VV]], align 8
+// CHECK2-NEXT: [[TMP64:%.*]] = load double, ptr [[TMP63]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP62]], double noundef [[TMP64]])
+// CHECK2-NEXT: br label %[[FOR_INC73:.*]]
+// CHECK2: [[FOR_INC73]]:
+// CHECK2-NEXT: [[TMP65:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP65]], i32 1
+// CHECK2-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN265]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND70]]
+// CHECK2: [[FOR_END74]]:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @tfoo2(
+// CHECK2-SAME: ) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: call void @_Z4foo2IiEvT_S0_S0_(i32 noundef 0, i32 noundef 64, i32 noundef 4)
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define linkonce_odr void @_Z4foo2IiEvT_S0_S0_(
+// CHECK2-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] comdat {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[STEP_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_7:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP8:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_17:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_19:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP21:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_22:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK2-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK2-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
+// CHECK2-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP9]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP12]], ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK2-NEXT: [[SUB10:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+// CHECK2-NEXT: [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[ADD12:%.*]] = add i32 [[SUB11]], [[TMP15]]
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[DIV13:%.*]] = udiv i32 [[ADD12]], [[TMP16]]
+// CHECK2-NEXT: [[SUB14:%.*]] = sub i32 [[DIV13]], 1
+// CHECK2-NEXT: store i32 [[SUB14]], ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK2-NEXT: [[ADD15:%.*]] = add i32 [[TMP17]], 1
+// CHECK2-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK2-NEXT: store i32 [[ADD16]], ptr [[K]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: [[ADD18:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK2-NEXT: store i32 [[ADD18]], ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: [[ADD20:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK2-NEXT: store i32 [[ADD20]], ptr [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP24]], ptr [[DOTNEW_STEP21]], align 4
+// CHECK2-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK2-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK2-NEXT: [[SUB23:%.*]] = sub i32 [[TMP25]], [[TMP26]]
+// CHECK2-NEXT: [[SUB24:%.*]] = sub i32 [[SUB23]], 1
+// CHECK2-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK2-NEXT: [[ADD25:%.*]] = add i32 [[SUB24]], [[TMP27]]
+// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK2-NEXT: [[DIV26:%.*]] = udiv i32 [[ADD25]], [[TMP28]]
+// CHECK2-NEXT: [[SUB27:%.*]] = sub i32 [[DIV26]], 1
+// CHECK2-NEXT: store i32 [[SUB27]], ptr [[DOTCAPTURE_EXPR_22]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB2]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST2]], align 4
+// CHECK2-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_22]], align 4
+// CHECK2-NEXT: [[ADD28:%.*]] = add i32 [[TMP29]], 1
+// CHECK2-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_NI2]], align 4
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP30]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP33]], %[[COND_TRUE]] ], [ [[TMP34]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK2-NEXT: [[CMP29:%.*]] = icmp ugt i32 [[TMP35]], [[TMP36]]
+// CHECK2-NEXT: br i1 [[CMP29]], label %[[COND_TRUE30:.*]], label %[[COND_FALSE31:.*]]
+// CHECK2: [[COND_TRUE30]]:
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK2-NEXT: br label %[[COND_END32:.*]]
+// CHECK2: [[COND_FALSE31]]:
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK2-NEXT: br label %[[COND_END32]]
+// CHECK2: [[COND_END32]]:
+// CHECK2-NEXT: [[COND33:%.*]] = phi i32 [ [[TMP37]], %[[COND_TRUE30]] ], [ [[TMP38]], %[[COND_FALSE31]] ]
+// CHECK2-NEXT: store i32 [[COND33]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: [[CMP34:%.*]] = icmp ult i32 [[TMP39]], [[TMP40]]
+// CHECK2-NEXT: br i1 [[CMP34]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP35:%.*]] = icmp ult i32 [[TMP41]], [[TMP42]]
+// CHECK2-NEXT: br i1 [[CMP35]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP45:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul i32 [[TMP44]], [[TMP45]]
+// CHECK2-NEXT: [[ADD36:%.*]] = add i32 [[TMP43]], [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD36]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[MUL37:%.*]] = mul i32 [[TMP47]], [[TMP48]]
+// CHECK2-NEXT: [[ADD38:%.*]] = add i32 [[TMP46]], [[MUL37]]
+// CHECK2-NEXT: store i32 [[ADD38]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP49:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP49]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP39:%.*]] = icmp ult i32 [[TMP50]], [[TMP51]]
+// CHECK2-NEXT: br i1 [[CMP39]], label %[[IF_THEN40:.*]], label %[[IF_END45:.*]]
+// CHECK2: [[IF_THEN40]]:
+// CHECK2-NEXT: [[TMP52:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP53:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP54:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL41:%.*]] = mul i32 [[TMP53]], [[TMP54]]
+// CHECK2-NEXT: [[ADD42:%.*]] = add i32 [[TMP52]], [[MUL41]]
+// CHECK2-NEXT: store i32 [[ADD42]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[MUL43:%.*]] = mul i32 [[TMP56]], [[TMP57]]
+// CHECK2-NEXT: [[SUB44:%.*]] = sub i32 [[TMP55]], [[MUL43]]
+// CHECK2-NEXT: store i32 [[SUB44]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP58:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP58]])
+// CHECK2-NEXT: br label %[[IF_END45]]
+// CHECK2: [[IF_END45]]:
+// CHECK2-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK2-NEXT: [[CMP46:%.*]] = icmp ult i32 [[TMP59]], [[TMP60]]
+// CHECK2-NEXT: br i1 [[CMP46]], label %[[IF_THEN47:.*]], label %[[IF_END52:.*]]
+// CHECK2: [[IF_THEN47]]:
+// CHECK2-NEXT: [[TMP61:%.*]] = load i32, ptr [[DOTOMP_LB2]], align 4
+// CHECK2-NEXT: [[TMP62:%.*]] = load i32, ptr [[DOTOMP_ST2]], align 4
+// CHECK2-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL48:%.*]] = mul i32 [[TMP62]], [[TMP63]]
+// CHECK2-NEXT: [[ADD49:%.*]] = add i32 [[TMP61]], [[MUL48]]
+// CHECK2-NEXT: store i32 [[ADD49]], ptr [[DOTOMP_IV2]], align 4
+// CHECK2-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK2-NEXT: [[TMP65:%.*]] = load i32, ptr [[DOTOMP_IV2]], align 4
+// CHECK2-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK2-NEXT: [[MUL50:%.*]] = mul i32 [[TMP65]], [[TMP66]]
+// CHECK2-NEXT: [[ADD51:%.*]] = add i32 [[TMP64]], [[MUL50]]
+// CHECK2-NEXT: store i32 [[ADD51]], ptr [[K]], align 4
+// CHECK2-NEXT: [[TMP67:%.*]] = load i32, ptr [[K]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP67]])
+// CHECK2-NEXT: br label %[[IF_END52]]
+// CHECK2: [[IF_END52]]:
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP68:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add i32 [[TMP68]], 1
+// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: ret void
+//
+//.
+// CHECK1: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
+// CHECK1: [[META4]] = !{!"llvm.loop.mustprogress"}
+// CHECK1: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]}
+// CHECK1: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]}
+// CHECK1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]}
+// CHECK1: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]}
+// CHECK1: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]}
+// CHECK1: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]}
+//.
+// CHECK2: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
+// CHECK2: [[META4]] = !{!"llvm.loop.mustprogress"}
+// CHECK2: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]}
+// CHECK2: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]}
+// CHECK2: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]}
+// CHECK2: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]}
+// CHECK2: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]}
+// CHECK2: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]}
+//.
diff --git a/clang/test/OpenMP/fuse_messages.cpp b/clang/test/OpenMP/fuse_messages.cpp
new file mode 100644
index 0000000000000..b86ce95cfe9bc
--- /dev/null
+++ b/clang/test/OpenMP/fuse_messages.cpp
@@ -0,0 +1,209 @@
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++20 -fopenmp -fopenmp-version=60 -fsyntax-only -Wuninitialized -verify %s
+
+void func() {
+
+ // expected-error at +2 {{statement after '#pragma omp fuse' must be a loop sequence containing canonical loops or loop-generating constructs}}
+ #pragma omp fuse
+ ;
+
+ // expected-error at +2 {{statement after '#pragma omp fuse' must be a for loop}}
+ #pragma omp fuse
+ {int bar = 0;}
+
+ // expected-error at +4 {{statement after '#pragma omp fuse' must be a for loop}}
+ #pragma omp fuse
+ {
+ for(int i = 0; i < 10; ++i);
+ int x = 2;
+ }
+
+ // expected-error at +2 {{statement after '#pragma omp fuse' must be a loop sequence containing canonical loops or loop-generating constructs}}
+ #pragma omp fuse
+ #pragma omp for
+ for (int i = 0; i < 7; ++i)
+ ;
+
+ {
+ // expected-error at +2 {{expected statement}}
+ #pragma omp fuse
+ }
+
+ // expected-warning at +1 {{extra tokens at the end of '#pragma omp fuse' are ignored}}
+ #pragma omp fuse foo
+ {
+ for (int i = 0; i < 7; ++i)
+ ;
+ for(int j = 0; j < 100; ++j);
+
+ }
+
+
+ // expected-error at +1 {{unexpected OpenMP clause 'final' in directive '#pragma omp fuse'}}
+ #pragma omp fuse final(0)
+ {
+ for (int i = 0; i < 7; ++i)
+ ;
+ for(int j = 0; j < 100; ++j);
+
+ }
+
+ //expected-error at +3 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'i'}}
+ #pragma omp fuse
+ {
+ for(int i = 0; i < 10; i*=2) {
+ ;
+ }
+ for(int j = 0; j < 100; ++j);
+ }
+
+ //expected-error at +2 {{loop sequence after '#pragma omp fuse' must contain at least 1 canonical loop or loop-generating construct}}
+ #pragma omp fuse
+ {}
+
+ //expected-error at +3 {{statement after '#pragma omp fuse' must be a for loop}}
+ #pragma omp fuse
+ {
+ #pragma omp unroll full
+ for(int i = 0; i < 10; ++i);
+
+ for(int j = 0; j < 10; ++j);
+ }
+
+ //expected-warning at +2 {{looprange clause selects a single loop, resulting in redundant fusion}}
+ #pragma omp fuse
+ {
+ for(int i = 0; i < 10; ++i);
+ }
+
+ //expected-warning at +1 {{looprange clause selects a single loop, resulting in redundant fusion}}
+ #pragma omp fuse looprange(1, 1)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ }
+
+ //expected-error at +1 {{argument to 'looprange' clause must be a strictly positive integer value}}
+ #pragma omp fuse looprange(1, -1)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ }
+
+ //expected-error at +1 {{argument to 'looprange' clause must be a strictly positive integer value}}
+ #pragma omp fuse looprange(1, 0)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ }
+
+ const int x = 1;
+ constexpr int y = 4;
+ //expected-error at +1 {{looprange clause selects loops from 1 to 4 but this exceeds the number of loops (3) in the loop sequence}}
+ #pragma omp fuse looprange(x,y)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+
+ //expected-error at +1 {{looprange clause selects loops from 1 to 420 but this exceeds the number of loops (3) in the loop sequence}}
+ #pragma omp fuse looprange(1,420)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+
+ //expected-error at +1 {{looprange clause selects loops from 1 to 6 but this exceeds the number of loops (5) in the loop sequence}}
+ #pragma omp fuse looprange(1,6)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ // This fusion results in 2 loops
+ #pragma omp fuse looprange(1,2)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+ }
+
+ //expected-error at +1 {{looprange clause selects loops from 2 to 4 but this exceeds the number of loops (3) in the loop sequence}}
+ #pragma omp fuse looprange(2,3)
+ {
+ #pragma omp unroll partial(2)
+ for(int i = 0; i < 10; ++i);
+
+ #pragma omp reverse
+ for(int j = 0; j < 10; ++j);
+
+ #pragma omp fuse
+ {
+ {
+ #pragma omp reverse
+ for(int j = 0; j < 10; ++j);
+ }
+ for(int k = 0; k < 50; ++k);
+ }
+ }
+}
+
+// In a template context, but expression itself not instantiation-dependent
+template <typename T>
+static void templated_func() {
+
+ //expected-warning at +1 {{looprange clause selects a single loop, resulting in redundant fusion}}
+ #pragma omp fuse looprange(2,1)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+
+ //expected-error at +1 {{looprange clause selects loops from 3 to 5 but this exceeds the number of loops (3) in the loop sequence}}
+ #pragma omp fuse looprange(3,3)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+
+}
+
+template <int V>
+static void templated_func_value_dependent() {
+
+ //expected-warning at +1 {{looprange clause selects a single loop, resulting in redundant fusion}}
+ #pragma omp fuse looprange(V,1)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+}
+
+template <typename T>
+static void templated_func_type_dependent() {
+ constexpr T s = 1;
+
+ //expected-error at +1 {{argument to 'looprange' clause must be a strictly positive integer value}}
+ #pragma omp fuse looprange(s,s-1)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+}
+
+
+void template_inst() {
+ // expected-note at +1 {{in instantiation of function template specialization 'templated_func<int>' requested here}}
+ templated_func<int>();
+ // expected-note at +1 {{in instantiation of function template specialization 'templated_func_value_dependent<1>' requested here}}
+ templated_func_value_dependent<1>();
+ // expected-note at +1 {{in instantiation of function template specialization 'templated_func_type_dependent<int>' requested here}}
+ templated_func_type_dependent<int>();
+}
+
+
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 9526f629bda42..1e8653810b655 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2148,6 +2148,9 @@ class EnqueueVisitor : public ConstStmtVisitor<EnqueueVisitor, void>,
void VisitOMPUnrollDirective(const OMPUnrollDirective *D);
void VisitOMPReverseDirective(const OMPReverseDirective *D);
void VisitOMPInterchangeDirective(const OMPInterchangeDirective *D);
+ void VisitOMPCanonicalLoopSequenceTransformationDirective(
+ const OMPCanonicalLoopSequenceTransformationDirective *D);
+ void VisitOMPFuseDirective(const OMPFuseDirective *D);
void VisitOMPForDirective(const OMPForDirective *D);
void VisitOMPForSimdDirective(const OMPForSimdDirective *D);
void VisitOMPSectionsDirective(const OMPSectionsDirective *D);
@@ -2353,6 +2356,11 @@ void OMPClauseEnqueue::VisitOMPPartialClause(const OMPPartialClause *C) {
Visitor->AddStmt(C->getFactor());
}
+void OMPClauseEnqueue::VisitOMPLoopRangeClause(const OMPLoopRangeClause *C) {
+ Visitor->AddStmt(C->getFirst());
+ Visitor->AddStmt(C->getCount());
+}
+
void OMPClauseEnqueue::VisitOMPAllocatorClause(const OMPAllocatorClause *C) {
Visitor->AddStmt(C->getAllocator());
}
@@ -3317,6 +3325,15 @@ void EnqueueVisitor::VisitOMPInterchangeDirective(
VisitOMPCanonicalLoopNestTransformationDirective(D);
}
+void EnqueueVisitor::VisitOMPCanonicalLoopSequenceTransformationDirective(
+ const OMPCanonicalLoopSequenceTransformationDirective *D) {
+ VisitOMPExecutableDirective(D);
+}
+
+void EnqueueVisitor::VisitOMPFuseDirective(const OMPFuseDirective *D) {
+ VisitOMPCanonicalLoopSequenceTransformationDirective(D);
+}
+
void EnqueueVisitor::VisitOMPForDirective(const OMPForDirective *D) {
VisitOMPLoopDirective(D);
}
@@ -6274,6 +6291,8 @@ CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) {
return cxstring::createRef("OMPReverseDirective");
case CXCursor_OMPInterchangeDirective:
return cxstring::createRef("OMPInterchangeDirective");
+ case CXCursor_OMPFuseDirective:
+ return cxstring::createRef("OMPFuseDirective");
case CXCursor_OMPForDirective:
return cxstring::createRef("OMPForDirective");
case CXCursor_OMPForSimdDirective:
diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp
index 3c4062410eac1..56f113c1dc309 100644
--- a/clang/tools/libclang/CXCursor.cpp
+++ b/clang/tools/libclang/CXCursor.cpp
@@ -687,6 +687,9 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent,
case Stmt::OMPInterchangeDirectiveClass:
K = CXCursor_OMPInterchangeDirective;
break;
+ case Stmt::OMPFuseDirectiveClass:
+ K = CXCursor_OMPFuseDirective;
+ break;
case Stmt::OMPForDirectiveClass:
K = CXCursor_OMPForDirective;
break;
diff --git a/flang/include/flang/Lower/OpenMP/Clauses.h b/flang/include/flang/Lower/OpenMP/Clauses.h
index 18e2f209c2d7a..e4f40b0dbaf60 100644
--- a/flang/include/flang/Lower/OpenMP/Clauses.h
+++ b/flang/include/flang/Lower/OpenMP/Clauses.h
@@ -243,6 +243,7 @@ using Initializer = tomp::clause::InitializerT<TypeTy, IdTy, ExprTy>;
using InReduction = tomp::clause::InReductionT<TypeTy, IdTy, ExprTy>;
using IsDevicePtr = tomp::clause::IsDevicePtrT<TypeTy, IdTy, ExprTy>;
using Lastprivate = tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy>;
+using LoopRange = tomp::clause::LoopRangeT<TypeTy, IdTy, ExprTy>;
using Linear = tomp::clause::LinearT<TypeTy, IdTy, ExprTy>;
using Link = tomp::clause::LinkT<TypeTy, IdTy, ExprTy>;
using Map = tomp::clause::MapT<TypeTy, IdTy, ExprTy>;
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 1c9fd7673e06d..af429426a62cb 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -615,6 +615,7 @@ class ParseTreeDumper {
NODE_ENUM(OmpLinearModifier, Value)
NODE(parser, OmpLocator)
NODE(parser, OmpLocatorList)
+ NODE(parser, OmpLoopRangeClause)
NODE(parser, OmpLoopDirective)
NODE(parser, OmpMapClause)
NODE(OmpMapClause, Modifier)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 951c96b974141..7afbcd3568df0 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -4526,6 +4526,15 @@ struct OmpLinearClause {
std::tuple<OmpObjectList, MODIFIERS(), /*PostModified=*/bool> t;
};
+// Ref: [6.0:207-208]
+//
+// loop-range-clause ->
+// LOOPRANGE(first, count) // since 6.0
+struct OmpLoopRangeClause {
+ TUPLE_CLASS_BOILERPLATE(OmpLoopRangeClause);
+ std::tuple<ScalarIntConstantExpr, ScalarIntConstantExpr> t;
+};
+
// Ref: [4.5:216-219], [5.0:315-324], [5.1:347-355], [5.2:150-158]
//
// map-clause ->
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index 78fe5aa031ba1..3684c0ed8a3fe 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -1035,6 +1035,11 @@ Link make(const parser::OmpClause::Link &inp,
return Link{/*List=*/makeObjects(inp.v, semaCtx)};
}
+LoopRange make(const parser::OmpClause::Looprange &inp,
+ semantics::SemanticsContext &semaCtx) {
+ llvm_unreachable("Unimplemented: looprange");
+}
+
Map make(const parser::OmpClause::Map &inp,
semantics::SemanticsContext &semaCtx) {
// inp.v -> parser::OmpMapClause
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index 519bce64321d4..3d48c541f9e9b 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -965,6 +965,9 @@ TYPE_PARSER(
maybe(":"_tok >> nonemptyList(Parser<OmpLinearClause::Modifier>{})),
/*PostModified=*/pure(true)))
+TYPE_PARSER(construct<OmpLoopRangeClause>(
+ scalarIntConstantExpr, "," >> scalarIntConstantExpr))
+
// OpenMPv5.2 12.5.2 detach-clause -> DETACH (event-handle)
TYPE_PARSER(construct<OmpDetachClause>(Parser<OmpObject>{}))
@@ -1146,6 +1149,8 @@ TYPE_PARSER( //
parenthesized(Parser<OmpLinearClause>{}))) ||
"LINK" >> construct<OmpClause>(construct<OmpClause::Link>(
parenthesized(Parser<OmpObjectList>{}))) ||
+ "LOOPRANGE" >> construct<OmpClause>(construct<OmpClause::Looprange>(
+ parenthesized(Parser<OmpLoopRangeClause>{}))) ||
"MAP" >> construct<OmpClause>(construct<OmpClause::Map>(
parenthesized(Parser<OmpMapClause>{}))) ||
"MATCH" >> construct<OmpClause>(construct<OmpClause::Match>(
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index dc6d33607146b..732c6bb0f3e05 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2328,6 +2328,13 @@ class UnparseVisitor {
}
}
}
+ void Unparse(const OmpLoopRangeClause &x) {
+ Word("LOOPRANGE(");
+ Walk(std::get<0>(x.t));
+ Put(", ");
+ Walk(std::get<1>(x.t));
+ Put(")");
+ }
void Unparse(const OmpReductionClause &x) {
using Modifier = OmpReductionClause::Modifier;
Walk(std::get<std::optional<std::list<Modifier>>>(x.t), ": ");
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 6bc9f9955fe24..1742171abf2d7 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -2846,6 +2846,12 @@ CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Collapse, OMPC_collapse)
CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Safelen, OMPC_safelen)
CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Simdlen, OMPC_simdlen)
+void OmpStructureChecker::Enter(const parser::OmpClause::Looprange &x) {
+ context_.Say(GetContext().clauseSource,
+ "LOOPRANGE clause is not implemented yet"_err_en_US,
+ ContextDirectiveAsFortran());
+}
+
// Restrictions specific to each clause are implemented apart from the
// generalized restrictions.
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
index 1ed23eed1571d..32817a7c59eeb 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
@@ -1268,6 +1268,15 @@ struct WriteT {
using EmptyTrait = std::true_type;
};
+// V6: [6.4.7] Looprange clause
+template <typename T, typename I, typename E> struct LoopRangeT {
+ using Begin = E;
+ using End = E;
+
+ using TupleTrait = std::true_type;
+ std::tuple<Begin, End> t;
+};
+
// ---
template <typename T, typename I, typename E>
@@ -1301,8 +1310,8 @@ using TupleClausesT =
DoacrossT<T, I, E>, DynGroupprivateT<T, I, E>, FromT<T, I, E>,
GrainsizeT<T, I, E>, IfT<T, I, E>, InitT<T, I, E>,
InReductionT<T, I, E>, LastprivateT<T, I, E>, LinearT<T, I, E>,
- MapT<T, I, E>, NumTasksT<T, I, E>, OrderT<T, I, E>,
- ReductionT<T, I, E>, ScheduleT<T, I, E>,
+ LoopRangeT<T, I, E>, MapT<T, I, E>, NumTasksT<T, I, E>,
+ OrderT<T, I, E>, ReductionT<T, I, E>, ScheduleT<T, I, E>,
TaskReductionT<T, I, E>, ToT<T, I, E>>;
template <typename T, typename I, typename E>
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 6a41c24e78149..fdf79692cab19 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -284,6 +284,10 @@ def OMPC_Linear : Clause<[Spelling<"linear">]> {
def OMPC_Link : Clause<[Spelling<"link">]> {
let flangClass = "OmpObjectList";
}
+def OMPC_LoopRange : Clause<[Spelling<"looprange">]> {
+ let clangClass = "OMPLoopRangeClause";
+ let flangClass = "OmpLoopRangeClause";
+}
def OMPC_Map : Clause<[Spelling<"map">]> {
let clangClass = "OMPMapClause";
let flangClass = "OmpMapClause";
@@ -899,6 +903,11 @@ def OMP_Groupprivate : Directive<[Spelling<"groupprivate">]> {
let category = CA_Declarative;
let languages = [L_C, L_Fortran];
}
+def OMP_Fuse : Directive<[Spelling<"fuse">]> {
+ let allowedOnceClauses = [VersionedClause<OMPC_LoopRange, 60>];
+ let association = AS_Block;
+ let category = CA_Executable;
+}
def OMP_Interchange : Directive<[Spelling<"interchange">]> {
let allowedOnceClauses = [
VersionedClause<OMPC_Permutation>,
diff --git a/openmp/runtime/test/transform/fuse/foreach.cpp b/openmp/runtime/test/transform/fuse/foreach.cpp
new file mode 100644
index 0000000000000..176465b201faa
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/foreach.cpp
@@ -0,0 +1,191 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+
+ void print(const char *msg) const { owner->print(msg); }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+#pragma omp fuse
+ {
+ for (Reporter a{"C"}; auto &&v : Reporter("A"))
+ printf("v=%d\n", v);
+ for (Reporter aa{"D"}; auto &&vv : Reporter("B"))
+ printf("vv=%d\n", vv);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+// CHECK: [C] ctor
+// CHECK-NEXT: [A] ctor
+// CHECK-NEXT: [A] end()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] iterator distance: 3
+// CHECK-NEXT: [D] ctor
+// CHECK-NEXT: [B] ctor
+// CHECK-NEXT: [B] end()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] iterator distance: 3
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: v=0
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: vv=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: v=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: vv=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: v=2
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: vv=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] dtor
+// CHECK-NEXT: [D] dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] dtor
+// CHECK-NEXT: [C] dtor
+// CHECK-NEXT: done
+
+#endif
diff --git a/openmp/runtime/test/transform/fuse/intfor.c b/openmp/runtime/test/transform/fuse/intfor.c
new file mode 100644
index 0000000000000..b8171b4df7042
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/intfor.c
@@ -0,0 +1,50 @@
+// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+
+int main() {
+ printf("do\n");
+#pragma omp fuse
+ {
+ for (int i = 5; i <= 25; i += 5)
+ printf("i=%d\n", i);
+ for (int j = 10; j < 100; j += 10)
+ printf("j=%d\n", j);
+ for (int k = 10; k > 0; --k)
+ printf("k=%d\n", k);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=5
+// CHECK-NEXT: j=10
+// CHECK-NEXT: k=10
+// CHECK-NEXT: i=10
+// CHECK-NEXT: j=20
+// CHECK-NEXT: k=9
+// CHECK-NEXT: i=15
+// CHECK-NEXT: j=30
+// CHECK-NEXT: k=8
+// CHECK-NEXT: i=20
+// CHECK-NEXT: j=40
+// CHECK-NEXT: k=7
+// CHECK-NEXT: i=25
+// CHECK-NEXT: j=50
+// CHECK-NEXT: k=6
+// CHECK-NEXT: j=60
+// CHECK-NEXT: k=5
+// CHECK-NEXT: j=70
+// CHECK-NEXT: k=4
+// CHECK-NEXT: j=80
+// CHECK-NEXT: k=3
+// CHECK-NEXT: j=90
+// CHECK-NEXT: k=2
+// CHECK-NEXT: k=1
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/fuse/iterfor.cpp b/openmp/runtime/test/transform/fuse/iterfor.cpp
new file mode 100644
index 0000000000000..552484b2981c4
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/iterfor.cpp
@@ -0,0 +1,194 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ bool operator!=(const Iterator &that) const {
+ owner->print("iterator %d != %d", 2 - this->pos, 2 - that.pos);
+ return this->pos != that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+ Reporter C("C");
+ Reporter D("D");
+#pragma omp fuse
+ {
+ for (auto it = C.begin(); it != C.end(); ++it)
+ printf("v=%d\n", *it);
+
+ for (auto it = D.begin(); it != D.end(); ++it)
+ printf("vv=%d\n", *it);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK: [C] ctor
+// CHECK-NEXT: [D] ctor
+// CHECK-NEXT: [C] begin()
+// CHECK-NEXT: [C] begin()
+// CHECK-NEXT: [C] end()
+// CHECK-NEXT: [C] iterator distance: 3
+// CHECK-NEXT: [D] begin()
+// CHECK-NEXT: [D] begin()
+// CHECK-NEXT: [D] end()
+// CHECK-NEXT: [D] iterator distance: 3
+// CHECK-NEXT: [C] iterator advance: 0 += 0
+// CHECK-NEXT: [C] iterator move assign
+// CHECK-NEXT: [C] iterator deref: 0
+// CHECK-NEXT: v=0
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [D] iterator advance: 0 += 0
+// CHECK-NEXT: [D] iterator move assign
+// CHECK-NEXT: [D] iterator deref: 0
+// CHECK-NEXT: vv=0
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [C] iterator advance: 0 += 1
+// CHECK-NEXT: [C] iterator move assign
+// CHECK-NEXT: [C] iterator deref: 1
+// CHECK-NEXT: v=1
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [D] iterator advance: 0 += 1
+// CHECK-NEXT: [D] iterator move assign
+// CHECK-NEXT: [D] iterator deref: 1
+// CHECK-NEXT: vv=1
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [C] iterator advance: 0 += 2
+// CHECK-NEXT: [C] iterator move assign
+// CHECK-NEXT: [C] iterator deref: 2
+// CHECK-NEXT: v=2
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [D] iterator advance: 0 += 2
+// CHECK-NEXT: [D] iterator move assign
+// CHECK-NEXT: [D] iterator deref: 2
+// CHECK-NEXT: vv=2
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: done
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [D] dtor
+// CHECK-NEXT: [C] dtor
diff --git a/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp
new file mode 100644
index 0000000000000..dcbbdf1b6734e
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp
@@ -0,0 +1,207 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+
+ void print(const char *msg) const { owner->print(msg); }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+#pragma omp parallel for collapse(2) num_threads(1)
+ for (int i = 0; i < 3; ++i)
+#pragma omp fuse
+ {
+ for (Reporter c{"init-stmt"}; auto &&v : Reporter("range"))
+ printf("i=%d v=%d\n", i, v);
+ for (int vv = 0; vv < 3; ++vv)
+ printf("i=%d vv=%d\n", i, vv);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: [init-stmt] ctor
+// CHECK-NEXT: [range] ctor
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] iterator distance: 3
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=0 v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=0 vv=0
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=0 v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=0 vv=1
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=0 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=0 vv=2
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=1 v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=1 vv=0
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=1 v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=1 vv=1
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=1 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=1 vv=2
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=2 v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=2 vv=0
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=2 v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=2 vv=1
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=2 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=2 vv=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] dtor
+// CHECK-NEXT: [init-stmt] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c
new file mode 100644
index 0000000000000..9630fec50bc20
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c
@@ -0,0 +1,45 @@
+// RUN: %libomp-cxx-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdio>
+
+int main() {
+ printf("do\n");
+#pragma omp parallel for collapse(2) num_threads(1)
+ for (int i = 0; i < 3; ++i)
+#pragma omp fuse
+ {
+ for (int j = 0; j < 3; ++j)
+ printf("i=%d j=%d\n", i, j);
+ for (int k = 0; k < 3; ++k)
+ printf("i=%d k=%d\n", i, k);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=0 j=0
+// CHECK-NEXT: i=0 k=0
+// CHECK-NEXT: i=0 j=1
+// CHECK-NEXT: i=0 k=1
+// CHECK-NEXT: i=0 j=2
+// CHECK-NEXT: i=0 k=2
+// CHECK-NEXT: i=1 j=0
+// CHECK-NEXT: i=1 k=0
+// CHECK-NEXT: i=1 j=1
+// CHECK-NEXT: i=1 k=1
+// CHECK-NEXT: i=1 j=2
+// CHECK-NEXT: i=1 k=2
+// CHECK-NEXT: i=2 j=0
+// CHECK-NEXT: i=2 k=0
+// CHECK-NEXT: i=2 j=1
+// CHECK-NEXT: i=2 k=1
+// CHECK-NEXT: i=2 j=2
+// CHECK-NEXT: i=2 k=2
+// CHECK-NEXT: done
More information about the llvm-commits
mailing list