[clang] [flang] [llvm] Introduce -fexperimental-loop-fuse to clang and flang (PR #142686)
Madhur Amilkanthwar via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 7 07:13:50 PDT 2025
https://github.com/madhur13490 updated https://github.com/llvm/llvm-project/pull/142686
>From be9eb6a39906fac945ea206eec80f4cc18bd4896 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <spop at nvidia.com>
Date: Thu, 22 May 2025 13:50:38 +0000
Subject: [PATCH 1/4] add -floop-fuse to clang and flang
---
clang/include/clang/Basic/CodeGenOptions.def | 1 +
clang/include/clang/Driver/Options.td | 4 ++++
clang/lib/CodeGen/BackendUtil.cpp | 2 ++
clang/lib/Driver/ToolChains/Clang.cpp | 1 +
clang/lib/Driver/ToolChains/CommonArgs.cpp | 2 +-
clang/lib/Driver/ToolChains/Flang.cpp | 2 ++
clang/lib/Frontend/CompilerInvocation.cpp | 6 ++++++
clang/test/Driver/clang_f_opts.c | 7 +++++++
flang/docs/ReleaseNotes.md | 4 ++++
flang/include/flang/Frontend/CodeGenOptions.def | 1 +
flang/lib/Frontend/CompilerInvocation.cpp | 3 +++
flang/lib/Frontend/FrontendActions.cpp | 1 +
flang/test/Driver/loop-fuse.f90 | 17 +++++++++++++++++
llvm/include/llvm/Passes/PassBuilder.h | 3 +++
llvm/lib/Passes/PassBuilderPipelines.cpp | 13 ++++++++++++-
15 files changed, 65 insertions(+), 2 deletions(-)
create mode 100644 flang/test/Driver/loop-fuse.f90
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 423b696785500..84af1ba6662b4 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -322,6 +322,7 @@ CODEGENOPT(TimeTrace , 1, 0, Benign) ///< Set when -ftime-trace is enabl
VALUE_CODEGENOPT(TimeTraceGranularity, 32, 500, Benign) ///< Minimum time granularity (in microseconds),
///< traced by time profiler
CODEGENOPT(InterchangeLoops , 1, 0, Benign) ///< Run loop-interchange.
+CODEGENOPT(FuseLoops , 1, 0, Benign) ///< Run loop-fuse.
CODEGENOPT(UnrollLoops , 1, 0, Benign) ///< Control whether loops are unrolled.
CODEGENOPT(RerollLoops , 1, 0, Benign) ///< Control whether loops are rerolled.
CODEGENOPT(NoUseJumpTables , 1, 0, Benign) ///< Set when -fno-jump-tables is enabled.
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 6aab43c9ed57f..184f9d9188fd6 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4268,6 +4268,10 @@ def floop_interchange : Flag<["-"], "floop-interchange">, Group<f_Group>,
HelpText<"Enable the loop interchange pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>;
def fno_loop_interchange: Flag<["-"], "fno-loop-interchange">, Group<f_Group>,
HelpText<"Disable the loop interchange pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>;
+def floop_fuse : Flag<["-"], "floop-fuse">, Group<f_Group>,
+ HelpText<"Enable the loop fuse pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>;
+def fno_loop_fuse: Flag<["-"], "fno-loop-fuse">, Group<f_Group>,
+ HelpText<"Disable the loop fuse pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>;
def funroll_loops : Flag<["-"], "funroll-loops">, Group<f_Group>,
HelpText<"Turn on loop unroller">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>;
def fno_unroll_loops : Flag<["-"], "fno-unroll-loops">, Group<f_Group>,
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 0b8b824fbcd5a..7b1b1480da213 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -897,6 +897,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
PipelineTuningOptions PTO;
PTO.LoopUnrolling = CodeGenOpts.UnrollLoops;
PTO.LoopInterchange = CodeGenOpts.InterchangeLoops;
+ PTO.LoopFuse = CodeGenOpts.FuseLoops;
// For historical reasons, loop interleaving is set to mirror setting for loop
// unrolling.
PTO.LoopInterleaving = CodeGenOpts.UnrollLoops;
@@ -1332,6 +1333,7 @@ runThinLTOBackend(CompilerInstance &CI, ModuleSummaryIndex *CombinedIndex,
Conf.SampleProfile = std::move(SampleProfile);
Conf.PTO.LoopUnrolling = CGOpts.UnrollLoops;
Conf.PTO.LoopInterchange = CGOpts.InterchangeLoops;
+ Conf.PTO.LoopFuse = CGOpts.FuseLoops;
// For historical reasons, loop interleaving is set to mirror setting for loop
// unrolling.
Conf.PTO.LoopInterleaving = CGOpts.UnrollLoops;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 6eb77610079b7..6731f8c948b96 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6870,6 +6870,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
options::OPT_fno_unroll_loops);
Args.AddLastArg(CmdArgs, options::OPT_floop_interchange,
options::OPT_fno_loop_interchange);
+ Args.AddLastArg(CmdArgs, options::OPT_floop_fuse, options::OPT_fno_loop_fuse);
Args.AddLastArg(CmdArgs, options::OPT_fstrict_flex_arrays_EQ);
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 53fd525e941e3..1c90d6149eb96 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -3316,7 +3316,7 @@ void tools::handleVectorizeSLPArgs(const ArgList &Args,
void tools::handleInterchangeLoopsArgs(const ArgList &Args,
ArgStringList &CmdArgs) {
- // FIXME: instead of relying on shouldEnableVectorizerAtOLevel, we may want to
+ // FIXME: Instead of relying on shouldEnableVectorizerAtOLevel, we may want to
// implement a separate function to infer loop interchange from opt level.
// For now, enable loop-interchange at the same opt levels as loop-vectorize.
bool EnableInterchange = shouldEnableVectorizerAtOLevel(Args, false);
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 7ab41e9b85a04..d1d3f3c800ede 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -151,6 +151,8 @@ void Flang::addCodegenOptions(const ArgList &Args,
!stackArrays->getOption().matches(options::OPT_fno_stack_arrays))
CmdArgs.push_back("-fstack-arrays");
+ Args.AddLastArg(CmdArgs, options::OPT_floop_fuse, options::OPT_fno_loop_fuse);
+
handleInterchangeLoopsArgs(Args, CmdArgs);
handleVectorizeLoopsArgs(Args, CmdArgs);
handleVectorizeSLPArgs(Args, CmdArgs);
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index ccc3154d20968..85fe3d7992465 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1671,6 +1671,11 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts,
else
GenerateArg(Consumer, OPT_fno_loop_interchange);
+ if (Opts.FuseLoops)
+ GenerateArg(Consumer, OPT_floop_fuse);
+ else
+ GenerateArg(Consumer, OPT_fno_loop_fuse);
+
if (!Opts.BinutilsVersion.empty())
GenerateArg(Consumer, OPT_fbinutils_version_EQ, Opts.BinutilsVersion);
@@ -1991,6 +1996,7 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args,
(Opts.OptimizationLevel > 1));
Opts.InterchangeLoops =
Args.hasFlag(OPT_floop_interchange, OPT_fno_loop_interchange, false);
+ Opts.FuseLoops = Args.hasFlag(OPT_floop_fuse, OPT_fno_loop_fuse, false);
Opts.BinutilsVersion =
std::string(Args.getLastArgValue(OPT_fbinutils_version_EQ));
diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c
index ee7ded265769b..204d5e338190f 100644
--- a/clang/test/Driver/clang_f_opts.c
+++ b/clang/test/Driver/clang_f_opts.c
@@ -52,6 +52,13 @@
// CHECK-INTERCHANGE-LOOPS: "-floop-interchange"
// CHECK-NO-INTERCHANGE-LOOPS: "-fno-loop-interchange"
+// RUN: %clang -### -S -floop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-FUSE-LOOPS %s
+// RUN: %clang -### -S -fno-loop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-NO-FUSE-LOOPS %s
+// RUN: %clang -### -S -fno-loop-fuse -floop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-FUSE-LOOPS %s
+// RUN: %clang -### -S -floop-fuse -fno-loop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-NO-FUSE-LOOPS %s
+// CHECK-FUSE-LOOPS: "-floop-fuse"
+// CHECK-NO-FUSE-LOOPS: "-fno-loop-fuse"
+
// RUN: %clang -### -S -fprofile-sample-accurate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-SAMPLE-ACCURATE %s
// CHECK-PROFILE-SAMPLE-ACCURATE: "-fprofile-sample-accurate"
diff --git a/flang/docs/ReleaseNotes.md b/flang/docs/ReleaseNotes.md
index 99dc41c907c36..55a48fb7f816a 100644
--- a/flang/docs/ReleaseNotes.md
+++ b/flang/docs/ReleaseNotes.md
@@ -32,6 +32,10 @@ page](https://llvm.org/releases/).
## New Compiler Flags
+* -floop-interchange is now recognized by flang.
+* -floop-interchange is enabled by default at -O2 and above.
+* -floop-fuse is now recognized by flang.
+
## Windows Support
## Fortran Language Changes in Flang
diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def
index cdeea93c9aecb..49e4d292a13e1 100644
--- a/flang/include/flang/Frontend/CodeGenOptions.def
+++ b/flang/include/flang/Frontend/CodeGenOptions.def
@@ -43,6 +43,7 @@ CODEGENOPT(StackArrays, 1, 0) ///< -fstack-arrays (enable the stack-arrays pass)
CODEGENOPT(VectorizeLoop, 1, 0) ///< Enable loop vectorization.
CODEGENOPT(VectorizeSLP, 1, 0) ///< Enable SLP vectorization.
CODEGENOPT(InterchangeLoops, 1, 0) ///< Enable loop interchange.
+CODEGENOPT(FuseLoops, 1, 0) ///< Enable loop fuse.
CODEGENOPT(LoopVersioning, 1, 0) ///< Enable loop versioning.
CODEGENOPT(UnrollLoops, 1, 0) ///< Enable loop unrolling
CODEGENOPT(AliasAnalysis, 1, 0) ///< Enable alias analysis pass
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 111c5aa48726f..ded227b1c7e6c 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -275,6 +275,9 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
if (args.getLastArg(clang::driver::options::OPT_floop_interchange))
opts.InterchangeLoops = 1;
+ if (args.getLastArg(clang::driver::options::OPT_floop_fuse))
+ opts.FuseLoops = 1;
+
if (args.getLastArg(clang::driver::options::OPT_vectorize_loops))
opts.VectorizeLoop = 1;
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 5c66ecf3043cd..5fe951bfdf30a 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -951,6 +951,7 @@ void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) {
si.getTimePasses().setOutStream(ci.getTimingStreamLLVM());
pto.LoopUnrolling = opts.UnrollLoops;
pto.LoopInterchange = opts.InterchangeLoops;
+ pto.LoopFuse = opts.FuseLoops;
pto.LoopInterleaving = opts.UnrollLoops;
pto.LoopVectorization = opts.VectorizeLoop;
pto.SLPVectorization = opts.VectorizeSLP;
diff --git a/flang/test/Driver/loop-fuse.f90 b/flang/test/Driver/loop-fuse.f90
new file mode 100644
index 0000000000000..240d00fdb62d7
--- /dev/null
+++ b/flang/test/Driver/loop-fuse.f90
@@ -0,0 +1,17 @@
+! RUN: %flang -### -S -floop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-FUSE %s
+! RUN: %flang -### -S -fno-loop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s
+! RUN: %flang -### -S -O0 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s
+! RUN: %flang -### -S -O1 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s
+! RUN: %flang -### -S -O2 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s
+! RUN: %flang -### -S -O3 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s
+! RUN: %flang -### -S -Os %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s
+! RUN: %flang -### -S -Oz %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s
+! CHECK-LOOP-FUSE: "-floop-fuse"
+! CHECK-NO-LOOP-FUSE-NOT: "-floop-fuse"
+! RUN: %flang_fc1 -emit-llvm -O2 -floop-fuse -mllvm -print-pipeline-passes -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-FUSE-PASS %s
+! RUN: %flang_fc1 -emit-llvm -O2 -fno-loop-fuse -mllvm -print-pipeline-passes -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE-PASS %s
+! CHECK-LOOP-FUSE-PASS: loop-fusion
+! CHECK-NO-LOOP-FUSE-PASS-NOT: loop-fusion
+
+program test
+end program
diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index 9cdb7ca7dbc9b..5870f9e6baeac 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -65,6 +65,9 @@ class PipelineTuningOptions {
/// false.
bool LoopInterchange;
+ /// Tuning option to enable/disable loop fuse. Its default value is false.
+ bool LoopFuse;
+
/// Tuning option to forget all SCEV loops in LoopUnroll. Its default value
/// is that of the flag: `-forget-scev-loop-unroll`.
bool ForgetAllSCEVInLoopUnroll;
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 98821bb1408a7..fe077198dfb06 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -104,6 +104,7 @@
#include "llvm/Transforms/Scalar/LoopDeletion.h"
#include "llvm/Transforms/Scalar/LoopDistribute.h"
#include "llvm/Transforms/Scalar/LoopFlatten.h"
+#include "llvm/Transforms/Scalar/LoopFuse.h"
#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
#include "llvm/Transforms/Scalar/LoopInterchange.h"
@@ -204,6 +205,10 @@ static cl::opt<bool>
EnableLoopInterchange("enable-loopinterchange", cl::init(false), cl::Hidden,
cl::desc("Enable the LoopInterchange Pass"));
+static cl::opt<bool> EnableLoopFuse("enable-loopfuse", cl::init(false),
+ cl::Hidden,
+ cl::desc("Enable the LoopFuse Pass"));
+
static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam",
cl::init(false), cl::Hidden,
cl::desc("Enable Unroll And Jam Pass"));
@@ -313,6 +318,7 @@ PipelineTuningOptions::PipelineTuningOptions() {
SLPVectorization = false;
LoopUnrolling = true;
LoopInterchange = EnableLoopInterchange;
+ LoopFuse = EnableLoopFuse;
ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll;
LicmMssaOptCap = SetLicmMssaOptCap;
LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap;
@@ -514,6 +520,9 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
invokeLoopOptimizerEndEPCallbacks(LPM2, Level);
+ if (PTO.LoopFuse)
+ FPM.addPass(LoopFusePass());
+
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
/*UseMemorySSA=*/true,
/*UseBlockFrequencyInfo=*/true));
@@ -703,6 +712,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
invokeLoopOptimizerEndEPCallbacks(LPM2, Level);
+ if (PTO.LoopFuse)
+ FPM.addPass(LoopFusePass());
+
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
/*UseMemorySSA=*/true,
/*UseBlockFrequencyInfo=*/true));
@@ -2115,7 +2127,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
LPM.addPass(LoopFlattenPass());
LPM.addPass(IndVarSimplifyPass());
LPM.addPass(LoopDeletionPass());
- // FIXME: Add loop interchange.
// Unroll small loops and perform peeling.
LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
>From ea7ccc2b60a878b9666126f72640c9d76950e8ec Mon Sep 17 00:00:00 2001
From: Sebastian Pop <spop at nvidia.com>
Date: Thu, 12 Jun 2025 09:36:51 +0000
Subject: [PATCH 2/4] rename -floop-fuse / -fexperimental-fuse-loops
---
clang/include/clang/Basic/CodeGenOptions.def | 2 +-
clang/include/clang/Driver/Options.td | 8 ++++----
clang/lib/CodeGen/BackendUtil.cpp | 4 ++--
clang/lib/Driver/ToolChains/Clang.cpp | 3 ++-
clang/lib/Driver/ToolChains/CommonArgs.cpp | 2 +-
clang/lib/Driver/ToolChains/Flang.cpp | 3 ++-
clang/lib/Frontend/CompilerInvocation.cpp | 7 ++++---
clang/test/Driver/clang_f_opts.c | 12 +++++------
flang/docs/ReleaseNotes.md | 4 +---
.../include/flang/Frontend/CodeGenOptions.def | 2 +-
flang/lib/Frontend/CompilerInvocation.cpp | 2 +-
flang/lib/Frontend/FrontendActions.cpp | 2 +-
flang/test/Driver/loop-fuse.f90 | 12 +++++------
llvm/include/llvm/Passes/PassBuilder.h | 4 ++--
llvm/lib/Passes/PassBuilderPipelines.cpp | 20 +++++++++----------
15 files changed, 43 insertions(+), 44 deletions(-)
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 84af1ba6662b4..8c12cc15250fb 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -322,7 +322,7 @@ CODEGENOPT(TimeTrace , 1, 0, Benign) ///< Set when -ftime-trace is enabl
VALUE_CODEGENOPT(TimeTraceGranularity, 32, 500, Benign) ///< Minimum time granularity (in microseconds),
///< traced by time profiler
CODEGENOPT(InterchangeLoops , 1, 0, Benign) ///< Run loop-interchange.
-CODEGENOPT(FuseLoops , 1, 0, Benign) ///< Run loop-fuse.
+CODEGENOPT(FuseLoops , 1, 0, Benign) ///< Run loop-fusion.
CODEGENOPT(UnrollLoops , 1, 0, Benign) ///< Control whether loops are unrolled.
CODEGENOPT(RerollLoops , 1, 0, Benign) ///< Control whether loops are rerolled.
CODEGENOPT(NoUseJumpTables , 1, 0, Benign) ///< Set when -fno-jump-tables is enabled.
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 184f9d9188fd6..6a9176ebf56a8 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4268,10 +4268,10 @@ def floop_interchange : Flag<["-"], "floop-interchange">, Group<f_Group>,
HelpText<"Enable the loop interchange pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>;
def fno_loop_interchange: Flag<["-"], "fno-loop-interchange">, Group<f_Group>,
HelpText<"Disable the loop interchange pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>;
-def floop_fuse : Flag<["-"], "floop-fuse">, Group<f_Group>,
- HelpText<"Enable the loop fuse pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>;
-def fno_loop_fuse: Flag<["-"], "fno-loop-fuse">, Group<f_Group>,
- HelpText<"Disable the loop fuse pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>;
+def fexperimental_loop_fusion : Flag<["-"], "fexperimental-loop-fusion">, Group<f_Group>,
+ HelpText<"Enable the experimental loop fusion pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>;
+def fno_experimental_loop_fusion: Flag<["-"], "fno-experimental-loop-fusion">, Group<f_Group>,
+ HelpText<"Disable the experimental loop fusion pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>;
def funroll_loops : Flag<["-"], "funroll-loops">, Group<f_Group>,
HelpText<"Turn on loop unroller">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>;
def fno_unroll_loops : Flag<["-"], "fno-unroll-loops">, Group<f_Group>,
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 7b1b1480da213..c713b95b2ff97 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -897,7 +897,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
PipelineTuningOptions PTO;
PTO.LoopUnrolling = CodeGenOpts.UnrollLoops;
PTO.LoopInterchange = CodeGenOpts.InterchangeLoops;
- PTO.LoopFuse = CodeGenOpts.FuseLoops;
+ PTO.LoopFusion = CodeGenOpts.FuseLoops;
// For historical reasons, loop interleaving is set to mirror setting for loop
// unrolling.
PTO.LoopInterleaving = CodeGenOpts.UnrollLoops;
@@ -1333,7 +1333,7 @@ runThinLTOBackend(CompilerInstance &CI, ModuleSummaryIndex *CombinedIndex,
Conf.SampleProfile = std::move(SampleProfile);
Conf.PTO.LoopUnrolling = CGOpts.UnrollLoops;
Conf.PTO.LoopInterchange = CGOpts.InterchangeLoops;
- Conf.PTO.LoopFuse = CGOpts.FuseLoops;
+ Conf.PTO.LoopFusion = CGOpts.FuseLoops;
// For historical reasons, loop interleaving is set to mirror setting for loop
// unrolling.
Conf.PTO.LoopInterleaving = CGOpts.UnrollLoops;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 6731f8c948b96..c486ce537ef03 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6870,7 +6870,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
options::OPT_fno_unroll_loops);
Args.AddLastArg(CmdArgs, options::OPT_floop_interchange,
options::OPT_fno_loop_interchange);
- Args.AddLastArg(CmdArgs, options::OPT_floop_fuse, options::OPT_fno_loop_fuse);
+ Args.AddLastArg(CmdArgs, options::OPT_fexperimental_loop_fusion,
+ options::OPT_fno_experimental_loop_fusion);
Args.AddLastArg(CmdArgs, options::OPT_fstrict_flex_arrays_EQ);
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 1c90d6149eb96..53fd525e941e3 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -3316,7 +3316,7 @@ void tools::handleVectorizeSLPArgs(const ArgList &Args,
void tools::handleInterchangeLoopsArgs(const ArgList &Args,
ArgStringList &CmdArgs) {
- // FIXME: Instead of relying on shouldEnableVectorizerAtOLevel, we may want to
+ // FIXME: instead of relying on shouldEnableVectorizerAtOLevel, we may want to
// implement a separate function to infer loop interchange from opt level.
// For now, enable loop-interchange at the same opt levels as loop-vectorize.
bool EnableInterchange = shouldEnableVectorizerAtOLevel(Args, false);
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index d1d3f3c800ede..a5792427dfbed 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -151,7 +151,8 @@ void Flang::addCodegenOptions(const ArgList &Args,
!stackArrays->getOption().matches(options::OPT_fno_stack_arrays))
CmdArgs.push_back("-fstack-arrays");
- Args.AddLastArg(CmdArgs, options::OPT_floop_fuse, options::OPT_fno_loop_fuse);
+ Args.AddLastArg(CmdArgs, options::OPT_fexperimental_loop_fusion,
+ options::OPT_fno_experimental_loop_fusion);
handleInterchangeLoopsArgs(Args, CmdArgs);
handleVectorizeLoopsArgs(Args, CmdArgs);
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 85fe3d7992465..cdba1ace17607 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1672,9 +1672,9 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts,
GenerateArg(Consumer, OPT_fno_loop_interchange);
if (Opts.FuseLoops)
- GenerateArg(Consumer, OPT_floop_fuse);
+ GenerateArg(Consumer, OPT_fexperimental_loop_fusion);
else
- GenerateArg(Consumer, OPT_fno_loop_fuse);
+ GenerateArg(Consumer, OPT_fno_experimental_loop_fusion);
if (!Opts.BinutilsVersion.empty())
GenerateArg(Consumer, OPT_fbinutils_version_EQ, Opts.BinutilsVersion);
@@ -1996,7 +1996,8 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args,
(Opts.OptimizationLevel > 1));
Opts.InterchangeLoops =
Args.hasFlag(OPT_floop_interchange, OPT_fno_loop_interchange, false);
- Opts.FuseLoops = Args.hasFlag(OPT_floop_fuse, OPT_fno_loop_fuse, false);
+ Opts.FuseLoops = Args.hasFlag(OPT_fexperimental_loop_fusion,
+ OPT_fno_experimental_loop_fusion, false);
Opts.BinutilsVersion =
std::string(Args.getLastArgValue(OPT_fbinutils_version_EQ));
diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c
index 204d5e338190f..ca654cb2dd49b 100644
--- a/clang/test/Driver/clang_f_opts.c
+++ b/clang/test/Driver/clang_f_opts.c
@@ -52,12 +52,12 @@
// CHECK-INTERCHANGE-LOOPS: "-floop-interchange"
// CHECK-NO-INTERCHANGE-LOOPS: "-fno-loop-interchange"
-// RUN: %clang -### -S -floop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-FUSE-LOOPS %s
-// RUN: %clang -### -S -fno-loop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-NO-FUSE-LOOPS %s
-// RUN: %clang -### -S -fno-loop-fuse -floop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-FUSE-LOOPS %s
-// RUN: %clang -### -S -floop-fuse -fno-loop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-NO-FUSE-LOOPS %s
-// CHECK-FUSE-LOOPS: "-floop-fuse"
-// CHECK-NO-FUSE-LOOPS: "-fno-loop-fuse"
+// RUN: %clang -### -S -fexperimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-FUSE-LOOPS %s
+// RUN: %clang -### -S -fno-experimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-NO-FUSE-LOOPS %s
+// RUN: %clang -### -S -fno-experimental-loop-fusion -fexperimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-FUSE-LOOPS %s
+// RUN: %clang -### -S -fexperimental-loop-fusion -fno-experimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-NO-FUSE-LOOPS %s
+// CHECK-FUSE-LOOPS: "-fexperimental-loop-fusion"
+// CHECK-NO-FUSE-LOOPS: "-fno-experimental-loop-fusion"
// RUN: %clang -### -S -fprofile-sample-accurate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-SAMPLE-ACCURATE %s
// CHECK-PROFILE-SAMPLE-ACCURATE: "-fprofile-sample-accurate"
diff --git a/flang/docs/ReleaseNotes.md b/flang/docs/ReleaseNotes.md
index 55a48fb7f816a..876aca3f16ce1 100644
--- a/flang/docs/ReleaseNotes.md
+++ b/flang/docs/ReleaseNotes.md
@@ -32,9 +32,7 @@ page](https://llvm.org/releases/).
## New Compiler Flags
-* -floop-interchange is now recognized by flang.
-* -floop-interchange is enabled by default at -O2 and above.
-* -floop-fuse is now recognized by flang.
+* -fexperimental-loop-fusion is now recognized by flang.
## Windows Support
diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def
index 49e4d292a13e1..f273dad9606a6 100644
--- a/flang/include/flang/Frontend/CodeGenOptions.def
+++ b/flang/include/flang/Frontend/CodeGenOptions.def
@@ -43,7 +43,7 @@ CODEGENOPT(StackArrays, 1, 0) ///< -fstack-arrays (enable the stack-arrays pass)
CODEGENOPT(VectorizeLoop, 1, 0) ///< Enable loop vectorization.
CODEGENOPT(VectorizeSLP, 1, 0) ///< Enable SLP vectorization.
CODEGENOPT(InterchangeLoops, 1, 0) ///< Enable loop interchange.
-CODEGENOPT(FuseLoops, 1, 0) ///< Enable loop fuse.
+CODEGENOPT(FuseLoops, 1, 0) ///< Enable loop fusion.
CODEGENOPT(LoopVersioning, 1, 0) ///< Enable loop versioning.
CODEGENOPT(UnrollLoops, 1, 0) ///< Enable loop unrolling
CODEGENOPT(AliasAnalysis, 1, 0) ///< Enable alias analysis pass
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index ded227b1c7e6c..050e2fb6e2127 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -275,7 +275,7 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
if (args.getLastArg(clang::driver::options::OPT_floop_interchange))
opts.InterchangeLoops = 1;
- if (args.getLastArg(clang::driver::options::OPT_floop_fuse))
+ if (args.getLastArg(clang::driver::options::OPT_fexperimental_loop_fusion))
opts.FuseLoops = 1;
if (args.getLastArg(clang::driver::options::OPT_vectorize_loops))
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 5fe951bfdf30a..55a27e33227c2 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -951,7 +951,7 @@ void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) {
si.getTimePasses().setOutStream(ci.getTimingStreamLLVM());
pto.LoopUnrolling = opts.UnrollLoops;
pto.LoopInterchange = opts.InterchangeLoops;
- pto.LoopFuse = opts.FuseLoops;
+ pto.LoopFusion = opts.FuseLoops;
pto.LoopInterleaving = opts.UnrollLoops;
pto.LoopVectorization = opts.VectorizeLoop;
pto.SLPVectorization = opts.VectorizeSLP;
diff --git a/flang/test/Driver/loop-fuse.f90 b/flang/test/Driver/loop-fuse.f90
index 240d00fdb62d7..ddfd9065e0fd4 100644
--- a/flang/test/Driver/loop-fuse.f90
+++ b/flang/test/Driver/loop-fuse.f90
@@ -1,15 +1,15 @@
-! RUN: %flang -### -S -floop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-FUSE %s
-! RUN: %flang -### -S -fno-loop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s
+! RUN: %flang -### -S -fexperimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-FUSE %s
+! RUN: %flang -### -S -fno-experimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s
! RUN: %flang -### -S -O0 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s
! RUN: %flang -### -S -O1 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s
! RUN: %flang -### -S -O2 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s
! RUN: %flang -### -S -O3 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s
! RUN: %flang -### -S -Os %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s
! RUN: %flang -### -S -Oz %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s
-! CHECK-LOOP-FUSE: "-floop-fuse"
-! CHECK-NO-LOOP-FUSE-NOT: "-floop-fuse"
-! RUN: %flang_fc1 -emit-llvm -O2 -floop-fuse -mllvm -print-pipeline-passes -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-FUSE-PASS %s
-! RUN: %flang_fc1 -emit-llvm -O2 -fno-loop-fuse -mllvm -print-pipeline-passes -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE-PASS %s
+! CHECK-LOOP-FUSE: "-fexperimental-loop-fusion"
+! CHECK-NO-LOOP-FUSE-NOT: "-fexperimental-loop-fusion"
+! RUN: %flang_fc1 -emit-llvm -O2 -fexperimental-loop-fusion -mllvm -print-pipeline-passes -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-FUSE-PASS %s
+! RUN: %flang_fc1 -emit-llvm -O2 -fno-experimental-loop-fusion -mllvm -print-pipeline-passes -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE-PASS %s
! CHECK-LOOP-FUSE-PASS: loop-fusion
! CHECK-NO-LOOP-FUSE-PASS-NOT: loop-fusion
diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index 5870f9e6baeac..2742ec1b71b7e 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -65,8 +65,8 @@ class PipelineTuningOptions {
/// false.
bool LoopInterchange;
- /// Tuning option to enable/disable loop fuse. Its default value is false.
- bool LoopFuse;
+ /// Tuning option to enable/disable loop fusion. Its default value is false.
+ bool LoopFusion;
/// Tuning option to forget all SCEV loops in LoopUnroll. Its default value
/// is that of the flag: `-forget-scev-loop-unroll`.
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index fe077198dfb06..390febda569c3 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -205,9 +205,9 @@ static cl::opt<bool>
EnableLoopInterchange("enable-loopinterchange", cl::init(false), cl::Hidden,
cl::desc("Enable the LoopInterchange Pass"));
-static cl::opt<bool> EnableLoopFuse("enable-loopfuse", cl::init(false),
- cl::Hidden,
- cl::desc("Enable the LoopFuse Pass"));
+static cl::opt<bool> EnableLoopFusion("enable-loopfusion", cl::init(false),
+ cl::Hidden,
+ cl::desc("Enable the LoopFusion Pass"));
static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam",
cl::init(false), cl::Hidden,
@@ -318,7 +318,7 @@ PipelineTuningOptions::PipelineTuningOptions() {
SLPVectorization = false;
LoopUnrolling = true;
LoopInterchange = EnableLoopInterchange;
- LoopFuse = EnableLoopFuse;
+ LoopFusion = EnableLoopFusion;
ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll;
LicmMssaOptCap = SetLicmMssaOptCap;
LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap;
@@ -520,9 +520,6 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
invokeLoopOptimizerEndEPCallbacks(LPM2, Level);
- if (PTO.LoopFuse)
- FPM.addPass(LoopFusePass());
-
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
/*UseMemorySSA=*/true,
/*UseBlockFrequencyInfo=*/true));
@@ -647,6 +644,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
if (EnableConstraintElimination)
FPM.addPass(ConstraintEliminationPass());
+ if (PTO.LoopFusion)
+ FPM.addPass(LoopFusePass());
+
// Add the primary loop simplification pipeline.
// FIXME: Currently this is split into two loop pass pipelines because we run
// some function passes in between them. These can and should be removed
@@ -712,9 +712,6 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
invokeLoopOptimizerEndEPCallbacks(LPM2, Level);
- if (PTO.LoopFuse)
- FPM.addPass(LoopFusePass());
-
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
/*UseMemorySSA=*/true,
/*UseBlockFrequencyInfo=*/true));
@@ -2127,6 +2124,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
LPM.addPass(LoopFlattenPass());
LPM.addPass(IndVarSimplifyPass());
LPM.addPass(LoopDeletionPass());
+ // FIXME: Add loop interchange.
// Unroll small loops and perform peeling.
LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
@@ -2366,4 +2364,4 @@ AAManager PassBuilder::buildDefaultAAPipeline() {
bool PassBuilder::isInstrumentedPGOUse() const {
return (PGOOpt && PGOOpt->Action == PGOOptions::IRUse) ||
!UseCtxProfile.empty();
-}
\ No newline at end of file
+}
>From d1a5ac39bc284bcfb236b962c0b3240e8a4267a8 Mon Sep 17 00:00:00 2001
From: Madhur Amilkanthwar <madhura at nvidia.com>
Date: Thu, 31 Jul 2025 09:09:30 -0700
Subject: [PATCH 3/4] Address review comments
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 390febda569c3..9dfe0180de0c4 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -207,7 +207,7 @@ static cl::opt<bool>
static cl::opt<bool> EnableLoopFusion("enable-loopfusion", cl::init(false),
cl::Hidden,
- cl::desc("Enable the LoopFusion Pass"));
+ cl::desc("Enable the LoopFuse Pass"));
static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam",
cl::init(false), cl::Hidden,
@@ -644,6 +644,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
if (EnableConstraintElimination)
FPM.addPass(ConstraintEliminationPass());
+ // FIXME: This may not be the right place in the pipeline.
+ // We need to have the data to support the right place.
if (PTO.LoopFusion)
FPM.addPass(LoopFusePass());
>From ae25025357524956696ea12c26e289f646c2939b Mon Sep 17 00:00:00 2001
From: Madhur Amilkanthwar <madhura at nvidia.com>
Date: Thu, 7 Aug 2025 06:42:52 -0700
Subject: [PATCH 4/4] change place of pass in pipeline
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 9dfe0180de0c4..2df23c5dcf299 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -644,11 +644,6 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
if (EnableConstraintElimination)
FPM.addPass(ConstraintEliminationPass());
- // FIXME: This may not be the right place in the pipeline.
- // We need to have the data to support the right place.
- if (PTO.LoopFusion)
- FPM.addPass(LoopFusePass());
-
// Add the primary loop simplification pipeline.
// FIXME: Currently this is split into two loop pass pipelines because we run
// some function passes in between them. These can and should be removed
@@ -1559,6 +1554,11 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
if (PTO.LoopInterchange)
LPM.addPass(LoopInterchangePass());
+ // FIXME: This may not be the right place in the pipeline.
+ // We need to have the data to support the right place.
+ if (PTO.LoopFusion)
+ OptimizePM.addPass(LoopFusePass());
+
OptimizePM.addPass(createFunctionToLoopPassAdaptor(
std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false));
More information about the llvm-commits
mailing list